Merge branch 'master' into dev
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..4ad67d3
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+/.travis.yml export-ignore
+/appveyor.yml export-ignore
+/ci export-ignore
+/.gitattributes export-ignore
diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index db7a0c4..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,14 +0,0 @@
-.DS_Store
-Makefile.in
-aclocal.m4
-ar-lib
-autom4te.cache
-compile
-config.guess
-config.h.in
-config.sub
-configure
-depcomp
-install-sh
-ltmain.sh
-missing
diff --git a/.travis.yml b/.travis.yml
index 55df6da..9569525 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,35 +11,41 @@
       env: BUILD_OFFICIAL=1
       osx_image: xcode7.3
     - os: linux
+      dist: trusty
       compiler: clang
       env:
-        CFLAGS="-O1 -g -fsanitize=address -fno-omit-frame-pointer"
-        CONFIGURE_FLAGS="--disable-shared"
+        CMAKE_BUILD_TYPE=RelWithDebInfo
+        CFLAGS_RELWITHDEBINFO="-O1 -g -fsanitize=address -fno-omit-frame-pointer"
+        CMAKE_FLAGS="-DENABLE_SHARED=0"
         ASAN_OPTIONS="detect_leaks=1 symbolize=1"
       addons:
         apt:
           packages:
             - nasm
     - os: linux
+      dist: trusty
       compiler: gcc
-      env: CONFIGURE_FLAGS="--with-12bit"
+      env: CMAKE_FLAGS="-DWITH_12BIT=1"
     - os: linux
+      dist: trusty
       compiler: gcc
-      env: CONFIGURE_FLAGS="--with-jpeg7"
+      env: CMAKE_FLAGS="-DWITH_JPEG7=1"
       addons:
         apt:
           packages:
             - nasm
     - os: linux
+      dist: trusty
       compiler: gcc
-      env: CONFIGURE_FLAGS="--with-jpeg8"
+      env: CMAKE_FLAGS="-DWITH_JPEG8=1"
       addons:
         apt:
           packages:
             - nasm
     - os: linux
+      dist: trusty
       compiler: gcc
-      env: CONFIGURE_FLAGS="--without-simd"
+      env: CMAKE_FLAGS="-DWITH_SIMD=0"
 
 before_install:
   - if [ "$TRAVIS_OS_NAME" = "osx" ]; then
@@ -81,18 +87,17 @@
       fi
     fi
   - if [ "${BUILD_OFFICIAL:-}" == "" ]; then
-      autoreconf -fiv &&
       mkdir build &&
       pushd build &&
-      ../configure ${CONFIGURE_FLAGS} &&
+      cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE "-DCMAKE_C_FLAGS_RELWITHDEBINFO=$CFLAGS_RELWITHDEBINFO" $CMAKE_FLAGS .. &&
       export NUMCPUS=`grep -c '^processor' /proc/cpuinfo` &&
       make -j$NUMCPUS --load-average=$NUMCPUS &&
-      if [[ "${CONFIGURE_FLAGS}" =~ "with-12bit" ||
-            "${CONFIGURE_FLAGS}" =~ "without-simd" ]]; then
-        make test FLOATTEST=32bit;
-      else
-        make test FLOATTEST=sse &&
-        JSIMD_FORCENONE=1 make test FLOATTEST=32bit;
+      make test &&
+      if [[ ! "${CMAKE_FLAGS[0]}" =~ "WITH_12BIT" &&
+            ! "${CMAKE_FLAGS[0]}" =~ "WITH_SIMD" ]]; then
+        JSIMD_FORCESSE2=1 make test &&
+        cmake -DFLOATTEST=32bit .. &&
+        JSIMD_FORCENONE=1 make test;
       fi &&
       popd;
     fi
diff --git a/BUILDING.md b/BUILDING.md
index 42aadf2..429963e 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -1,26 +1,27 @@
-Un*x Platforms (including Mac and Cygwin)
-=========================================
+Building libjpeg-turbo
+======================
 
 
 Build Requirements
 ------------------
 
-- autoconf 2.56 or later
-- automake 1.7 or later
-- libtool 1.4 or later
-  * If using Xcode 4.3 or later on OS X, autoconf and automake are no longer
-    provided.  The easiest way to obtain them is from
-    [MacPorts](http://www.MacPorts.org) or [Homebrew](http://brew.sh/).
+
+### All Systems
+
+- [CMake](http://www.cmake.org) v2.8.12 or later
 
 - [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
   (if building x86 or x86-64 SIMD extensions)
-  * If using NASM, 0.98, or 2.01 or later is required for an x86 build (0.99
-    and 2.00 do not work properly with libjpeg-turbo's x86 SIMD code.)
-  * If using NASM, 2.00 or later is required for an x86-64 build.
-  * If using NASM, 2.07 or later (except 2.11.08) is required for an x86-64
-    Mac build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD
-    code when building macho64 objects.)  NASM or YASM can be obtained from
+  * If using NASM, 2.10 or later is required.
+  * If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
+    build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
+    when building macho64 objects.)  NASM or YASM can be obtained from
     [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
+  * If using YASM, 1.2.0 or later is required.
+     - NOTE: Currently, if it is desirable to hide the SIMD function symbols in
+       Mac executables or shared libraries that statically link with
+       libjpeg-turbo, then YASM must be used when building libjpeg-turbo.
+  * If building on Windows, **nasm.exe**/**yasm.exe** should be in your `PATH`.
 
   The binary RPMs released by the NASM project do not work on older Linux
   systems, such as Red Hat Enterprise Linux 5.  On such systems, you can easily
@@ -36,6 +37,9 @@
 
   NOTE: the NASM build will fail if texinfo is not installed.
 
+
+### Un*x Platforms (including Linux, Mac, FreeBSD, Solaris, and Cygwin)
+
 - GCC v4.1 (or later) or Clang recommended for best performance
 
 - If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
@@ -47,517 +51,7 @@
   <http://www.java.com>.
 
 
-Out-of-Tree Builds
-------------------
-
-Binary objects, libraries, and executables are generated in the directory from
-which `configure` is executed (the "binary directory"), and this directory need
-not necessarily be the same as the libjpeg-turbo source directory.  You can
-create multiple independent binary directories, in which different versions of
-libjpeg-turbo can be built from the same source tree using different compilers
-or settings.  In the sections below, *{build_directory}* refers to the binary
-directory, whereas *{source_directory}* refers to the libjpeg-turbo source
-directory.  For in-tree builds, these directories are the same.
-
-
-Build Procedure
----------------
-
-The following procedure will build libjpeg-turbo on Unix and Unix-like systems.
-(On Solaris, this generates a 32-bit build.  See "Build Recipes" below for
-64-bit build instructions.)
-
-    cd {source_directory}
-    autoreconf -fiv
-    cd {build_directory}
-    sh {source_directory}/configure [additional configure flags]
-    make
-
-NOTE: Running autoreconf in the source directory is not necessary if building
-libjpeg-turbo from one of the official release tarballs.
-
-This will generate the following files under **.libs/**:
-
-**libjpeg.a**<br>
-Static link library for the libjpeg API
-
-**libjpeg.so.{version}** (Linux, Unix)<br>
-**libjpeg.{version}.dylib** (Mac)<br>
-**cygjpeg-{version}.dll** (Cygwin)<br>
-Shared library for the libjpeg API
-
-By default, *{version}* is 62.2.0, 7.2.0, or 8.1.2, depending on whether
-libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
-*{version}* is 62, 7, or 8.
-
-**libjpeg.so** (Linux, Unix)<br>
-**libjpeg.dylib** (Mac)<br>
-Development symlink for the libjpeg API
-
-**libjpeg.dll.a** (Cygwin)<br>
-Import library for the libjpeg API
-
-**libturbojpeg.a**<br>
-Static link library for the TurboJPEG API
-
-**libturbojpeg.so.0.1.0** (Linux, Unix)<br>
-**libturbojpeg.0.1.0.dylib** (Mac)<br>
-**cygturbojpeg-0.dll** (Cygwin)<br>
-Shared library for the TurboJPEG API
-
-**libturbojpeg.so** (Linux, Unix)<br>
-**libturbojpeg.dylib** (Mac)<br>
-Development symlink for the TurboJPEG API
-
-**libturbojpeg.dll.a** (Cygwin)<br>
-Import library for the TurboJPEG API
-
-
-### libjpeg v7 or v8 API/ABI Emulation
-
-Add `--with-jpeg7` to the `configure` command line to build a version of
-libjpeg-turbo that is API/ABI-compatible with libjpeg v7.  Add `--with-jpeg8`
-to the `configure` command to build a version of libjpeg-turbo that is
-API/ABI-compatible with libjpeg v8.  See [README.md](README.md) for more
-information about libjpeg v7 and v8 emulation.
-
-
-### In-Memory Source/Destination Managers
-
-When using libjpeg v6b or v7 API/ABI emulation, add `--without-mem-srcdst` to
-the `configure` command line to build a version of libjpeg-turbo that lacks the
-`jpeg_mem_src()` and `jpeg_mem_dest()` functions.  These functions were not
-part of the original libjpeg v6b and v7 APIs, so removing them ensures strict
-conformance with those APIs.  See [README.md](README.md) for more information.
-
-
-### Arithmetic Coding Support
-
-Since the patent on arithmetic coding has expired, this functionality has been
-included in this release of libjpeg-turbo.  libjpeg-turbo's implementation is
-based on the implementation in libjpeg v8, but it works when emulating libjpeg
-v7 or v6b as well.  The default is to enable both arithmetic encoding and
-decoding, but those who have philosophical objections to arithmetic coding can
-add `--without-arith-enc` or `--without-arith-dec` to the `configure` command
-line to disable encoding or decoding (respectively.)
-
-
-### TurboJPEG Java Wrapper
-
-Add `--with-java` to the `configure` command line to incorporate an optional
-Java Native Interface (JNI) wrapper into the TurboJPEG shared library and build
-the Java front-end classes to support it.  This allows the TurboJPEG shared
-library to be used directly from Java applications.  See
-[java/README](java/README) for more details.
-
-You can set the `JAVAC`, `JAR`, and `JAVA` configure variables to specify
-alternate commands for javac, jar, and java (respectively.)  You can also
-set the `JAVACFLAGS` configure variable to specify arguments that should be
-passed to the Java compiler when building the TurboJPEG classes, and
-`JNI_CFLAGS` to specify arguments that should be passed to the C compiler when
-building the JNI wrapper.  Run `configure --help` for more details.
-
-
-Build Recipes
--------------
-
-
-### 32-bit Build on 64-bit Linux
-
-Add
-
-    --host i686-pc-linux-gnu CFLAGS='-O3 -m32' LDFLAGS=-m32
-
-to the `configure` command line.
-
-
-### 64-bit Build on 64-bit OS X
-
-Add
-
-    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm
-
-to the `configure` command line.  NASM 2.07 or later from MacPorts or Homebrew
-must be installed.  If using Homebrew, then replace `/opt/local` with
-`/usr/local`.
-
-
-### 32-bit Build on 64-bit OS X
-
-Add
-
-    --host i686-apple-darwin CFLAGS='-O3 -m32' LDFLAGS=-m32
-
-to the `configure` command line.
-
-
-### 64-bit Backward-Compatible Build on 64-bit OS X
-
-Add
-
-    --host x86_64-apple-darwin NASM=/opt/local/bin/nasm \
-      CFLAGS='-mmacosx-version-min=10.5 -O3' \
-      LDFLAGS='-mmacosx-version-min=10.5'
-
-to the `configure` command line.  NASM 2.07 or later from MacPorts or Homebrew
-must be installed.  If using Homebrew, then replace `/opt/local` with
-`/usr/local`.
-
-
-### 32-bit Backward-Compatible Build on OS X
-
-Add
-
-    --host i686-apple-darwin \
-      CFLAGS='-mmacosx-version-min=10.5 -O3 -m32' \
-      LDFLAGS='-mmacosx-version-min=10.5 -m32'
-
-to the `configure` command line.
-
-
-### 64-bit Build on 64-bit Solaris
-
-Add
-
-    --host x86_64-pc-solaris CFLAGS='-O3 -m64' LDFLAGS=-m64
-
-to the `configure` command line.
-
-
-### 32-bit Build on 64-bit FreeBSD
-
-Add
-
-    --host i386-unknown-freebsd CFLAGS='-O3 -m32' LDFLAGS=-m32
-
-to the `configure` command line.  NASM 2.07 or later from FreeBSD ports must be
-installed.
-
-
-### Oracle Solaris Studio
-
-Add
-
-    CC=cc
-
-to the `configure` command line.  libjpeg-turbo will automatically be built
-with the maximum optimization level (-xO5) unless you override `CFLAGS`.
-
-To build a 64-bit version of libjpeg-turbo using Oracle Solaris Studio, add
-
-    --host x86_64-pc-solaris CC=cc CFLAGS='-xO5 -m64' LDFLAGS=-m64
-
-to the `configure` command line.
-
-
-### MinGW Build on Cygwin
-
-Use CMake (see recipes below)
-
-
-Building libjpeg-turbo for iOS
-------------------------------
-
-iOS platforms, such as the iPhone and iPad, use ARM processors, and all
-currently supported models include NEON instructions.  Thus, they can take
-advantage of libjpeg-turbo's SIMD extensions to significantly accelerate JPEG
-compression/decompression.  This section describes how to build libjpeg-turbo
-for these platforms.
-
-
-### Additional build requirements
-
-- For configurations that require [gas-preprocessor.pl]
-  (https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl),
-  it should be installed in your `PATH`.
-
-
-### ARMv7 (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 3GS-4S/iPad 1st-3rd Generation and newer:
-
-#### Xcode 4.2 and earlier (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-
-    export host_alias=arm-apple-darwin10
-    export CC=${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
-    export CFLAGS="-mfloat-abi=softfp -isysroot ${IOS_SYSROOT[0]} -O3 -march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -miphoneos-version-min=3.0"
-
-    cd {build_directory}
-    sh {source_directory}/configure [additional configure flags]
-    make
-
-#### Xcode 4.3-4.6 (LLVM-GCC)
-
-Same as above, but replace the first line with:
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-
-#### Xcode 5 and later (Clang)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-
-    export host_alias=arm-apple-darwin10
-    export CC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-    export CFLAGS="-mfloat-abi=softfp -isysroot ${IOS_SYSROOT[0]} -O3 -arch armv7 -miphoneos-version-min=3.0"
-    export CCASFLAGS="$CFLAGS -no-integrated-as"
-
-    cd {build_directory}
-    sh {source_directory}/configure [additional configure flags]
-    make
-
-
-### ARMv7s (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 5/iPad 4th Generation and newer:
-
-#### Xcode 4.5-4.6 (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-
-    export host_alias=arm-apple-darwin10
-    export CC=${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2
-    export CFLAGS="-mfloat-abi=softfp -isysroot ${IOS_SYSROOT[0]} -O3 -march=armv7s -mcpu=swift -mtune=swift -mfpu=neon -miphoneos-version-min=6.0"
-
-    cd {build_directory}
-    sh {source_directory}/configure [additional configure flags]
-    make
-
-#### Xcode 5 and later (Clang)
-
-Same as the ARMv7 build procedure for Xcode 5 and later, except replace the
-compiler flags as follows:
-
-    export CFLAGS="-mfloat-abi=softfp -isysroot ${IOS_SYSROOT[0]} -O3 -arch armv7s -miphoneos-version-min=6.0"
-
-
-### ARMv8 (64-bit)
-
-**gas-preprocessor.pl required if using Xcode < 6**
-
-The following script demonstrates how to build libjpeg-turbo to run on the
-iPhone 5S/iPad Mini 2/iPad Air and newer.
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-
-    export host_alias=aarch64-apple-darwin
-    export CC=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-    export CFLAGS="-isysroot ${IOS_SYSROOT[0]} -O3 -arch arm64 -miphoneos-version-min=7.0 -funwind-tables"
-
-    cd {build_directory}
-    sh {source_directory}/configure [additional configure flags]
-    make
-
-Once built, lipo can be used to combine the ARMv7, v7s, and/or v8 variants into
-a universal library.
-
-
-Building libjpeg-turbo for Android
-----------------------------------
-
-Building libjpeg-turbo for Android platforms requires the
-[Android NDK](https://developer.android.com/tools/sdk/ndk) and autotools.
-
-
-### ARMv7 (32-bit)
-
-The following is a general recipe script that can be modified for your specific
-needs.
-
-    # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support-- for example,
-      "16", "19", etc.}
-
-    # It should not be necessary to modify the rest
-    HOST=arm-linux-androideabi
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
-    ANDROID_CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
-      -D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
-    export AR=${TOOLCHAIN}/bin/${HOST}-ar
-    export NM=${TOOLCHAIN}/bin/${HOST}-nm
-    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
-    export LD=${TOOLCHAIN}/bin/${HOST}-ld
-    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
-    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
-    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
-    cd {build_directory}
-    sh {source_directory}/configure --host=${HOST} \
-      CFLAGS="${ANDROID_CFLAGS} -O3 -fPIE" \
-      CPPFLAGS="${ANDROID_CFLAGS}" \
-      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
-    make
-
-
-### ARMv8 (64-bit)
-
-The following is a general recipe script that can be modified for your specific
-needs.
-
-    # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
-      is required for a 64-bit build.}
-
-    # It should not be necessary to modify the rest
-    HOST=aarch64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
-    ANDROID_CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-
-    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
-    export AR=${TOOLCHAIN}/bin/${HOST}-ar
-    export NM=${TOOLCHAIN}/bin/${HOST}-nm
-    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
-    export LD=${TOOLCHAIN}/bin/${HOST}-ld
-    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
-    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
-    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
-    cd {build_directory}
-    sh {source_directory}/configure --host=${HOST} \
-      CFLAGS="${ANDROID_CFLAGS} -O3 -fPIE" \
-      CPPFLAGS="${ANDROID_CFLAGS}" \
-      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
-    make
-
-
-### x86 (32-bit)
-
-The following is a general recipe script that can be modified for your specific
-needs.
-
-    # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support-- for example,
-      "16", "19", etc.}
-
-    # It should not be necessary to modify the rest
-    HOST=i686-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86
-    ANDROID_CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
-    export AR=${TOOLCHAIN}/bin/${HOST}-ar
-    export NM=${TOOLCHAIN}/bin/${HOST}-nm
-    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
-    export LD=${TOOLCHAIN}/bin/${HOST}-ld
-    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
-    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
-    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
-    cd {build_directory}
-    sh {source_directory}/configure --host=${HOST} \
-      CFLAGS="${ANDROID_CFLAGS} -O3 -fPIE" \
-      CPPFLAGS="${ANDROID_CFLAGS}" \
-      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
-    make
-
-
-### x86-64 (64-bit)
-
-The following is a general recipe script that can be modified for your specific
-needs.
-
-    # Set these variables to suit your needs
-    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
-    BUILD_PLATFORM={the platform name for the NDK package you installed--
-      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
-    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
-      toolchain directory under ${NDK_PATH}/toolchains/.}
-    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
-      is required for a 64-bit build.}
-
-    # It should not be necessary to modify the rest
-    HOST=x86_64-linux-android
-    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86_64
-    ANDROID_CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
-      -isystem ${NDK_PATH}/sysroot/usr/include \
-      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
-
-    TOOLCHAIN=${NDK_PATH}/toolchains/x86_64-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
-    export CPP=${TOOLCHAIN}/bin/${HOST}-cpp
-    export AR=${TOOLCHAIN}/bin/${HOST}-ar
-    export NM=${TOOLCHAIN}/bin/${HOST}-nm
-    export CC=${TOOLCHAIN}/bin/${HOST}-gcc
-    export LD=${TOOLCHAIN}/bin/${HOST}-ld
-    export RANLIB=${TOOLCHAIN}/bin/${HOST}-ranlib
-    export OBJDUMP=${TOOLCHAIN}/bin/${HOST}-objdump
-    export STRIP=${TOOLCHAIN}/bin/${HOST}-strip
-    cd {build_directory}
-    sh {source_directory}/configure --host=${HOST} \
-      CFLAGS="${ANDROID_CFLAGS} -O3 -fPIE" \
-      CPPFLAGS="${ANDROID_CFLAGS}" \
-      LDFLAGS="${ANDROID_CFLAGS} -pie" --with-simd ${1+"$@"}
-    make
-
-
-If building for Android 4.0.x (API level < 16) or earlier, remove `-fPIE` from
-`CFLAGS` and `-pie` from `LDFLAGS`.
-
-
-Installing libjpeg-turbo
-------------------------
-
-To install libjpeg-turbo after it is built, replace `make` in the build
-instructions with `make install`.
-
-The `--prefix` argument to configure (or the `prefix` configure variable) can
-be used to specify an installation directory of your choosing.  If you don't
-specify an installation directory, then the default is to install libjpeg-turbo
-under **/opt/libjpeg-turbo** and to place the libraries in
-**/opt/libjpeg-turbo/lib32** (32-bit) or **/opt/libjpeg-turbo/lib64** (64-bit.)
-
-The `bindir`, `datadir`, `docdir`, `includedir`, `libdir`, and `mandir`
-configure variables allow a finer degree of control over where specific files in
-the libjpeg-turbo distribution should be installed.  These variables can either
-be specified at configure time or passed as arguments to `make install`.
-
-
-Windows (Visual C++ or MinGW)
-=============================
-
-
-Build Requirements
-------------------
-
-- [CMake](http://www.cmake.org) v2.8.11 or later
-
-- [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
-  * If using NASM, 0.98 or later is required for an x86 build.
-  * If using NASM, 2.05 or later is required for an x86-64 build.
-  * **nasm.exe**/**yasm.exe** should be in your `PATH`.
+### Windows
 
 - Microsoft Visual C++ 2005 or later
 
@@ -613,6 +107,53 @@
 well.
 
 
+### Un*x
+
+The following procedure will build libjpeg-turbo on Unix and Unix-like systems.
+(On Solaris, this generates a 32-bit build.  See "Build Recipes" below for
+64-bit build instructions.)
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" [additional CMake flags] {source_directory}
+    make
+
+This will generate the following files under *{build_directory}*:
+
+**libjpeg.a**<br>
+Static link library for the libjpeg API
+
+**libjpeg.so.{version}** (Linux, Unix)<br>
+**libjpeg.{version}.dylib** (Mac)<br>
+**cygjpeg-{version}.dll** (Cygwin)<br>
+Shared library for the libjpeg API
+
+By default, *{version}* is 62.2.0, 7.2.0, or 8.1.2, depending on whether
+libjpeg v6b (default), v7, or v8 emulation is enabled.  If using Cygwin,
+*{version}* is 62, 7, or 8.
+
+**libjpeg.so** (Linux, Unix)<br>
+**libjpeg.dylib** (Mac)<br>
+Development symlink for the libjpeg API
+
+**libjpeg.dll.a** (Cygwin)<br>
+Import library for the libjpeg API
+
+**libturbojpeg.a**<br>
+Static link library for the TurboJPEG API
+
+**libturbojpeg.so.0.2.0** (Linux, Unix)<br>
+**libturbojpeg.0.2.0.dylib** (Mac)<br>
+**cygturbojpeg-0.dll** (Cygwin)<br>
+Shared library for the TurboJPEG API
+
+**libturbojpeg.so** (Linux, Unix)<br>
+**libturbojpeg.dylib** (Mac)<br>
+Development symlink for the TurboJPEG API
+
+**libturbojpeg.dll.a** (Cygwin)<br>
+Import library for the TurboJPEG API
+
+
 ### Visual C++ (Command Line)
 
     cd {build_directory}
@@ -627,10 +168,10 @@
 **jpeg-static.lib**<br>
 Static link library for the libjpeg API
 
-**sharedlib/jpeg{version}.dll**<br>
+**jpeg{version}.dll**<br>
 DLL for the libjpeg API
 
-**sharedlib/jpeg.lib**<br>
+**jpeg.lib**<br>
 Import library for the libjpeg API
 
 **turbojpeg-static.lib**<br>
@@ -655,9 +196,9 @@
     cd {build_directory}
     cmake -G"Visual Studio 10" [additional CMake flags] {source_directory}
 
-NOTE: Add "Win64" to the generator name (for example, "Visual Studio 10
-Win64") to build a 64-bit version of libjpeg-turbo.  A separate build directory
-must be used for 32-bit and 64-bit builds.
+NOTE: Add "Win64" to the generator name (for example, "Visual Studio 10 Win64")
+to build a 64-bit version of libjpeg-turbo.  A separate build directory must be
+used for 32-bit and 64-bit builds.
 
 You can then open **ALL_BUILD.vcproj** in Visual Studio and build one of the
 configurations in that project ("Debug", "Release", etc.) to generate a full
@@ -668,10 +209,10 @@
 **{configuration}/jpeg-static.lib**<br>
 Static link library for the libjpeg API
 
-**sharedlib/{configuration}/jpeg{version}.dll**<br>
+**{configuration}/jpeg{version}.dll**<br>
 DLL for the libjpeg API
 
-**sharedlib/{configuration}/jpeg.lib**<br>
+**{configuration}/jpeg.lib**<br>
 Import library for the libjpeg API
 
 **{configuration}/turbojpeg-static.lib**<br>
@@ -703,10 +244,10 @@
 **libjpeg.a**<br>
 Static link library for the libjpeg API
 
-**sharedlib/libjpeg-{version}.dll**<br>
+**libjpeg-{version}.dll**<br>
 DLL for the libjpeg API
 
-**sharedlib/libjpeg.dll.a**<br>
+**libjpeg.dll.a**<br>
 Import library for the libjpeg API
 
 **libturbojpeg.a**<br>
@@ -771,15 +312,42 @@
 environment variable to the location of the JDK that you wish to use.  The
 `Java_JAVAC_EXECUTABLE`, `Java_JAVA_EXECUTABLE`, and `Java_JAR_EXECUTABLE`
 CMake variables can also be used to specify alternate commands or locations for
-javac, jar, and java (respectively.)  You can also set the `JAVACFLAGS` CMake
+javac, java, and jar (respectively.)  You can also set the
+`CMAKE_JAVA_COMPILE_FLAGS` CMake variable or the `JAVAFLAGS` environment
 variable to specify arguments that should be passed to the Java compiler when
-building the TurboJPEG classes.
+building the TurboJPEG classes, and the `JAVAARGS` CMake variable to specify
+arguments that should be passed to the JRE when running the TurboJPEG Java unit
+tests.
 
 
 Build Recipes
 -------------
 
 
+### 32-bit Build on 64-bit Linux/Unix/Mac
+
+Use export/setenv to set the following environment variables before running
+CMake:
+
+    CFLAGS=-m32
+    LDFLAGS=-m32
+
+
+### 64-bit Build on Solaris
+
+Use export/setenv to set the following environment variables before running
+CMake:
+
+    CFLAGS=-m64
+    LDFLAGS=-m64
+
+
+### Other Compilers
+
+On Un*x systems, prior to running CMake, you can set the `CC` environment
+variable to the command used to invoke the C compiler.
+
+
 ### 32-bit MinGW Build on Un*x (including Mac and Cygwin)
 
 Create a file called **toolchain.cmake** under *{build_directory}*, with the
@@ -818,8 +386,315 @@
     make
 
 
+Building libjpeg-turbo for iOS
+------------------------------
+
+iOS platforms, such as the iPhone and iPad, use ARM processors, and all
+currently supported models include NEON instructions.  Thus, they can take
+advantage of libjpeg-turbo's SIMD extensions to significantly accelerate JPEG
+compression/decompression.  This section describes how to build libjpeg-turbo
+for these platforms.
+
+
+### Additional build requirements
+
+- For configurations that require
+  [gas-preprocessor.pl](https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl),
+  it should be installed in your `PATH`.
+
+
+### ARMv7 (32-bit)
+
+**gas-preprocessor.pl required**
+
+The following scripts demonstrate how to build libjpeg-turbo to run on the
+iPhone 3GS-4S/iPad 1st-3rd Generation and newer:
+
+#### Xcode 4.2 and earlier (LLVM-GCC)
+
+    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
+    export CFLAGS="-mfloat-abi=softfp -march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -miphoneos-version-min=3.0"
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Darwin)
+    set(CMAKE_SYSTEM_PROCESSOR arm)
+    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
+      [additional CMake flags] {source_directory}
+    make
+
+#### Xcode 4.3-4.6 (LLVM-GCC)
+
+Same as above, but replace the first line with:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+
+#### Xcode 5 and later (Clang)
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
+    export CFLAGS="-mfloat-abi=softfp -arch armv7 -miphoneos-version-min=3.0"
+    export ASMFLAGS="-no-integrated-as"
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Darwin)
+    set(CMAKE_SYSTEM_PROCESSOR arm)
+    set(CMAKE_C_COMPILER /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang)
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
+      [additional CMake flags] {source_directory}
+    make
+
+
+### ARMv7s (32-bit)
+
+**gas-preprocessor.pl required**
+
+The following scripts demonstrate how to build libjpeg-turbo to run on the
+iPhone 5/iPad 4th Generation and newer:
+
+#### Xcode 4.5-4.6 (LLVM-GCC)
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
+    export CFLAGS="-Wall -mfloat-abi=softfp -march=armv7s -mcpu=swift -mtune=swift -mfpu=neon -miphoneos-version-min=6.0"
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Darwin)
+    set(CMAKE_SYSTEM_PROCESSOR arm)
+    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
+      [additional CMake flags] {source_directory}
+    make
+
+#### Xcode 5 and later (Clang)
+
+Same as the ARMv7 build procedure for Xcode 5 and later, except replace the
+compiler flags as follows:
+
+    export CFLAGS="-Wall -mfloat-abi=softfp -arch armv7s -miphoneos-version-min=6.0"
+
+
+### ARMv8 (64-bit)
+
+**gas-preprocessor.pl required if using Xcode < 6**
+
+The following script demonstrates how to build libjpeg-turbo to run on the
+iPhone 5S/iPad Mini 2/iPad Air and newer:
+
+    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
+    export CFLAGS="-Wall -arch arm64 -miphoneos-version-min=7.0 -funwind-tables"
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Darwin)
+    set(CMAKE_SYSTEM_PROCESSOR aarch64)
+    set(CMAKE_C_COMPILER /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang)
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
+      [additional CMake flags] {source_directory}
+    make
+
+Once built, lipo can be used to combine the ARMv7, v7s, and/or v8 variants into
+a universal library.
+
+
+Building libjpeg-turbo for Android
+----------------------------------
+
+Building libjpeg-turbo for Android platforms requires the
+[Android NDK](https://developer.android.com/tools/sdk/ndk).
+
+
+### ARMv7 (32-bit)
+
+The following is a general recipe script that can be modified for your specific
+needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support-- for example,
+      "16", "19", etc.}
+
+    # It should not be necessary to modify the rest
+    HOST=arm-linux-androideabi
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm
+    export CFLAGS="-march=armv7-a -mfloat-abi=softfp -fprefetch-loop-arrays \
+      -D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
+      -isystem ${NDK_PATH}/sysroot/usr/include \
+      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
+    export LDFLAGS=-pie
+    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Linux)
+    set(CMAKE_SYSTEM_PROCESSOR arm)
+    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
+    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+      [additional CMake flags] {source_directory}
+    make
+
+
+### ARMv8 (64-bit)
+
+The following is a general recipe script that can be modified for your specific
+needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+      is required for a 64-bit build.}
+
+    # It should not be necessary to modify the rest
+    HOST=aarch64-linux-android
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-arm64
+    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
+      -isystem ${NDK_PATH}/sysroot/usr/include \
+      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
+    export LDFLAGS=-pie
+    TOOLCHAIN=${NDK_PATH}/toolchains/${HOST}-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Linux)
+    set(CMAKE_SYSTEM_PROCESSOR aarch64)
+    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
+    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+      [additional CMake flags] {source_directory}
+    make
+
+
+### x86 (32-bit)
+
+The following is a general recipe script that can be modified for your specific
+needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support-- for example,
+      "16", "19", etc.}
+
+    # It should not be necessary to modify the rest
+    HOST=i686-linux-android
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86
+    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
+      -isystem ${NDK_PATH}/sysroot/usr/include \
+      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
+    export LDFLAGS=-pie
+    TOOLCHAIN=${NDK_PATH}/toolchains/x86-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Linux)
+    set(CMAKE_SYSTEM_PROCESSOR i386)
+    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
+    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+      [additional CMake flags] {source_directory}
+    make
+
+
+### x86-64 (64-bit)
+
+The following is a general recipe script that can be modified for your specific
+needs.
+
+    # Set these variables to suit your needs
+    NDK_PATH={full path to the "ndk" directory-- for example, /opt/android/sdk/ndk-bundle}
+    BUILD_PLATFORM={the platform name for the NDK package you installed--
+      for example, "windows-x86" or "linux-x86_64" or "darwin-x86_64"}
+    TOOLCHAIN_VERSION={"4.8", "4.9", "clang3.5", etc.  This corresponds to a
+      toolchain directory under ${NDK_PATH}/toolchains/.}
+    ANDROID_VERSION={The minimum version of Android to support.  "21" or later
+      is required for a 64-bit build.}
+
+    # It should not be necessary to modify the rest
+    HOST=x86_64-linux-android
+    SYSROOT=${NDK_PATH}/platforms/android-${ANDROID_VERSION}/arch-x86_64
+    export CFLAGS="-D__ANDROID_API__=${ANDROID_VERSION} --sysroot=${SYSROOT} \
+      -isystem ${NDK_PATH}/sysroot/usr/include \
+      -isystem ${NDK_PATH}/sysroot/usr/include/${HOST}"
+    export LDFLAGS=-pie
+    TOOLCHAIN=${NDK_PATH}/toolchains/x86_64-${TOOLCHAIN_VERSION}/prebuilt/${BUILD_PLATFORM}
+
+    cat <<EOF >toolchain.cmake
+    set(CMAKE_SYSTEM_NAME Linux)
+    set(CMAKE_SYSTEM_PROCESSOR x86_64)
+    set(CMAKE_C_COMPILER ${TOOLCHAIN}/bin/${HOST}-gcc)
+    set(CMAKE_FIND_ROOT_PATH ${TOOLCHAIN}/${HOST})
+    EOF
+
+    cd {build_directory}
+    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
+      -DCMAKE_POSITION_INDEPENDENT_CODE=1 \
+      [additional CMake flags] {source_directory}
+    make
+
+
+If building for Android 4.0.x (API level < 16) or earlier, remove
+`-DCMAKE_POSITION_INDEPENDENT_CODE=1` from the CMake arguments and `-pie` from
+`LDFLAGS`.
+
+
+Advanced CMake Options
+----------------------
+
+To list and configure other CMake options not specifically mentioned in this
+guide, run
+
+    ccmake {source_directory}
+
+or
+
+    cmake-gui {source_directory}
+
+from the build directory after initially configuring the build.  CCMake is a
+text-based interactive version of CMake, and CMake-GUI is a GUI version.  Both
+will display all variables that are relevant to the libjpeg-turbo build, their
+current values, and a help string describing what they do.
+
+
 Installing libjpeg-turbo
-------------------------
+========================
 
 You can use the build system to install libjpeg-turbo (as opposed to creating
 an installer package.)  To do this, run `make install` or `nmake install`
@@ -843,6 +718,37 @@
 **c:\libjpeg-turbo-gcc64**<br>
 MinGW 64-bit build
 
+**/opt/libjpeg-turbo**<br>
+Un*x
+
+The default value of `CMAKE_INSTALL_PREFIX` causes the libjpeg-turbo files to
+be installed with a directory structure resembling that of the official
+libjpeg-turbo binary packages.  Changing the value of `CMAKE_INSTALL_PREFIX`
+(for instance, to **/usr/local**) causes the libjpeg-turbo files to be
+installed with a directory structure that conforms to GNU standards.
+
+The `CMAKE_INSTALL_BINDIR`, `CMAKE_INSTALL_DATAROOTDIR`,
+`CMAKE_INSTALL_DOCDIR`, `CMAKE_INSTALL_INCLUDEDIR`, `CMAKE_INSTALL_JAVADIR`,
+`CMAKE_INSTALL_LIBDIR`, and `CMAKE_INSTALL_MANDIR` CMake variables allow a
+finer degree of control over where specific files in the libjpeg-turbo
+distribution should be installed.  These directory variables can either be
+specified as absolute paths or as paths relative to `CMAKE_INSTALL_PREFIX` (for
+instance, setting `CMAKE_INSTALL_DOCDIR` to **doc** would cause the
+documentation to be installed in **${CMAKE\_INSTALL\_PREFIX}/doc**.)  If a
+directory variable contains the name of another directory variable in angle
+brackets, then its final value will depend on the final value of that other
+variable.  For instance, the default value of `CMAKE_INSTALL_MANDIR` is
+**\<CMAKE\_INSTALL\_DATAROOTDIR\>/man**.
+
+NOTE: If setting one of these directory variables to a relative path using the
+CMake command line, you must specify that the variable is of type `PATH`.
+For example:
+
+    cmake -G"{generator type}" -DCMAKE_INSTALL_LIBDIR:PATH=lib {source_directory}
+
+Otherwise, CMake will assume that the path is relative to the build directory
+rather than the install directory.
+
 
 Creating Distribution Packages
 ==============================
@@ -879,40 +785,35 @@
 Packages built in this manner can be installed on OS X 10.5 and later, but they
 must be built on OS X 10.6 or later.
 
-    make udmg [BUILDDIR32={32-bit build directory}]
+    make udmg
 
-On 64-bit OS X systems, this creates a Mac package/disk image that contains
-universal i386/x86-64 binaries.  You should first configure a 32-bit
-out-of-tree build of libjpeg-turbo, then configure a 64-bit out-of-tree build,
-then run `make udmg` from the 64-bit build directory.  The build system will
-look for the 32-bit build under *{source_directory}*/osxx86 by default, but you
-can override this by setting the `BUILDDIR32` variable on the make command line
-as shown above.
+This creates a Mac package/disk image that contains universal x86-64/i386/ARM
+binaries.  The following CMake variables control which architectures are
+included in the universal binaries.  Setting any of these variables to an empty
+string excludes that architecture from the package.
 
-    make iosdmg [BUILDDIR32={32-bit build directory}] \
-      [BUILDDIRARMV7={ARMv7 build directory}] \
-      [BUILDDIRARMV7S={ARMv7s build directory}] \
-      [BUILDDIRARMV8={ARMv8 build directory}]
+* `OSX_32BIT_BUILD`: Directory containing an i386 (32-bit) Mac build of
+  libjpeg-turbo (default: *{source_directory}*/osxx86)
+* `IOS_ARMV7_BUILD`: Directory containing an ARMv7 (32-bit) iOS build of
+  libjpeg-turbo (default: *{source_directory}*/iosarmv7)
+* `IOS_ARMV7S_BUILD`: Directory containing an ARMv7s (32-bit) iOS build of
+  libjpeg-turbo (default: *{source_directory}*/iosarmv7s)
+* `IOS_ARMV8_BUILD`: Directory containing an ARMv8 (64-bit) iOS build of
+  libjpeg-turbo (default: *{source_directory}*/iosarmv8)
 
-This creates a Mac package/disk image in which the libjpeg-turbo libraries
-contain ARM architectures necessary to build iOS applications.  If building on
-an x86-64 system, the binaries will also contain the i386 architecture, as with
-`make udmg` above.  You should first configure ARMv7, ARMv7s, and/or ARMv8
-out-of-tree builds of libjpeg-turbo (see "Building libjpeg-turbo for iOS"
-above.)  If you are building an x86-64 version of libjpeg-turbo, you should
-configure a 32-bit out-of-tree build as well.  Next, build libjpeg-turbo as you
-would normally, using an out-of-tree build.  When it is built, run `make
-iosdmg` from the build directory.  The build system will look for the ARMv7
-build under *{source_directory}*/iosarmv7 by default, the ARMv7s build under
-*{source_directory}*/iosarmv7s by default, the ARMv8 build under
-*{source_directory}*/iosarmv8 by default, and (if applicable) the 32-bit build
-under *{source_directory}*/osxx86 by default, but you can override this by
-setting the `BUILDDIR32`, `BUILDDIRARMV7`, `BUILDDIRARMV7S`, and/or
-`BUILDDIRARMV8` variables on the `make` command line as shown above.
+You should first use CMake to configure i386, ARMv7, ARMv7s, and/or ARMv8
+sub-builds of libjpeg-turbo (see "Build Recipes" and "Building libjpeg-turbo
+for iOS" above) in build directories that match those specified in the
+aforementioned CMake variables.  Next, configure the primary build of
+libjpeg-turbo as an out-of-tree build, and build it.  Once the primary build
+has been built, run `make udmg` from the build directory.  The packaging system
+will build the sub-builds, use lipo to combine them into a single set of
+universal binaries, then package the universal binaries in the same manner as
+`make dmg`.
 
-NOTE: If including an ARMv8 build in the package, then you may need to use
-Xcode's version of lipo instead of the operating system's.  To do this, pass
-an argument of `LIPO="xcrun lipo"` on the make command line.
+
+Cygwin
+------
 
     make cygwinpkg
 
diff --git a/Brewfile b/Brewfile
index 02d8457..4a9cb3d 100644
--- a/Brewfile
+++ b/Brewfile
@@ -1,4 +1,4 @@
-brew 'nasm'
+brew 'yasm'
 brew 'gcc@5'
 brew 'md5sha1sum'
 cask 'Caskroom/versions/java6'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 296e408..312eab4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,15 +1,7 @@
-#
-# Setup
-#
-
-cmake_minimum_required(VERSION 2.8.11)
-# Use LINK_INTERFACE_LIBRARIES instead of INTERFACE_LINK_LIBRARIES
-if(POLICY CMP0022)
-  cmake_policy(SET CMP0022 OLD)
-endif()
+cmake_minimum_required(VERSION 2.8.12)
 
 project(libjpeg-turbo C)
-set(VERSION 1.5.4)
+set(VERSION 1.5.80)
 string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
 list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
 list(GET VERSION_TRIPLET 1 VERSION_MINOR)
@@ -29,116 +21,281 @@
 pad_number(VERSION_REVISION 3)
 set(LIBJPEG_TURBO_VERSION_NUMBER ${VERSION_MAJOR}${VERSION_MINOR}${VERSION_REVISION})
 
-if(NOT WIN32)
-  message(FATAL_ERROR "Platform not supported by this build system.  Use autotools instead.")
-endif()
+string(TIMESTAMP DEFAULT_BUILD "%Y%m%d")
+set(BUILD ${DEFAULT_BUILD} CACHE STRING "Build string (default: ${DEFAULT_BUILD})")
 
-string(TIMESTAMP BUILD "%Y%m%d")
-
-# This does nothing except when using MinGW.  CMAKE_BUILD_TYPE has no meaning
-# in Visual Studio, and it always defaults to Debug when using NMake.
+# NOTE: On Windows, this does nothing except when using MinGW or Cygwin.
+# CMAKE_BUILD_TYPE has no meaning in Visual Studio, and it always defaults to
+# Debug when using NMake.
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
-
 message(STATUS "CMAKE_BUILD_TYPE = ${CMAKE_BUILD_TYPE}")
 
-# This only works if building from the command line.  There is currently no way
-# to set a variable's value based on the build type when using Visual Studio.
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  set(BUILD "${BUILD}d")
-endif()
-
 message(STATUS "VERSION = ${VERSION}, BUILD = ${BUILD}")
 
-option(WITH_SIMD "Include SIMD extensions" TRUE)
-option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
-option(WITH_ARITH_DEC "Include arithmetic decoding support when emulating the libjpeg v6b API/ABI" TRUE)
-option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
-option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b)" FALSE)
-option(WITH_MEM_SRCDST "Include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI" TRUE)
-option(WITH_TURBOJPEG "Include the TurboJPEG wrapper library and associated test programs" TRUE)
-option(WITH_JAVA "Build Java wrapper for the TurboJPEG library" FALSE)
-option(WITH_12BIT "Encode/decode JPEG images with 12-bit samples (implies WITH_SIMD=0 WITH_TURBOJPEG=0 WITH_ARITH_ENC=0 WITH_ARITH_DEC=0)" FALSE)
-option(ENABLE_STATIC "Build static libraries" TRUE)
+# Detect CPU type and whether we're building 64-bit or 32-bit code
+math(EXPR BITS "${CMAKE_SIZEOF_VOID_P} * 8")
+string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} CMAKE_SYSTEM_PROCESSOR_LC)
+if(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86_64" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "amd64" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "i[0-9]86" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ia32")
+  if(BITS EQUAL 64)
+    set(CPU_TYPE x86_64)
+  else()
+    set(CPU_TYPE i386)
+  endif()
+  if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL ${CPU_TYPE})
+    set(CMAKE_SYSTEM_PROCESSOR ${CPU_TYPE})
+  endif()
+elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "arm*64*")
+  set(CPU_TYPE arm64)
+elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "arm*")
+  set(CPU_TYPE arm)
+elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ppc*" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "powerpc*")
+  set(CPU_TYPE powerpc)
+else()
+  set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC})
+endif()
+message(STATUS "${BITS}-bit build (${CPU_TYPE})")
+
+
+###############################################################################
+# INSTALL DIRECTORIES
+###############################################################################
+
+if(WIN32)
+  if(MSVC)
+    set(CMAKE_INSTALL_DEFAULT_PREFIX "c:/${CMAKE_PROJECT_NAME}")
+  else()
+    set(CMAKE_INSTALL_DEFAULT_PREFIX "c:/${CMAKE_PROJECT_NAME}-gcc")
+  endif()
+  if(BITS EQUAL 64)
+    set(CMAKE_INSTALL_DEFAULT_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}64")
+  endif()
+else()
+  set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+endif()
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}" CACHE PATH
+    "Directory into which to install ${CMAKE_PROJECT_NAME} (default: ${CMAKE_INSTALL_DEFAULT_PREFIX})"
+    FORCE)
+endif()
+message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")
+
+# When the prefix is the platform default, we assume that an "official" build
+# is being created, and thus we install things into specific locations.
+
+if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAKE_INSTALL_DEFAULT_PREFIX}")
+  set(CMAKE_INSTALL_DEFAULT_DATAROOTDIR "")
+  set(CMAKE_INSTALL_DEFAULT_DOCDIR "<CMAKE_INSTALL_DATAROOTDIR>/doc")
+  set(CMAKE_INSTALL_DEFAULT_JAVADIR "<CMAKE_INSTALL_DATAROOTDIR>/classes")
+  if(UNIX AND NOT APPLE)
+    if(BITS EQUAL 64)
+      set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+    else()
+      set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32")
+    endif()
+  endif()
+endif()
+
+include(cmakescripts/GNUInstallDirs.cmake)
+
+set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_FULL_LIBDIR})
+
+macro(report_directory var)
+  if(CMAKE_INSTALL_${var} STREQUAL CMAKE_INSTALL_FULL_${var})
+    message(STATUS "CMAKE_INSTALL_${var} = ${CMAKE_INSTALL_${var}}")
+  else()
+    message(STATUS "CMAKE_INSTALL_${var} = ${CMAKE_INSTALL_${var}} (${CMAKE_INSTALL_FULL_${var}})")
+  endif()
+  mark_as_advanced(CLEAR CMAKE_INSTALL_${var})
+endmacro()
+
+set(DIRLIST "BINDIR;DATAROOTDIR;DOCDIR;INCLUDEDIR;LIBDIR")
+if(UNIX)
+  list(APPEND DIRLIST "MANDIR")
+endif()
+foreach(dir ${DIRLIST})
+  report_directory(${dir})
+endforeach()
+
+
+###############################################################################
+# CONFIGURATION OPTIONS
+###############################################################################
+
+macro(boolean_number var)
+  if(${var})
+    set(${var} 1)
+  else()
+    set(${var} 0)
+  endif()
+endmacro()
+
 option(ENABLE_SHARED "Build shared libraries" TRUE)
+boolean_number(ENABLE_SHARED)
+option(ENABLE_STATIC "Build static libraries" TRUE)
+boolean_number(ENABLE_STATIC)
+option(REQUIRE_SIMD "Generate a fatal error if SIMD extensions are not available for this platform (default is to fall back to a non-SIMD build)" FALSE)
+boolean_number(REQUIRE_SIMD)
+option(WITH_12BIT "Encode/decode JPEG images with 12-bit samples (implies WITH_ARITH_DEC=0 WITH_ARITH_ENC=0 WITH_JAVA=0 WITH_SIMD=0 WITH_TURBOJPEG=0 )" FALSE)
+boolean_number(WITH_12BIT)
+option(WITH_ARITH_DEC "Include arithmetic decoding support when emulating the libjpeg v6b API/ABI" TRUE)
+boolean_number(WITH_ARITH_DEC)
+option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
+boolean_number(WITH_ARITH_ENC)
+option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
+boolean_number(WITH_JAVA)
+option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
+boolean_number(WITH_JPEG7)
+option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
+boolean_number(WITH_JPEG8)
+option(WITH_MEM_SRCDST "Include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI" TRUE)
+boolean_number(WITH_MEM_SRCDST)
+option(WITH_SIMD "Include SIMD extensions, if available for this platform" TRUE)
+boolean_number(WITH_SIMD)
+option(WITH_TURBOJPEG "Include the TurboJPEG API library and associated test programs" TRUE)
+boolean_number(WITH_TURBOJPEG)
+
+macro(report_option var desc)
+  if(${var})
+    message(STATUS "${desc} enabled (${var} = ${${var}})")
+  else()
+    message(STATUS "${desc} disabled (${var} = ${${var}})")
+  endif()
+endmacro()
+
+if(WITH_JAVA)
+  set(ENABLE_SHARED 1)
+endif()
+
+# Explicitly setting CMAKE_POSITION_INDEPENDENT_CODE=FALSE disables PIC for all
+# targets, which will cause the shared library builds to fail.  Thus, if shared
+# libraries are enabled and CMAKE_POSITION_INDEPENDENT_CODE is explicitly set
+# to FALSE, we need to unset it, thus restoring the default behavior
+# (automatically using PIC for shared library targets.)
+if(DEFINED CMAKE_POSITION_INDEPENDENT_CODE AND
+  NOT CMAKE_POSITION_INDEPENDENT_CODE AND ENABLE_SHARED)
+  unset(CMAKE_POSITION_INDEPENDENT_CODE CACHE)
+endif()
+
+report_option(ENABLE_SHARED "Shared libraries")
+report_option(ENABLE_STATIC "Static libraries")
 
 if(WITH_12BIT)
-  set(WITH_SIMD FALSE)
-  set(WITH_TURBOJPEG FALSE)
-  set(WITH_JAVA FALSE)
-  set(WITH_ARITH_ENC FALSE)
-  set(WITH_ARITH_DEC FALSE)
+  set(WITH_ARITH_DEC 0)
+  set(WITH_ARITH_ENC 0)
+  set(WITH_JAVA 0)
+  set(WITH_SIMD 0)
+  set(WITH_TURBOJPEG 0)
   set(BITS_IN_JSAMPLE 12)
-  message(STATUS "12-bit JPEG support enabled")
 else()
   set(BITS_IN_JSAMPLE 8)
 endif()
+report_option(WITH_12BIT "12-bit JPEG support")
 
 if(WITH_JPEG8 OR WITH_JPEG7)
   set(WITH_ARITH_ENC 1)
   set(WITH_ARITH_DEC 1)
 endif()
 if(WITH_JPEG8)
-  set(WITH_MEM_SRCDST 1)
-endif()
-
-if(WITH_ARITH_ENC)
-  set(C_ARITH_CODING_SUPPORTED 1)
-  message(STATUS "Arithmetic encoding support enabled")
-else()
-  message(STATUS "Arithmetic encoding support disabled")
+  set(WITH_MEM_SRCDST 0)
 endif()
 
 if(WITH_ARITH_DEC)
   set(D_ARITH_CODING_SUPPORTED 1)
-  message(STATUS "Arithmetic decoding support enabled")
-else()
-  message(STATUS "Arithmetic decoding support disabled")
+endif()
+if(NOT WITH_12BIT)
+  report_option(WITH_ARITH_DEC "Arithmetic decoding support")
 endif()
 
-if(WITH_TURBOJPEG)
-  message(STATUS "TurboJPEG C wrapper enabled")
-else()
-  message(STATUS "TurboJPEG C wrapper disabled")
+if(WITH_ARITH_ENC)
+  set(C_ARITH_CODING_SUPPORTED 1)
+endif()
+if(NOT WITH_12BIT)
+  report_option(WITH_ARITH_ENC "Arithmetic encoding support")
 endif()
 
-if(WITH_JAVA)
-  message(STATUS "TurboJPEG Java wrapper enabled")
-else()
-  message(STATUS "TurboJPEG Java wrapper disabled")
+if(NOT WITH_12BIT)
+  report_option(WITH_TURBOJPEG "TurboJPEG API library")
+  report_option(WITH_JAVA "TurboJPEG Java wrapper")
 endif()
 
-set(SO_AGE 0)
-if(WITH_MEM_SRCDST)
-  set(SO_AGE 1)
-endif()
-
-set(JPEG_LIB_VERSION 62)
-set(DLL_VERSION ${JPEG_LIB_VERSION})
-set(FULLVERSION ${DLL_VERSION}.${SO_AGE}.0)
-if(WITH_JPEG8)
-  set(JPEG_LIB_VERSION 80)
-  set(DLL_VERSION 8)
-  set(FULLVERSION ${DLL_VERSION}.0.2)
-  message(STATUS "Emulating libjpeg v8 API/ABI")
-elseif(WITH_JPEG7)
-  set(JPEG_LIB_VERSION 70)
-  set(DLL_VERSION 7)
-  set(FULLVERSION ${DLL_VERSION}.${SO_AGE}.0)
-  message(STATUS "Emulating libjpeg v7 API/ABI")
-endif(WITH_JPEG8)
-
 if(WITH_MEM_SRCDST)
   set(MEM_SRCDST_SUPPORTED 1)
-  message(STATUS "In-memory source/destination managers enabled")
-else()
-  message(STATUS "In-memory source/destination managers disabled")
+  set(MEM_SRCDST_FUNCTIONS "global:  jpeg_mem_dest;  jpeg_mem_src;")
 endif()
+if(NOT WITH_JPEG8)
+  report_option(WITH_MEM_SRCDST "In-memory source/destination managers")
+endif()
+
+set(SO_AGE 2)
+if(WITH_MEM_SRCDST)
+  set(SO_AGE 3)
+endif()
+
+if(WITH_JPEG8)
+  set(JPEG_LIB_VERSION 80)
+elseif(WITH_JPEG7)
+  set(JPEG_LIB_VERSION 70)
+else()
+  set(JPEG_LIB_VERSION 62)
+endif()
+
+math(EXPR JPEG_LIB_VERSION_DIV10 "${JPEG_LIB_VERSION} / 10")
+math(EXPR JPEG_LIB_VERSION_MOD10 "${JPEG_LIB_VERSION} % 10")
+if(JPEG_LIB_VERSION STREQUAL "62")
+  set(DEFAULT_SO_MAJOR_VERSION ${JPEG_LIB_VERSION})
+else()
+  set(DEFAULT_SO_MAJOR_VERSION ${JPEG_LIB_VERSION_DIV10})
+endif()
+if(JPEG_LIB_VERSION STREQUAL "80")
+  set(DEFAULT_SO_MINOR_VERSION 2)
+else()
+  set(DEFAULT_SO_MINOR_VERSION 0)
+endif()
+
+# This causes SO_MAJOR_VERSION/SO_MINOR_VERSION to reset to defaults if
+# WITH_JPEG7 or WITH_JPEG8 has changed.
+if((DEFINED WITH_JPEG7_INT AND NOT WITH_JPEG7 EQUAL WITH_JPEG7_INT) OR
+  (DEFINED WITH_JPEG8_INT AND NOT WITH_JPEG8 EQUAL WITH_JPEG8_INT))
+  set(FORCE_SO_VERSION "FORCE")
+endif()
+set(WITH_JPEG7_INT ${WITH_JPEG7} CACHE INTERNAL "")
+set(WITH_JPEG8_INT ${WITH_JPEG8} CACHE INTERNAL "")
+
+set(SO_MAJOR_VERSION ${DEFAULT_SO_MAJOR_VERSION} CACHE STRING
+  "Major version of the libjpeg API shared library (default: ${DEFAULT_SO_MAJOR_VERSION})"
+  ${FORCE_SO_VERSION})
+set(SO_MINOR_VERSION ${DEFAULT_SO_MINOR_VERSION} CACHE STRING
+  "Minor version of the libjpeg API shared library (default: ${DEFAULT_SO_MINOR_VERSION})"
+  ${FORCE_SO_VERSION})
+
+set(JPEG_LIB_VERSION_DECIMAL "${JPEG_LIB_VERSION_DIV10}.${JPEG_LIB_VERSION_MOD10}")
+message(STATUS "Emulating libjpeg API/ABI v${JPEG_LIB_VERSION_DECIMAL} (WITH_JPEG7 = ${WITH_JPEG7}, WITH_JPEG8 = ${WITH_JPEG8})")
+message(STATUS "libjpeg API shared library version = ${SO_MAJOR_VERSION}.${SO_AGE}.${SO_MINOR_VERSION}")
+
+# Because the TurboJPEG API library uses versioned symbols and changes the
+# names of functions whenever they are modified in a backward-incompatible
+# manner, it is always backward-ABI-compatible with itself, so the major and
+# minor SO versions don't change.  However, we increase the middle number (the
+# SO "age") whenever functions are added to the API.
+set(TURBOJPEG_SO_MAJOR_VERSION 0)
+set(TURBOJPEG_SO_VERSION 0.2.0)
+
+
+###############################################################################
+# COMPILER SETTINGS
+###############################################################################
 
 if(MSVC)
   option(WITH_CRT_DLL
-    "Link all libjpeg-turbo libraries and executables with the C run-time DLL (msvcr*.dll) instead of the static C run-time library (libcmt*.lib.)  The default is to use the C run-time DLL only with the libraries and executables that need it."
+    "Link all ${CMAKE_PROJECT_NAME} libraries and executables with the C run-time DLL (msvcr*.dll) instead of the static C run-time library (libcmt*.lib.)  The default is to use the C run-time DLL only with the libraries and executables that need it."
     FALSE)
   if(NOT WITH_CRT_DLL)
     # Use the static C library for all build types
@@ -149,45 +306,32 @@
       endif()
     endforeach()
   endif()
-  add_definitions(-W3 -wd4996)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W3 /wd4996")
 endif()
 
-# Detect whether compiler is 64-bit
-if(MSVC AND CMAKE_CL_64)
-  set(SIMD_X86_64 1)
-  set(64BIT 1)
-elseif(CMAKE_SIZEOF_VOID_P MATCHES 8)
-  set(SIMD_X86_64 1)
-  set(64BIT 1)
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  # Use the maximum optimization level for release builds
+  foreach(var CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO)
+    if(${var} MATCHES "-O2")
+      string(REGEX REPLACE "-O2" "-O3" ${var} "${${var}}")
+    endif()
+  endforeach()
 endif()
 
-if(64BIT)
-  message(STATUS "64-bit build")
-else()
-  message(STATUS "32-bit build")
-endif()
-
-if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-  if(MSVC)
-    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_PROJECT_NAME})
-  else()
-    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_PROJECT_NAME}-gcc)
+if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+  if(CMAKE_C_COMPILER_ID MATCHES "SunPro")
+    # Use the maximum optimization level for release builds
+    foreach(var CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${var} MATCHES "-xO3")
+        string(REGEX REPLACE "-xO3" "-xO5" ${var} "${${var}}")
+      endif()
+      if(${var} MATCHES "-xO2")
+        string(REGEX REPLACE "-xO2" "-xO5" ${var} "${${var}}")
+      endif()
+    endforeach()
   endif()
-  if(64BIT)
-    set(CMAKE_INSTALL_PREFIX_DEFAULT ${CMAKE_INSTALL_PREFIX_DEFAULT}64)
-  endif()
-  set(CMAKE_INSTALL_PREFIX "c:/${CMAKE_INSTALL_PREFIX_DEFAULT}" CACHE PATH
-    "Directory into which to install libjpeg-turbo (default: c:/${CMAKE_INSTALL_PREFIX_DEFAULT})"
-    FORCE)
 endif()
 
-message(STATUS "Install directory = ${CMAKE_INSTALL_PREFIX}")
-
-configure_file(win/jconfig.h.in jconfig.h)
-configure_file(win/jconfigint.h.in jconfigint.h)
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR})
-
 string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
 
 set(EFFECTIVE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
@@ -196,26 +340,161 @@
 set(EFFECTIVE_LD_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
 message(STATUS "Linker flags = ${EFFECTIVE_LD_FLAGS}")
 
-if(WITH_JAVA)
-  find_package(Java)
-  find_package(JNI)
-  if(DEFINED JAVACFLAGS)
-    message(STATUS "Java compiler flags = ${JAVACFLAGS}")
+include(CheckCSourceCompiles)
+
+if(UNIX)
+  # Check for headers
+  include(CheckIncludeFiles)
+  check_include_files(locale.h HAVE_LOCALE_H)
+  check_include_files(stddef.h HAVE_STDDEF_H)
+  check_include_files(stdlib.h HAVE_STDLIB_H)
+  check_include_files(sys/types.h NEED_SYS_TYPES_H)
+
+  # Check for functions
+  include(CheckFunctionExists)
+  check_function_exists(memset HAVE_MEMSET)
+  check_function_exists(memcpy HAVE_MEMCPY)
+  if(NOT HAVE_MEMSET AND NOT HAVE_MEMCPY)
+    set(NEED_BSD_STRINGS 1)
+  endif()
+
+  # Check for types
+  include(CheckTypeSize)
+  check_type_size("unsigned char" UNSIGNED_CHAR)
+  check_type_size("unsigned short" UNSIGNED_SHORT)
+  check_type_size("size_t" SIZE_T)
+
+  # Check for compiler features
+  check_c_source_compiles("int main(void) { typedef struct undefined_structure *undef_struct_ptr; }"
+    INCOMPLETE_TYPES)
+  if(INCOMPLETE_TYPES)
+    message(STATUS "Compiler supports pointers to undefined structures.")
+  else()
+    set(INCOMPLETE_TYPES_BROKEN 1)
+    message(STATUS "Compiler does not support pointers to undefined structures.")
+  endif()
+
+  if(CMAKE_CROSSCOMPILING)
+    set(RIGHT_SHIFT_IS_UNSIGNED 0)
+  else()
+    include(CheckCSourceRuns)
+    check_c_source_runs("
+      #include <stdio.h>
+      #include <stdlib.h>
+      int is_shifting_signed (long arg) {
+        long res = arg >> 4;
+        if (res == -0x7F7E80CL)
+          return 1; /* right shift is signed */
+        /* see if unsigned-shift hack will fix it. */
+        /* we can't just test exact value since it depends on width of long... */
+        res |= (~0L) << (32-4);
+        if (res == -0x7F7E80CL)
+          return 0; /* right shift is unsigned */
+        printf(\"Right shift isn't acting as I expect it to.\\\\n\");
+        printf(\"I fear the JPEG software will not work at all.\\\\n\\\\n\");
+        return 0; /* try it with unsigned anyway */
+      }
+      int main (void) {
+        exit(is_shifting_signed(-0x7F7E80B1L));
+      }" RIGHT_SHIFT_IS_UNSIGNED)
+  endif()
+
+  if(CMAKE_CROSSCOMPILING)  # a test program cannot be run on the build host, so fall back to 0
+    set(__CHAR_UNSIGNED__ 0)
+  else()
+    check_c_source_runs("int main(void) { return ((char) -1 < 0); }"  # exits 0 only if (char)-1 is not negative
+      __CHAR_UNSIGNED__)  # result is true when the program exits with status 0, i.e. plain char is unsigned
   endif()
 endif()
 
+if(MSVC)
+  set(INLINE_OPTIONS "__inline;inline")
+else()
+  set(INLINE_OPTIONS "__inline__;inline")
+endif()
+option(FORCE_INLINE "Force function inlining" TRUE)
+boolean_number(FORCE_INLINE)
+if(FORCE_INLINE)
+  if(MSVC)
+    list(INSERT INLINE_OPTIONS 0 "__forceinline")
+  else()
+    list(INSERT INLINE_OPTIONS 0 "inline __attribute__((always_inline))")
+    list(INSERT INLINE_OPTIONS 0 "__inline__ __attribute__((always_inline))")
+  endif()
+endif()
+foreach(inline ${INLINE_OPTIONS})  # candidates are ordered from most to least preferred
+  unset(INLINE_WORKS CACHE)  # the check is skipped when its result is already cached, so clear it per candidate
+  check_c_source_compiles("${inline} static void foo(void) {} int main(void) { foo(); }" INLINE_WORKS)
+  if(INLINE_WORKS)
+    set(INLINE ${inline})  # first candidate that compiles wins
+    break()
+  endif()
+endforeach()
+if(NOT INLINE_WORKS)  # true only if every candidate failed to compile
+  message(FATAL_ERROR "Could not determine how to inline functions.")
+endif()
+message(STATUS "INLINE = ${INLINE} (FORCE_INLINE = ${FORCE_INLINE})")
 
-#
-# Targets
-#
+if(UNIX AND NOT APPLE)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map "VERS_1 { global: *; };")
+  set(CMAKE_REQUIRED_FLAGS
+    "-Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/conftest.map")
+  check_c_source_compiles("int main(void) { return 0; }" HAVE_VERSION_SCRIPT)
+  set(CMAKE_REQUIRED_FLAGS)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map)
+  if(HAVE_VERSION_SCRIPT)
+    message(STATUS "Linker supports GNU-style version scripts")
+    set(MAPFLAG "-Wl,--version-script,")
+    set(TJMAPFLAG "-Wl,--version-script,")
+  else()
+    message(STATUS "Linker does not support GNU-style version scripts")
+    if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+      # The Solaris linker doesn't like our version script for the libjpeg API
+      # library, but the version script for the TurboJPEG API library should
+      # still work.
+      file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map
+        "VERS_1 { global: foo;  local: *; }; VERS_2 { global: foo2; } VERS_1;")
+      set(CMAKE_REQUIRED_FLAGS "-Wl,-M,${CMAKE_CURRENT_BINARY_DIR}/conftest.map")
+      check_c_source_compiles("void foo() {} void foo2() {} int main(void) { return 0; }"
+        HAVE_MAPFILE)
+      set(CMAKE_REQUIRED_FLAGS)
+      file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/conftest.map)
+      if(HAVE_MAPFILE)
+        message(STATUS "Linker supports mapfiles")
+        set(TJMAPFLAG "-Wl,-M,")
+      else()
+        message(STATUS "Linker does not support mapfiles")
+      endif()
+    endif()
+  endif()
+endif()
+
+# Generate files
+if(WIN32)
+  configure_file(win/jconfig.h.in jconfig.h)
+else()
+  configure_file(jconfig.h.in jconfig.h)
+endif()
+configure_file(jconfigint.h.in jconfigint.h)
+if(UNIX)
+  configure_file(libjpeg.map.in libjpeg.map)
+endif()
+
+# Include directories and compiler definitions
+include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+
+###############################################################################
+# TARGETS
+###############################################################################
 
 set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
-  jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
-  jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
-  jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c
-  jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
-  jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
-  jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
+  jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
+  jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
+  jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c
+  jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c
+  jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c
+  jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
 
 if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
   set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)
@@ -230,90 +509,107 @@
 endif()
 
 if(WITH_SIMD)
-  add_definitions(-DWITH_SIMD)
   add_subdirectory(simd)
-  if(SIMD_X86_64)
-    set(JPEG_SOURCES ${JPEG_SOURCES} simd/jsimd_x86_64.c)
-  else()
-    set(JPEG_SOURCES ${JPEG_SOURCES} simd/jsimd_i386.c)
+elseif(NOT WITH_12BIT)
+  message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
+endif()
+if(WITH_SIMD)
+  message(STATUS "SIMD extensions: ${CPU_TYPE} (WITH_SIMD = ${WITH_SIMD})")
+  if(MSVC_IDE)
+    set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
   endif()
-  # This tells CMake that the "source" files haven't been generated yet
-  set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
 else()
-  set(JPEG_SOURCES ${JPEG_SOURCES} jsimd_none.c)
-  message(STATUS "Not using SIMD acceleration")
+  add_library(simd OBJECT jsimd_none.c)
 endif()
 
 if(WITH_JAVA)
   add_subdirectory(java)
-  set(ENABLE_SHARED TRUE)
 endif()
 
 if(ENABLE_SHARED)
   add_subdirectory(sharedlib)
 endif()
 
-if(ENABLE_STATIC OR WITH_TURBOJPEG)
-  add_library(jpeg-static STATIC ${JPEG_SOURCES} ${SIMD_OBJS})
+if(ENABLE_STATIC)
+  add_library(jpeg-static STATIC ${JPEG_SOURCES} $<TARGET_OBJECTS:simd>
+    ${SIMD_OBJS})
   if(NOT MSVC)
     set_target_properties(jpeg-static PROPERTIES OUTPUT_NAME jpeg)
   endif()
-  if(WITH_SIMD)
-    add_dependencies(jpeg-static simd)
-  endif()
 endif()
 
 if(WITH_TURBOJPEG)
-  set(TURBOJPEG_SOURCES turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c)
-  if(WITH_JAVA)
-    set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
-    include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
-  endif()
-
   if(ENABLE_SHARED)
+    set(TURBOJPEG_SOURCES ${JPEG_SOURCES} $<TARGET_OBJECTS:simd> ${SIMD_OBJS}
+      turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c rdbmp.c rdppm.c
+      wrbmp.c wrppm.c)
+    set(TJMAPFILE ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg-mapfile)
+    if(WITH_JAVA)
+      set(TURBOJPEG_SOURCES ${TURBOJPEG_SOURCES} turbojpeg-jni.c)
+      include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
+      set(TJMAPFILE ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg-mapfile.jni)
+    endif()
     add_library(turbojpeg SHARED ${TURBOJPEG_SOURCES})
-    set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
+    set_property(TARGET turbojpeg PROPERTY COMPILE_FLAGS
+      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
+    if(WIN32)
+      set_target_properties(turbojpeg PROPERTIES DEFINE_SYMBOL DLLDEFINE)
+    endif()
     if(MINGW)
       set_target_properties(turbojpeg PROPERTIES LINK_FLAGS -Wl,--kill-at)
     endif()
-    target_link_libraries(turbojpeg jpeg-static)
-    set_target_properties(turbojpeg PROPERTIES LINK_INTERFACE_LIBRARIES "")
+    if(APPLE)
+      set_target_properties(turbojpeg PROPERTIES MACOSX_RPATH 1)
+    endif()
+    set_target_properties(turbojpeg PROPERTIES
+      SOVERSION ${TURBOJPEG_SO_MAJOR_VERSION} VERSION ${TURBOJPEG_SO_VERSION})
+    if(TJMAPFLAG)
+      set_target_properties(turbojpeg PROPERTIES
+        LINK_FLAGS "${TJMAPFLAG}${TJMAPFILE}")
+    endif()
 
-    add_executable(tjunittest tjunittest.c tjutil.c)
+    add_executable(tjunittest tjunittest.c tjutil.c md5/md5.c md5/md5hl.c)
     target_link_libraries(tjunittest turbojpeg)
 
-    add_executable(tjbench tjbench.c bmp.c tjutil.c rdbmp.c rdppm.c wrbmp.c
-      wrppm.c)
-    target_link_libraries(tjbench turbojpeg jpeg-static)
-    set_property(TARGET tjbench PROPERTY COMPILE_FLAGS
-      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
+    add_executable(tjbench tjbench.c tjutil.c)
+    target_link_libraries(tjbench turbojpeg)
+    if(UNIX)
+      target_link_libraries(tjbench m)
+    endif()
+
+    add_executable(tjexample tjexample.c)
+    target_link_libraries(tjexample turbojpeg)
   endif()
 
   if(ENABLE_STATIC)
-    add_library(turbojpeg-static STATIC ${JPEG_SOURCES} ${SIMD_OBJS}
-      turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c)
+    add_library(turbojpeg-static STATIC ${JPEG_SOURCES} $<TARGET_OBJECTS:simd>
+      ${SIMD_OBJS} turbojpeg.c transupp.c jdatadst-tj.c jdatasrc-tj.c rdbmp.c
+      rdppm.c wrbmp.c wrppm.c)
+    set_property(TARGET turbojpeg-static PROPERTY COMPILE_FLAGS
+      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
     if(NOT MSVC)
       set_target_properties(turbojpeg-static PROPERTIES OUTPUT_NAME turbojpeg)
     endif()
-    if(WITH_SIMD)
-      add_dependencies(turbojpeg-static simd)
-    endif()
 
-    add_executable(tjunittest-static tjunittest.c tjutil.c)
+    add_executable(tjunittest-static tjunittest.c tjutil.c md5/md5.c
+      md5/md5hl.c)
     target_link_libraries(tjunittest-static turbojpeg-static)
 
-    add_executable(tjbench-static tjbench.c bmp.c tjutil.c rdbmp.c rdppm.c
-      wrbmp.c wrppm.c)
-    target_link_libraries(tjbench-static turbojpeg-static jpeg-static)
-    set_property(TARGET tjbench-static PROPERTY COMPILE_FLAGS
-      "-DBMP_SUPPORTED -DPPM_SUPPORTED")
+    add_executable(tjbench-static tjbench.c tjutil.c)
+    target_link_libraries(tjbench-static turbojpeg-static)
+    if(UNIX)
+      target_link_libraries(tjbench-static m)
+    endif()
   endif()
 endif()
 
+if(WIN32)
+  set(USE_SETMODE "-DUSE_SETMODE")
+endif()
 if(WITH_12BIT)
-  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED -DUSE_SETMODE")
+  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED ${USE_SETMODE}")
 else()
-  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED -DUSE_SETMODE")
+  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED ${USE_SETMODE}")
   set(CJPEG_BMP_SOURCES rdbmp.c rdtarga.c)
   set(DJPEG_BMP_SOURCES wrbmp.c wrtarga.c)
 endif()
@@ -331,7 +627,7 @@
 
   add_executable(jpegtran-static jpegtran.c cdjpeg.c rdswitch.c transupp.c)
   target_link_libraries(jpegtran-static jpeg-static)
-  set_property(TARGET jpegtran-static PROPERTY COMPILE_FLAGS "-DUSE_SETMODE")
+  set_property(TARGET jpegtran-static PROPERTY COMPILE_FLAGS "${USE_SETMODE}")
 endif()
 
 add_executable(rdjpgcom rdjpgcom.c)
@@ -339,9 +635,9 @@
 add_executable(wrjpgcom wrjpgcom.c)
 
 
-#
-# Tests
-#
+###############################################################################
+# TESTS
+###############################################################################
 
 add_subdirectory(md5)
 
@@ -355,7 +651,8 @@
 
 if(WITH_12BIT)
   set(TESTORIG testorig12.jpg)
-  set(MD5_JPEG_RGB_ISLOW 9620f424569594bb9242b48498ad801f)
+  set(MD5_JPEG_RGB_ISLOW 9d7369207c520d37f2c1cbfcb82b2964)
+  set(MD5_JPEG_RGB_ISLOW2 a00bd20d8ae49684640ef7177d2e0b64)
   set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
   set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
   set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
@@ -367,20 +664,20 @@
   set(MD5_PPM_GRAY_ISLOW 7213c10af507ad467da5578ca5ee1fca)
   set(MD5_PPM_GRAY_ISLOW_RGB e96ee81c30a6ed422d466338bd3de65d)
   set(MD5_JPEG_420S_IFAST_OPT 7af8e60be4d9c227ec63ac9b6630855e)
-  if(64BIT)
-    # Windows/x64 uses SSE for floating point
-    set(MD5_JPEG_3x2_FLOAT_PROG a8c17daf77b457725ec929e215b603f8)
-    set(MD5_PPM_3x2_FLOAT 42876ab9e5c2f76a87d08db5fbd57956)
-  else()
-    # Windows/x86 uses the 387 FPU for floating point
-    if(MSVC)
-      set(MD5_JPEG_3x2_FLOAT_PROG e27840755870fa849872e58aa0cd1400)
-      set(MD5_PPM_3x2_FLOAT 6c2880b83bb1aa41dfe330e7a9768690)
-    else()
-      set(MD5_JPEG_3x2_FLOAT_PROG bc6dbbefac2872f6b9d6c4a0ae60c3c0)
-      set(MD5_PPM_3x2_FLOAT f58119ee294198ac9b4a9f5645a34266)
-    endif()
-  endif()
+
+  set(MD5_JPEG_3x2_FLOAT_PROG_SSE a8c17daf77b457725ec929e215b603f8)
+  set(MD5_PPM_3x2_FLOAT_SSE 42876ab9e5c2f76a87d08db5fbd57956)
+  set(MD5_JPEG_3x2_FLOAT_PROG_32BIT a8c17daf77b457725ec929e215b603f8)
+  set(MD5_PPM_3x2_FLOAT_32BIT ${MD5_PPM_3x2_FLOAT_SSE})
+  set(MD5_JPEG_3x2_FLOAT_PROG_64BIT ${MD5_JPEG_3x2_FLOAT_PROG_32BIT})
+  set(MD5_PPM_3x2_FLOAT_64BIT ${MD5_PPM_3x2_FLOAT_SSE})
+  set(MD5_JPEG_3x2_FLOAT_PROG_387 bc6dbbefac2872f6b9d6c4a0ae60c3c0)
+  set(MD5_PPM_3x2_FLOAT_387 bcc5723c61560463ac60f772e742d092)
+  set(MD5_JPEG_3x2_FLOAT_PROG_MSVC e27840755870fa849872e58aa0cd1400)
+  set(MD5_PPM_3x2_FLOAT_MSVC 6c2880b83bb1aa41dfe330e7a9768690)
+
+  set(MD5_JPEG_3x2_IFAST_PROG 1396cc2b7185cfe943d408c9d305339e)
+  set(MD5_PPM_3x2_IFAST 3975985ef6eeb0a2cdc58daa651ccc00)
   set(MD5_PPM_420M_ISLOW_2_1 4ca6be2a6f326ff9eaab63e70a8259c0)
   set(MD5_PPM_420M_ISLOW_15_8 12aa9f9534c1b3d7ba047322226365eb)
   set(MD5_PPM_420M_ISLOW_13_8 f7e22817c7b25e1393e4ec101e9d4e96)
@@ -400,7 +697,8 @@
   set(MD5_JPEG_CROP cdb35ff4b4519392690ea040c56ea99c)
 else()
   set(TESTORIG testorig.jpg)
-  set(MD5_JPEG_RGB_ISLOW 768e970dd57b340ff1b83c9d3d47c77b)
+  set(MD5_JPEG_RGB_ISLOW 1d44a406f61da743b5fd31c0a9abdca3)
+  set(MD5_JPEG_RGB_ISLOW2 31d121e57b6c2934c890a7fc7763bcd4)
   set(MD5_PPM_RGB_ISLOW 00a257f5393fef8821f2b88ac7421291)
   set(MD5_BMP_RGB_ISLOW_565 f07d2e75073e4bb10f6c6f4d36e2e3be)
   set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
@@ -418,23 +716,20 @@
   set(MD5_BMP_GRAY_ISLOW_565 12f78118e56a2f48b966f792fedf23cc)
   set(MD5_BMP_GRAY_ISLOW_565D bdbbd616441a24354c98553df5dc82db)
   set(MD5_JPEG_420S_IFAST_OPT 388708217ac46273ca33086b22827ed8)
-  if(WITH_SIMD)
-    set(MD5_JPEG_3x2_FLOAT_PROG 343e3f8caf8af5986ebaf0bdc13b5c71)
-    set(MD5_PPM_3x2_FLOAT 1a75f36e5904d6fc3a85a43da9ad89bb)
-  else()
-    if(64BIT)
-      set(MD5_JPEG_3x2_FLOAT_PROG 9bca803d2042bd1eb03819e2bf92b3e5)
-      set(MD5_PPM_3x2_FLOAT f6bfab038438ed8f5522fbd33595dcdc)
-    else()
-      if(MSVC)
-        set(MD5_JPEG_3x2_FLOAT_PROG 7999ce9cd0ee9b6c7043b7351ab7639d)
-        set(MD5_PPM_3x2_FLOAT 28cdc448a6b75e97892f0e0f8d4b21f3)
-      else()
-        set(MD5_JPEG_3x2_FLOAT_PROG 1657664a410e0822c924b54f6f65e6e9)
-        set(MD5_PPM_3x2_FLOAT cb0a1f027f3d2917c902b5640214e025)
-      endif()
-    endif()
-  endif()
+
+  set(MD5_JPEG_3x2_FLOAT_PROG_SSE 343e3f8caf8af5986ebaf0bdc13b5c71)
+  set(MD5_PPM_3x2_FLOAT_SSE 1a75f36e5904d6fc3a85a43da9ad89bb)
+  set(MD5_JPEG_3x2_FLOAT_PROG_32BIT 9bca803d2042bd1eb03819e2bf92b3e5)
+  set(MD5_PPM_3x2_FLOAT_32BIT f6bfab038438ed8f5522fbd33595dcdc)
+  set(MD5_JPEG_3x2_FLOAT_PROG_64BIT ${MD5_JPEG_3x2_FLOAT_PROG_32BIT})
+  set(MD5_PPM_3x2_FLOAT_64BIT 0e917a34193ef976b679a6b069b1be26)
+  set(MD5_JPEG_3x2_FLOAT_PROG_387 1657664a410e0822c924b54f6f65e6e9)
+  set(MD5_PPM_3x2_FLOAT_387 cb0a1f027f3d2917c902b5640214e025)
+  set(MD5_JPEG_3x2_FLOAT_PROG_MSVC 7999ce9cd0ee9b6c7043b7351ab7639d)
+  set(MD5_PPM_3x2_FLOAT_MSVC 28cdc448a6b75e97892f0e0f8d4b21f3)
+
+  set(MD5_JPEG_3x2_IFAST_PROG 1ee5d2c1a77f2da495f993c8c7cceca5)
+  set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
   set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
   set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
   set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
@@ -468,27 +763,27 @@
 
 if(WITH_JAVA)
   add_test(TJUnitTest
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest)
   add_test(TJUnitTest-yuv
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest -yuv)
   add_test(TJUnitTest-yuv-nopad
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest -yuv -noyuvpad)
   add_test(TJUnitTest-bi
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest -bi)
   add_test(TJUnitTest-bi-yuv
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest -bi -yuv)
   add_test(TJUnitTest-bi-yuv-nopad
-    ${JAVA_RUNTIME} -cp java/${OBJDIR}turbojpeg.jar
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
       -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
       TJUnitTest -bi -yuv -noyuvpad)
 endif()
@@ -501,214 +796,325 @@
   set(TEST_LIBTYPES ${TEST_LIBTYPES} static)
 endif()
 
-set(TESTIMAGES ${CMAKE_SOURCE_DIR}/testimages)
+set(TESTIMAGES ${CMAKE_CURRENT_SOURCE_DIR}/testimages)
 set(MD5CMP ${CMAKE_CURRENT_BINARY_DIR}/md5/md5cmp)
 if(CMAKE_CROSSCOMPILING)
   file(RELATIVE_PATH TESTIMAGES ${CMAKE_CURRENT_BINARY_DIR} ${TESTIMAGES})
   file(RELATIVE_PATH MD5CMP ${CMAKE_CURRENT_BINARY_DIR} ${MD5CMP})
 endif()
 
+# The output of the floating point DCT/IDCT algorithms differs depending on the
+# type of floating point math used, so the FLOATTEST CMake variable must be
+# set in order to tell the testing system which floating point results it
+# should expect:
+#
+# sse = validate against the expected results from the libjpeg-turbo SSE SIMD
+#       extensions
+# 32bit = validate against the expected results from the C code when running on
+#         a 32-bit FPU (or when SSE is being used for floating point math,
+#         which is generally the default with x86-64 compilers)
+# 64bit = validate against the expected results from the C code when running
+#         on a 64-bit FPU
+# 387 = validate against the expected results from the C code when the 387 FPU
+#       is being used for floating point math (which is generally the default
+#       with x86 compilers)
+# msvc = validate against the expected results from the C code when compiled
+#        with a 32-bit version of Visual C++
+
+if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
+  if(WITH_SIMD)
+    set(DEFAULT_FLOATTEST sse)
+  elseif(CPU_TYPE STREQUAL "x86_64")
+    set(DEFAULT_FLOATTEST 32bit)
+  elseif(CPU_TYPE STREQUAL "i386" AND MSVC)
+    set(DEFAULT_FLOATTEST msvc)
+  endif()
+else()
+  if(BITS EQUAL 64)
+    set(DEFAULT_FLOATTEST 64bit)
+  elseif(BITS EQUAL 32)
+    set(DEFAULT_FLOATTEST 32bit)
+  endif()
+endif()
+
+# This causes FLOATTEST to reset to the default value if WITH_SIMD has
+# changed.
+if(DEFINED WITH_SIMD_INT AND NOT WITH_SIMD EQUAL WITH_SIMD_INT)
+  set(FORCE_FLOATTEST "FORCE")
+endif()
+set(WITH_SIMD_INT ${WITH_SIMD} CACHE INTERNAL "")
+set(FLOATTEST ${DEFAULT_FLOATTEST} CACHE STRING
+  "The type of floating point math used by the floating point DCT/IDCT algorithms.  This tells the testing system which numerical results it should expect from those tests.  [sse = libjpeg-turbo x86/x86-64 SIMD extensions, 32bit = generic 32-bit FPU or SSE, 64bit = generic 64-bit FPU, 387 = 387 FPU, msvc = 32-bit Visual Studio] (default = ${DEFAULT_FLOATTEST})"
+  ${FORCE_FLOATTEST})
+message(STATUS "FLOATTEST = ${FLOATTEST}")
+
+if(FLOATTEST)
+  string(TOUPPER ${FLOATTEST} FLOATTEST_UC)
+  string(TOLOWER ${FLOATTEST} FLOATTEST)
+  if(NOT FLOATTEST STREQUAL "sse" AND NOT FLOATTEST STREQUAL "32bit" AND
+    NOT FLOATTEST STREQUAL "64bit" AND NOT FLOATTEST STREQUAL "387" AND
+    NOT FLOATTEST STREQUAL "msvc")
+    message(FATAL_ERROR "\"${FLOATTEST}\" is not a valid value for FLOATTEST.")
+  endif()
+endif()
+
 foreach(libtype ${TEST_LIBTYPES})
-  if(libtype STREQUAL "shared")
-    set(dir sharedlib/)
-  else()
-    set(dir "")
+  if(libtype STREQUAL "static")
     set(suffix -static)
   endif()
   if(WITH_TURBOJPEG)
-    add_test(tjunittest${suffix} tjunittest${suffix})
-    add_test(tjunittest${suffix}-alloc tjunittest${suffix} -alloc)
-    add_test(tjunittest${suffix}-yuv tjunittest${suffix} -yuv)
-    add_test(tjunittest${suffix}-yuv-alloc tjunittest${suffix} -yuv -alloc)
-    add_test(tjunittest${suffix}-yuv-nopad tjunittest${suffix} -yuv -noyuvpad)
+    add_test(tjunittest-${libtype} tjunittest${suffix})
+    add_test(tjunittest-${libtype}-alloc tjunittest${suffix} -alloc)
+    add_test(tjunittest-${libtype}-yuv tjunittest${suffix} -yuv)
+    add_test(tjunittest-${libtype}-yuv-alloc tjunittest${suffix} -yuv -alloc)
+    add_test(tjunittest-${libtype}-yuv-nopad tjunittest${suffix} -yuv -noyuvpad)
+    add_test(tjunittest-${libtype}-bmp tjunittest${suffix} -bmp)
+
+    set(MD5_PPM_GRAY_TILE 89d3ca21213d9d864b50b4e4e7de4ca6)
+    set(MD5_PPM_420_8x8_TILE 847fceab15c5b7b911cb986cf0f71de3)
+    set(MD5_PPM_420_16x16_TILE ca45552a93687e078f7137cc4126a7b0)
+    set(MD5_PPM_420_32x32_TILE d8676f1d6b68df358353bba9844f4a00)
+    set(MD5_PPM_420_64x64_TILE 4e4c1a3d7ea4bace4f868bcbe83b7050)
+    set(MD5_PPM_420_128x128_TILE f24c3429c52265832beab9df72a0ceae)
+    set(MD5_PPM_420M_8x8_TILE bc25320e1f4c31ce2e610e43e9fd173c)
+    set(MD5_PPM_420M_TILE 75ffdf14602258c5c189522af57fa605)
+    set(MD5_PPM_422_8x8_TILE d83dacd9fc73b0a6f10c09acad64eb1e)
+    set(MD5_PPM_422_16x16_TILE 35077fb610d72dd743b1eb0cbcfe10fb)
+    set(MD5_PPM_422_32x32_TILE e6902ed8a449ecc0f0d6f2bf945f65f7)
+    set(MD5_PPM_422_64x64_TILE 2b4502a8f316cedbde1da7bce3d2231e)
+    set(MD5_PPM_422_128x128_TILE f0b5617d578f5e13c8eee215d64d4877)
+    set(MD5_PPM_422M_8x8_TILE 828941d7f41cd6283abd6beffb7fd51d)
+    set(MD5_PPM_422M_TILE e877ae1324c4a280b95376f7f018172f)
+    set(MD5_PPM_444_TILE 7964e41e67cfb8d0a587c0aa4798f9c3)
+
+    # Test compressing from/decompressing to an arbitrary subregion of a larger
+    # image buffer
+    add_test(tjbench-${libtype}-tile-cp
+      ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
+        testout_tile.ppm)
+    add_test(tjbench-${libtype}-tile
+      tjbench${suffix} testout_tile.ppm 95 -rgb -quiet -tile -benchtime 0.01
+        -warmup 0)
+    set_tests_properties(tjbench-${libtype}-tile
+      PROPERTIES DEPENDS tjbench-${libtype}-tile-cp)
+
+    foreach(tile 8 16 32 64 128)
+      add_test(tjbench-${libtype}-tile-gray-${tile}x${tile}-cmp
+        ${MD5CMP} ${MD5_PPM_GRAY_TILE}
+          testout_tile_GRAY_Q95_${tile}x${tile}.ppm)
+      foreach(subsamp 420 422)
+        add_test(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
+          ${MD5CMP} ${MD5_PPM_${subsamp}_${tile}x${tile}_TILE}
+            testout_tile_${subsamp}_Q95_${tile}x${tile}.ppm)
+      endforeach()
+      add_test(tjbench-${libtype}-tile-444-${tile}x${tile}-cmp
+        ${MD5CMP} ${MD5_PPM_444_TILE}
+          testout_tile_444_Q95_${tile}x${tile}.ppm)
+      foreach(subsamp gray 420 422 444)
+        set_tests_properties(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
+          PROPERTIES DEPENDS tjbench-${libtype}-tile)
+      endforeach()
+    endforeach()
+
+    add_test(tjbench-${libtype}-tilem-cp
+      ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
+        testout_tilem.ppm)
+    add_test(tjbench-${libtype}-tilem
+      tjbench${suffix} testout_tilem.ppm 95 -rgb -fastupsample -quiet -tile
+        -benchtime 0.01 -warmup 0)
+    set_tests_properties(tjbench-${libtype}-tilem
+      PROPERTIES DEPENDS tjbench-${libtype}-tilem-cp)
+
+    add_test(tjbench-${libtype}-tile-420m-8x8-cmp
+      ${MD5CMP} ${MD5_PPM_420M_8x8_TILE} testout_tilem_420_Q95_8x8.ppm)
+    add_test(tjbench-${libtype}-tile-422m-8x8-cmp
+      ${MD5CMP} ${MD5_PPM_422M_8x8_TILE} testout_tilem_422_Q95_8x8.ppm)
+    foreach(tile 16 32 64 128)
+      foreach(subsamp 420 422)
+        add_test(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
+          ${MD5CMP} ${MD5_PPM_${subsamp}M_TILE}
+            testout_tilem_${subsamp}_Q95_${tile}x${tile}.ppm)
+      endforeach()
+    endforeach()
+    foreach(tile 8 16 32 64 128)
+      foreach(subsamp 420 422)
+        set_tests_properties(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
+          PROPERTIES DEPENDS tjbench-${libtype}-tilem)
+      endforeach()
+    endforeach()
   endif()
 
-  # These tests are carefully chosen to provide full coverage of as many of the
-  # underlying algorithms as possible (including all of the SIMD-accelerated
-  # ones.)
+  # These tests are carefully crafted to provide full coverage of as many of
+  # the underlying algorithms as possible (including all of the
+  # SIMD-accelerated ones.)
+
+  macro(add_bittest PROG NAME ARGS OUTFILE INFILE MD5SUM)  # any further arguments name tests the run depends on
+    add_test(${PROG}-${libtype}-${NAME}  # run step: invokes PROG (with the libtype suffix) to produce OUTFILE
+      ${PROG}${suffix} ${ARGS} -outfile ${OUTFILE} ${INFILE})
+    add_test(${PROG}-${libtype}-${NAME}-cmp  # compare step: passes MD5SUM and OUTFILE to md5cmp
+      ${MD5CMP} ${MD5SUM} ${OUTFILE})
+    set_tests_properties(${PROG}-${libtype}-${NAME}-cmp PROPERTIES
+      DEPENDS ${PROG}-${libtype}-${NAME})  # the compare step must follow the run step
+    if(${ARGC} GREATER 6)  # more than the six named parameters were passed
+      set(DEPENDS ${ARGN})  # ARGN holds only the extra arguments
+      set_tests_properties(${PROG}-${libtype}-${NAME} PROPERTIES
+        DEPENDS ${DEPENDS})
+    endif()
+  endmacro()
 
   # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
-  add_test(cjpeg${suffix}-rgb-islow
-    ${dir}cjpeg${suffix} -rgb -dct int
-      -outfile testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-rgb-islow-cmp
-    ${MD5CMP} ${MD5_JPEG_RGB_ISLOW} testout_rgb_islow.jpg)
+  add_bittest(cjpeg rgb-islow "-rgb;-dct;int;-icc;${TESTIMAGES}/test1.icc"
+    testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_RGB_ISLOW})
 
   # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
-  add_test(djpeg${suffix}-rgb-islow
-    ${dir}djpeg${suffix} -dct int -ppm
-      -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg)
-  add_test(djpeg${suffix}-rgb-islow-cmp
-    ${MD5CMP} ${MD5_PPM_RGB_ISLOW} testout_rgb_islow.ppm)
+  add_bittest(djpeg rgb-islow "-dct;int;-ppm;-icc;testout_rgb_islow.icc"
+    testout_rgb_islow.ppm testout_rgb_islow.jpg
+    ${MD5_PPM_RGB_ISLOW} cjpeg-${libtype}-rgb-islow)
+
+  add_test(djpeg-${libtype}-rgb-islow-icc-cmp  # checksum the ICC profile extracted by the djpeg rgb-islow run
+    ${MD5CMP} b06a39d730129122e85c1363ed1bbc9e testout_rgb_islow.icc)
+  set_tests_properties(djpeg-${libtype}-rgb-islow-icc-cmp PROPERTIES DEPENDS djpeg-${libtype}-rgb-islow)  # that run writes the .icc file
+  add_bittest(jpegtran icc "-copy;all;-icc;${TESTIMAGES}/test2.icc" testout_rgb_islow2.jpg
+    testout_rgb_islow.jpg ${MD5_JPEG_RGB_ISLOW2} cjpeg-${libtype}-rgb-islow)  # input JPEG is produced by that cjpeg run
 
   if(NOT WITH_12BIT)
     # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-rgb-islow-565
-      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
-        -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg)
-    add_test(djpeg${suffix}-rgb-islow-565-cmp
-      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565} testout_rgb_islow_565.bmp)
+    add_bittest(djpeg rgb-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_rgb_islow_565.bmp testout_rgb_islow.jpg
+      ${MD5_BMP_RGB_ISLOW_565} cjpeg-${libtype}-rgb-islow)
 
     # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-rgb-islow-565D
-      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
-        -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg)
-    add_test(djpeg${suffix}-rgb-islow-565D-cmp
-      ${MD5CMP} ${MD5_BMP_RGB_ISLOW_565D} testout_rgb_islow_565D.bmp)
+    add_bittest(djpeg rgb-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
+      ${MD5_BMP_RGB_ISLOW_565D} cjpeg-${libtype}-rgb-islow)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
-  add_test(cjpeg${suffix}-422-ifast-opt
-    ${dir}cjpeg${suffix} -sample 2x1 -dct fast -opt
-      -outfile testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-422-ifast-opt-cmp
-    ${MD5CMP} ${MD5_JPEG_422_IFAST_OPT} testout_422_ifast_opt.jpg)
+  add_bittest(cjpeg 422-ifast-opt "-sample;2x1;-dct;fast;-opt"
+    testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_422_IFAST_OPT})
 
   # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
-  add_test(djpeg${suffix}-422-ifast
-    ${dir}djpeg${suffix} -dct fast
-      -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg)
-  add_test(djpeg${suffix}-422-ifast-cmp
-    ${MD5CMP} ${MD5_PPM_422_IFAST} testout_422_ifast.ppm)
+  add_bittest(djpeg 422-ifast "-dct;fast"
+    testout_422_ifast.ppm testout_422_ifast_opt.jpg
+    ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
 
   # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-  add_test(djpeg${suffix}-422m-ifast
-    ${dir}djpeg${suffix} -dct fast -nosmooth
-      -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg)
-  add_test(djpeg${suffix}-422m-ifast-cmp
-    ${MD5CMP} ${MD5_PPM_422M_IFAST} testout_422m_ifast.ppm)
+  add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
+    testout_422m_ifast.ppm testout_422_ifast_opt.jpg
+    ${MD5_PPM_422M_IFAST} cjpeg-${libtype}-422-ifast-opt)
 
   if(NOT WITH_12BIT)
     # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-    add_test(djpeg${suffix}-422m-ifast-565
-      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
-        -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg)
-    add_test(djpeg${suffix}-422m-ifast-565-cmp
-      ${MD5CMP} ${MD5_BMP_422M_IFAST_565} testout_422m_ifast_565.bmp)
+    add_bittest(djpeg 422m-ifast-565
+      "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
+      testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
+      ${MD5_BMP_422M_IFAST_565} cjpeg-${libtype}-422-ifast-opt)
 
     # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-    add_test(djpeg${suffix}-422m-ifast-565D
-      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
-        -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg)
-    add_test(djpeg${suffix}-422m-ifast-565D-cmp
-      ${MD5CMP} ${MD5_BMP_422M_IFAST_565D} testout_422m_ifast_565D.bmp)
+    add_bittest(djpeg 422m-ifast-565D "-dct;int;-nosmooth;-rgb565;-bmp"
+      testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
+      ${MD5_BMP_422M_IFAST_565D} cjpeg-${libtype}-422-ifast-opt)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
-  add_test(cjpeg${suffix}-420-q100-ifast-prog
-    ${dir}cjpeg${suffix} -sample 2x2 -quality 100 -dct fast -prog
-      -outfile testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-420-q100-ifast-prog-cmp
-    ${MD5CMP} ${MD5_JPEG_420_IFAST_Q100_PROG} testout_420_q100_ifast_prog.jpg)
+  add_bittest(cjpeg 420-q100-ifast-prog
+    "-sample;2x2;-quality;100;-dct;fast;-prog"
+    testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_420_IFAST_Q100_PROG})
 
   # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
-  add_test(djpeg${suffix}-420-q100-ifast-prog
-    ${dir}djpeg${suffix} -dct fast
-      -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
-  add_test(djpeg${suffix}-420-q100-ifast-prog-cmp
-    ${MD5CMP} ${MD5_PPM_420_Q100_IFAST} testout_420_q100_ifast.ppm)
+  add_bittest(djpeg 420-q100-ifast-prog "-dct;fast"
+    testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+    ${MD5_PPM_420_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
 
   # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
-  add_test(djpeg${suffix}-420m-q100-ifast-prog
-    ${dir}djpeg${suffix} -dct fast -nosmooth
-      -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg)
-  add_test(djpeg${suffix}-420m-q100-ifast-prog-cmp
-    ${MD5CMP} ${MD5_PPM_420M_Q100_IFAST} testout_420m_q100_ifast.ppm)
+  add_bittest(djpeg 420m-q100-ifast-prog "-dct;fast;-nosmooth"
+    testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+    ${MD5_PPM_420M_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
 
   # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
-  add_test(cjpeg${suffix}-gray-islow
-    ${dir}cjpeg${suffix} -gray -dct int
-      -outfile testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-gray-islow-cmp
-    ${MD5CMP} ${MD5_JPEG_GRAY_ISLOW} testout_gray_islow.jpg)
+  add_bittest(cjpeg gray-islow "-gray;-dct;int"
+    testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_GRAY_ISLOW})
 
   # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
-  add_test(djpeg${suffix}-gray-islow
-    ${dir}djpeg${suffix} -dct int
-      -outfile testout_gray_islow.ppm testout_gray_islow.jpg)
-  add_test(djpeg${suffix}-gray-islow-cmp
-    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW} testout_gray_islow.ppm)
+  add_bittest(djpeg gray-islow "-dct;int"
+    testout_gray_islow.ppm testout_gray_islow.jpg
+    ${MD5_PPM_GRAY_ISLOW} cjpeg-${libtype}-gray-islow)
 
   # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
-  add_test(djpeg${suffix}-gray-islow-rgb
-    ${dir}djpeg${suffix} -dct int -rgb
-      -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg)
-  add_test(djpeg${suffix}-gray-islow-rgb-cmp
-    ${MD5CMP} ${MD5_PPM_GRAY_ISLOW_RGB} testout_gray_islow_rgb.ppm)
+  add_bittest(djpeg gray-islow-rgb "-dct;int;-rgb"
+    testout_gray_islow_rgb.ppm testout_gray_islow.jpg
+    ${MD5_PPM_GRAY_ISLOW_RGB} cjpeg-${libtype}-gray-islow)
 
   if(NOT WITH_12BIT)
     # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-gray-islow-565
-      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
-        -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg)
-    add_test(djpeg${suffix}-gray-islow-565-cmp
-      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565} testout_gray_islow_565.bmp)
+    add_bittest(djpeg gray-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_gray_islow_565.bmp testout_gray_islow.jpg
+      ${MD5_BMP_GRAY_ISLOW_565} cjpeg-${libtype}-gray-islow)
 
     # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-gray-islow-565D
-      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
-        -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg)
-    add_test(djpeg${suffix}-gray-islow-565D-cmp
-      ${MD5CMP} ${MD5_BMP_GRAY_ISLOW_565D} testout_gray_islow_565D.bmp)
+    add_bittest(djpeg gray-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_gray_islow_565D.bmp testout_gray_islow.jpg
+      ${MD5_BMP_GRAY_ISLOW_565D} cjpeg-${libtype}-gray-islow)
   endif()
 
   # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
   # ENT: 2-pass huff
-  add_test(cjpeg${suffix}-420s-ifast-opt
-    ${dir}cjpeg${suffix} -sample 2x2 -smooth 1 -dct int -opt
-      -outfile testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-420s-ifast-opt-cmp
-    ${MD5CMP} ${MD5_JPEG_420S_IFAST_OPT} testout_420s_ifast_opt.jpg)
+  add_bittest(cjpeg 420s-ifast-opt "-sample;2x2;-smooth;1;-dct;int;-opt"
+    testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_420S_IFAST_OPT})
 
-  # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
-  add_test(cjpeg${suffix}-3x2-float-prog
-    ${dir}cjpeg${suffix} -sample 3x2 -dct float -prog
-      -outfile testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(cjpeg${suffix}-3x2-float-prog-cmp
-    ${MD5CMP} ${MD5_JPEG_3x2_FLOAT_PROG} testout_3x2_float_prog.jpg)
+  if(FLOATTEST)
+    # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
+    add_bittest(cjpeg 3x2-float-prog "-sample;3x2;-dct;float;-prog"
+      testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_3x2_FLOAT_PROG_${FLOATTEST_UC}})
 
-  # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
-  add_test(djpeg${suffix}-3x2-float-prog
-    ${dir}djpeg${suffix} -dct float
-      -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg)
-  add_test(djpeg${suffix}-3x2-float-prog-cmp
-    ${MD5CMP} ${MD5_PPM_3x2_FLOAT} testout_3x2_float.ppm)
+    # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
+    add_bittest(djpeg 3x2-float-prog "-dct;float"
+      testout_3x2_float.ppm testout_3x2_float_prog.jpg
+      ${MD5_PPM_3x2_FLOAT_${FLOATTEST_UC}} cjpeg-${libtype}-3x2-float-prog)
+  endif()
+
+  # CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
+  add_bittest(cjpeg 3x2-ifast-prog "-sample;3x2;-dct;fast;-prog"
+    testout_3x2_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_3x2_IFAST_PROG})
+
+  # CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
+  add_bittest(djpeg 3x2-ifast-prog "-dct;fast"
+    testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
+    ${MD5_PPM_3x2_IFAST} cjpeg-${libtype}-3x2-ifast-prog)
 
   if(WITH_ARITH_ENC)
     # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
-    add_test(cjpeg${suffix}-420-islow-ari
-      ${dir}cjpeg${suffix} -dct int -arithmetic
-        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
-    add_test(cjpeg${suffix}-420-islow-ari-cmp
-      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+    add_bittest(cjpeg 420-islow-ari "-dct;int;-arithmetic"
+      testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_420_ISLOW_ARI})
 
-    add_test(jpegtran${suffix}-420-islow-ari
-      ${dir}jpegtran${suffix} -arithmetic
-        -outfile testout_420_islow_ari.jpg ${TESTIMAGES}/testimgint.jpg)
-    add_test(jpegtran${suffix}-420-islow-ari-cmp
-      ${MD5CMP} ${MD5_JPEG_420_ISLOW_ARI} testout_420_islow_ari.jpg)
+    add_bittest(jpegtran 420-islow-ari "-arithmetic"
+      testout_420_islow_ari2.jpg ${TESTIMAGES}/testimgint.jpg
+      ${MD5_JPEG_420_ISLOW_ARI})
 
     # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
-    add_test(cjpeg${suffix}-444-islow-progari
-      ${dir}cjpeg${suffix} -sample 1x1 -dct int -prog -arithmetic
-        -outfile testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm)
-    add_test(cjpeg${suffix}-444-islow-progari-cmp
-      ${MD5CMP} ${MD5_JPEG_444_ISLOW_PROGARI} testout_444_islow_progari.jpg)
+    add_bittest(cjpeg 444-islow-progari
+      "-sample;1x1;-dct;int;-prog;-arithmetic"
+      testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_444_ISLOW_PROGARI})
   endif()
 
   if(WITH_ARITH_DEC)
     # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
-    add_test(djpeg${suffix}-420m-ifast-ari
-      ${dir}djpeg${suffix} -fast -ppm
-        -outfile testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg)
-    add_test(djpeg${suffix}-420m-ifast-ari-cmp
-      ${MD5CMP} ${MD5_PPM_420M_IFAST_ARI} testout_420m_ifast_ari.ppm)
+    add_bittest(djpeg 420m-ifast-ari "-fast;-ppm"
+      testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420M_IFAST_ARI})
 
-    add_test(jpegtran${suffix}-420-islow
-      ${dir}jpegtran${suffix}
-        -outfile testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg)
-    add_test(jpegtran${suffix}-420-islow-cmp
-      ${MD5CMP} ${MD5_JPEG_420_ISLOW} testout_420_islow.jpg)
+    add_bittest(jpegtran 420-islow ""
+      testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
+      ${MD5_JPEG_420_ISLOW})
   endif()
 
   # 2/1--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
@@ -732,231 +1138,233 @@
   #         ENT: huff
   foreach(scale 2_1 15_8 13_8 11_8 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8)
     string(REGEX REPLACE "_" "/" scalearg ${scale})
-    add_test(djpeg${suffix}-420m-islow-${scale}
-      ${dir}djpeg${suffix} -dct int -scale ${scalearg} -nosmooth -ppm
-        -outfile testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420m-islow-${scale}-cmp
-      ${MD5CMP} ${MD5_PPM_420M_ISLOW_${scale}} testout_420m_islow_${scale}.ppm)
+    add_bittest(djpeg 420m-islow-${scale}
+      "-dct;int;-scale;${scalearg};-nosmooth;-ppm"
+      testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG}
+      ${MD5_PPM_420M_ISLOW_${scale}})
   endforeach()
 
   if(NOT WITH_12BIT)
     # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-420-islow-256
-      ${dir}djpeg${suffix} -dct int -colors 256 -bmp
-        -outfile testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420-islow-256-cmp
-      ${MD5CMP} ${MD5_BMP_420_ISLOW_256} testout_420_islow_256.bmp)
+    add_bittest(djpeg 420-islow-256 "-dct;int;-colors;256;-bmp"
+      testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_256})
 
     # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-420-islow-565
-      ${dir}djpeg${suffix} -dct int -rgb565 -dither none -bmp
-        -outfile testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420-islow-565-cmp
-      ${MD5CMP} ${MD5_BMP_420_ISLOW_565} testout_420_islow_565.bmp)
+    add_bittest(djpeg 420-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_565})
 
     # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-420-islow-565D
-      ${dir}djpeg${suffix} -dct int -rgb565 -bmp
-        -outfile testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420-islow-565D-cmp
-      ${MD5CMP} ${MD5_BMP_420_ISLOW_565D} testout_420_islow_565D.bmp)
+    add_bittest(djpeg 420-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_565D})
 
     # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-420m-islow-565
-      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -dither none -bmp
-        -outfile testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420m-islow-565-cmp
-      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565} testout_420m_islow_565.bmp)
+    add_bittest(djpeg 420m-islow-565
+      "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
+      testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420M_ISLOW_565})
 
     # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-    add_test(djpeg${suffix}-420m-islow-565D
-      ${dir}djpeg${suffix} -dct int -nosmooth -rgb565 -bmp
-        -outfile testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG})
-    add_test(djpeg${suffix}-420m-islow-565D-cmp
-      ${MD5CMP} ${MD5_BMP_420M_ISLOW_565D} testout_420m_islow_565D.bmp)
+    add_bittest(djpeg 420m-islow-565D "-dct;int;-nosmooth;-rgb565;-bmp"
+      testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420M_ISLOW_565D})
   endif()
 
   # Partial decode tests.  These tests are designed to cover all of the
   # possible code paths in jpeg_skip_scanlines().
 
   # Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
-  add_test(djpeg${suffix}-420-islow-skip15_31
-    ${dir}djpeg${suffix} -dct int -skip 15,31 -ppm
-      -outfile testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG})
-  add_test(djpeg${suffix}-420-islow-skip15_31-cmp
-    ${MD5CMP} ${MD5_PPM_420_ISLOW_SKIP15_31} testout_420_islow_skip15,31.ppm)
+  add_bittest(djpeg 420-islow-skip15_31 "-dct;int;-skip;15,31;-ppm"
+    testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG}
+    ${MD5_PPM_420_ISLOW_SKIP15_31})
 
   # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
   if(WITH_ARITH_DEC)
-    add_test(djpeg${suffix}-420-islow-ari-skip16_139
-      ${dir}djpeg${suffix} -dct int -skip 16,139 -ppm
-        -outfile testout_420_islow_ari_skip16,139.ppm
-        ${TESTIMAGES}/testimgari.jpg)
-    add_test(djpeg${suffix}-420-islow-ari_skip16_139-cmp
-      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_SKIP16_139}
-        testout_420_islow_ari_skip16,139.ppm)
+    add_bittest(djpeg 420-islow-ari-skip16_139 "-dct;int;-skip;16,139;-ppm"
+      testout_420_islow_ari_skip16,139.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420_ISLOW_ARI_SKIP16_139})
   endif()
 
   # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
-  add_test(cjpeg${suffix}-420-islow-prog
-    ${dir}cjpeg${suffix} -dct int -prog
+  add_test(cjpeg-${libtype}-420-islow-prog
+    cjpeg${suffix} -dct int -prog
       -outfile testout_420_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71
-    ${dir}djpeg${suffix} -dct int -crop 62x62+71+71 -ppm
-      -outfile testout_420_islow_prog_crop62x62,71,71.ppm
-      testout_420_islow_prog.jpg)
-  add_test(djpeg${suffix}-420-islow-prog-crop62x62_71_71-cmp
-    ${MD5CMP} ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71}
-      testout_420_islow_prog_crop62x62,71,71.ppm)
+  add_bittest(djpeg 420-islow-prog-crop62x62_71_71
+    "-dct;int;-crop;62x62+71+71;-ppm"
+    testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+    ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71} cjpeg-${libtype}-420-islow-prog)
 
   # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
   if(WITH_ARITH_DEC)
-    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4
-      ${dir}djpeg${suffix} -dct int -crop 53x53+4+4 -ppm
-        -outfile testout_420_islow_ari_crop53x53,4,4.ppm
-        ${TESTIMAGES}/testimgari.jpg)
-    add_test(djpeg${suffix}-420-islow-ari-crop53x53_4_4-cmp
-      ${MD5CMP} ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4}
-        testout_420_islow_ari_crop53x53,4,4.ppm)
+    add_bittest(djpeg 420-islow-ari-crop53x53_4_4
+      "-dct;int;-crop;53x53+4+4;-ppm"
+      testout_420_islow_ari_crop53x53,4,4.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4})
   endif()
 
   # Context rows: No   Intra-iMCU row: Yes  ENT: huff
-  add_test(cjpeg${suffix}-444-islow
-    ${dir}cjpeg${suffix} -dct int -sample 1x1
+  add_test(cjpeg-${libtype}-444-islow
+    cjpeg${suffix} -dct int -sample 1x1
       -outfile testout_444_islow.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(djpeg${suffix}-444-islow-skip1_6
-    ${dir}djpeg${suffix} -dct int -skip 1,6 -ppm
-      -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg)
-  add_test(djpeg${suffix}-444-islow-skip1_6-cmp
-    ${MD5CMP} ${MD5_PPM_444_ISLOW_SKIP1_6} testout_444_islow_skip1,6.ppm)
+  add_bittest(djpeg 444-islow-skip1_6 "-dct;int;-skip;1,6;-ppm"
+    testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+    ${MD5_PPM_444_ISLOW_SKIP1_6} cjpeg-${libtype}-444-islow)
 
   # Context rows: No   Intra-iMCU row: No   ENT: prog huff
-  add_test(cjpeg${suffix}-444-islow-prog
-    ${dir}cjpeg${suffix} -dct int -prog -sample 1x1
+  add_test(cjpeg-${libtype}-444-islow-prog
+    cjpeg${suffix} -dct int -prog -sample 1x1
       -outfile testout_444_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
-  add_test(djpeg${suffix}-444-islow-prog-crop98x98_13_13
-    ${dir}djpeg${suffix} -dct int -crop 98x98+13+13 -ppm
-      -outfile testout_444_islow_prog_crop98x98,13,13.ppm
-      testout_444_islow_prog.jpg)
-  add_test(djpeg${suffix}-444-islow-prog_crop98x98_13_13-cmp
-    ${MD5CMP} ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13}
-      testout_444_islow_prog_crop98x98,13,13.ppm)
+  add_bittest(djpeg 444-islow-prog-crop98x98_13_13
+    "-dct;int;-crop;98x98+13+13;-ppm"
+    testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+    ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13} cjpeg-${libtype}-444-islow-prog)
 
   # Context rows: No   Intra-iMCU row: No   ENT: arith
   if(WITH_ARITH_ENC)
-    add_test(cjpeg${suffix}-444-islow-ari
-      ${dir}cjpeg${suffix} -dct int -arithmetic -sample 1x1
+    add_test(cjpeg-${libtype}-444-islow-ari
+      cjpeg${suffix} -dct int -arithmetic -sample 1x1
         -outfile testout_444_islow_ari.jpg ${TESTIMAGES}/testorig.ppm)
     if(WITH_ARITH_DEC)
-      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0
-        ${dir}djpeg${suffix} -dct int -crop 37x37+0+0 -ppm
-          -outfile testout_444_islow_ari_crop37x37,0,0.ppm
-          testout_444_islow_ari.jpg)
-      add_test(djpeg${suffix}-444-islow-ari-crop37x37_0_0-cmp
-        ${MD5CMP} ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0}
-          testout_444_islow_ari_crop37x37,0,0.ppm)
+      add_bittest(djpeg 444-islow-ari-crop37x37_0_0
+        "-dct;int;-crop;37x37+0+0;-ppm"
+        testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
+        ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0} cjpeg-${libtype}-444-islow-ari)
     endif()
   endif()
 
-  add_test(jpegtran${suffix}-crop
-    ${dir}jpegtran${suffix} -crop 120x90+20+50 -transpose -perfect
-      -outfile testout_crop.jpg ${TESTIMAGES}/${TESTORIG})
-  add_test(jpegtran${suffix}-crop-cmp
-    ${MD5CMP} ${MD5_JPEG_CROP} testout_crop.jpg)
+  add_bittest(jpegtran crop "-crop;120x90+20+50;-transpose;-perfect"
+    testout_crop.jpg ${TESTIMAGES}/${TESTORIG}
+    ${MD5_JPEG_CROP})
 
 endforeach()
 
 add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P
-  ${CMAKE_SOURCE_DIR}/cmakescripts/testclean.cmake)
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/testclean.cmake)
 
-
-#
-# Installer
-#
-
-if(MSVC)
-  set(INST_PLATFORM "Visual C++")
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
-elseif(MINGW)
-  set(INST_PLATFORM GCC)
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
-  set(INST_DEFS -DGCC)
+if(WITH_TURBOJPEG)
+  configure_file(tjbenchtest.in tjbenchtest @ONLY)
+  configure_file(tjexampletest.in tjexampletest @ONLY)
+  if(WIN32)
+    set(BASH bash)
+  endif()
+  if(WITH_JAVA)
+    configure_file(tjbenchtest.java.in tjbenchtest.java @ONLY)
+    configure_file(tjexampletest.java.in tjexampletest.java @ONLY)
+    add_custom_target(tjtest
+      COMMAND echo tjbenchtest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+      COMMAND echo tjbenchtest -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
+      COMMAND echo tjbenchtest -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
+      COMMAND echo tjbenchtest -yuv -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
+      COMMAND echo tjbenchtest -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
+      COMMAND echo tjexampletest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
+      COMMAND echo tjbenchtest.java
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+      COMMAND echo tjbenchtest.java -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -yuv
+      COMMAND echo tjbenchtest.java -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -progressive
+      COMMAND echo tjexampletest.java
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest.java
+      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+        ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+        ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest)
+  else()
+    add_custom_target(tjtest
+      COMMAND echo tjbenchtest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+      COMMAND echo tjbenchtest -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
+      COMMAND echo tjbenchtest -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
+      COMMAND echo tjbenchtest -yuv -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
+      COMMAND echo tjexampletest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
+      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest)
+  endif()
 endif()
 
-if(64BIT)
-  set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
-  set(INST_NAME ${INST_NAME}64)
-  set(INST_REG_NAME ${INST_DIR}64)
-  set(INST_DEFS ${INST_DEFS} -DWIN64)
+
+###############################################################################
+# INSTALLATION
+###############################################################################
+
+if(WIN32)
+  set(EXE ".exe")
 endif()
 
-if(WITH_JAVA)
-  set(INST_DEFS ${INST_DEFS} -DJAVA)
-endif()
-
-if(MSVC_IDE)
-  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=${CMAKE_CFG_INTDIR}\\")
-else()
-  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=")
-endif()
-
-STRING(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
-
-configure_file(release/libjpeg-turbo.nsi.in libjpeg-turbo.nsi @ONLY)
-
-if(WITH_JAVA)
-  set(JAVA_DEPEND java)
-endif()
-add_custom_target(installer
-  makensis -nocd ${INST_DEFS} libjpeg-turbo.nsi
-  DEPENDS jpeg jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom
-    cjpeg djpeg jpegtran tjbench ${JAVA_DEPEND}
-  SOURCES libjpeg-turbo.nsi)
-
 if(WITH_TURBOJPEG)
   if(ENABLE_SHARED)
     install(TARGETS turbojpeg tjbench
-      ARCHIVE DESTINATION lib
-      LIBRARY DESTINATION lib
-      RUNTIME DESTINATION bin)
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
   if(ENABLE_STATIC)
-    install(TARGETS turbojpeg-static ARCHIVE DESTINATION lib)
+    install(TARGETS turbojpeg-static ARCHIVE
+      DESTINATION ${CMAKE_INSTALL_LIBDIR})
     if(NOT ENABLE_SHARED)
-      install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/tjbench-static.exe
-        DESTINATION bin RENAME tjbench.exe)
+      install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/tjbench-static${EXE}
+        DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME tjbench${EXE})
     endif()
   endif()
-  install(FILES ${CMAKE_SOURCE_DIR}/turbojpeg.h DESTINATION include)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg.h
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()
 
 if(ENABLE_STATIC)
-  install(TARGETS jpeg-static ARCHIVE DESTINATION lib)
+  install(TARGETS jpeg-static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
   if(NOT ENABLE_SHARED)
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/cjpeg-static.exe
-      DESTINATION bin RENAME cjpeg.exe)
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/djpeg-static.exe
-      DESTINATION bin RENAME djpeg.exe)
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/jpegtran-static.exe
-      DESTINATION bin RENAME jpegtran.exe)
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/cjpeg-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME cjpeg${EXE})
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/djpeg-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME djpeg${EXE})
+    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/jpegtran-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME jpegtran${EXE})
   endif()
 endif()
 
-install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION bin)
+install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
-install(FILES ${CMAKE_SOURCE_DIR}/README.ijg ${CMAKE_SOURCE_DIR}/README.md
-  ${CMAKE_SOURCE_DIR}/example.c ${CMAKE_SOURCE_DIR}/libjpeg.txt
-  ${CMAKE_SOURCE_DIR}/structure.txt ${CMAKE_SOURCE_DIR}/usage.txt
-  ${CMAKE_SOURCE_DIR}/wizard.txt
-  DESTINATION doc)
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.ijg
+  ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${CMAKE_CURRENT_SOURCE_DIR}/example.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/tjexample.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/libjpeg.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/structure.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/usage.txt ${CMAKE_CURRENT_SOURCE_DIR}/wizard.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR})
+if(WITH_JAVA)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/java/TJExample.java
+    DESTINATION ${CMAKE_INSTALL_DOCDIR})
+endif()
 
-install(FILES ${CMAKE_BINARY_DIR}/jconfig.h ${CMAKE_SOURCE_DIR}/jerror.h
-  ${CMAKE_SOURCE_DIR}/jmorecfg.h ${CMAKE_SOURCE_DIR}/jpeglib.h
-  DESTINATION include)
+if(UNIX)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cjpeg.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/djpeg.1 ${CMAKE_CURRENT_SOURCE_DIR}/jpegtran.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/rdjpgcom.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/wrjpgcom.1
+    DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
+    ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+endif()
 
-configure_file("${CMAKE_SOURCE_DIR}/cmakescripts/cmake_uninstall.cmake.in"
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/jpeglib.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+include(cmakescripts/BuildPackages.cmake)
+
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/cmake_uninstall.cmake.in"
   "cmake_uninstall.cmake" IMMEDIATE @ONLY)
 
 add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P cmake_uninstall.cmake)
diff --git a/ChangeLog.md b/ChangeLog.md
index 406c6db..3a904a5 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,16 +1,124 @@
-1.5.4
-=====
+1.6 pre-beta
+============
 
-1. Fixed two signed integer overflows in the arithmetic decoder, detected by
+### Significant changes relative to 1.5.3:
+
+1. Added AVX2 SIMD implementations of the colorspace conversion, chroma
+downsampling and upsampling, integer quantization and sample conversion, and
+slow integer DCT/IDCT algorithms.  When using the slow integer DCT/IDCT
+algorithms, the compression of RGB images is approximately 13-36% (avg. 22%)
+faster (relative to libjpeg-turbo 1.5.x) with 64-bit code and 11-21% (avg. 17%)
+faster with 32-bit code, and the decompression of RGB images is approximately
+9-35% (avg. 17%) faster with 64-bit code and 7-17% (avg. 12%) faster with
+32-bit code.  (As tested on a 3 GHz Intel Core i7.  Actual mileage may vary.)
+
+2. Overhauled the build system to use CMake on all platforms, and removed the
+autotools-based build system.  This decision resulted from extensive
+discussions within the libjpeg-turbo community.  libjpeg-turbo traditionally
+used CMake only for Windows builds, but there was an increasing amount of
+demand to extend CMake support to other platforms.  However, because of the
+unique nature of our code base (the need to support different assemblers on
+each platform, the need for Java support, etc.), providing dual build systems
+as other OSS imaging libraries do (including libpng and libtiff) would have
+created a maintenance burden.  The use of CMake greatly simplifies some aspects
+of our build system, owing to CMake's built-in support for various assemblers,
+Java, and unit testing, as well as generally fewer quirks that have to be
+worked around in order to implement our packaging system.  Eliminating
+autotools puts our project slightly at odds with the traditional practices of
+the OSS community, since most "system libraries" tend to be built with
+autotools, but it is believed that the benefits of this move outweigh the
+risks.  In addition to providing a unified build environment, switching to
+CMake allows for the use of various build tools and IDEs that aren't supported
+under autotools, including Xcode, Ninja, and Eclipse.  It also eliminates the
+need to install autotools via MacPorts/Homebrew on OS X and allows
+libjpeg-turbo to be configured without the use of a terminal/command prompt.
+Extensive testing was conducted to ensure that all features provided by the
+autotools-based build system are provided by the new build system.
+
+3. The libjpeg API in this version of libjpeg-turbo now includes two additional
+functions, `jpeg_read_icc_profile()` and `jpeg_write_icc_profile()`, that can
+be used to extract ICC profile data from a JPEG file while decompressing or to
+embed ICC profile data in a JPEG file while compressing or transforming.  This
+eliminates the need for downstream projects, such as color management libraries
+and browsers, to include their own glueware for accomplishing this.
+
+4. Improved error handling in the TurboJPEG API library:
+
+     - Introduced a new function (`tjGetErrorStr2()`) in the TurboJPEG C API
+that allows compression/decompression/transform error messages to be retrieved
+in a thread-safe manner.  Retrieving error messages from global functions, such
+as `tjInitCompress()` or `tjBufSize()`, is still thread-unsafe, but since those
+functions will only throw errors if passed an invalid argument or if a memory
+allocation failure occurs, thread safety is not as much of a concern.
+     - Introduced a new function (`tjGetErrorCode()`) in the TurboJPEG C API
+and a new method (`TJException.getErrorCode()`) in the TurboJPEG Java API that
+can be used to determine the severity of the last
+compression/decompression/transform error.  This allows applications to
+choose whether to ignore warnings (non-fatal errors) from the underlying
+libjpeg API or to treat them as fatal.
+     - Introduced a new flag (`TJFLAG_STOPONWARNING` in the TurboJPEG C API and
+`TJ.FLAG_STOPONWARNING` in the TurboJPEG Java API) that causes the library to
+immediately halt a compression/decompression/transform operation if it
+encounters a warning from the underlying libjpeg API (the default behavior is
+to allow the operation to complete unless a fatal error is encountered.)
+
+5. Introduced a new flag in the TurboJPEG C and Java APIs (`TJFLAG_PROGRESSIVE`
+and `TJ.FLAG_PROGRESSIVE`, respectively) that causes the library to use
+progressive entropy coding in JPEG images generated by compression and
+transform operations.  Additionally, a new transform option
+(`TJXOPT_PROGRESSIVE` in the C API and `TJTransform.OPT_PROGRESSIVE` in the
+Java API) has been introduced, allowing progressive entropy coding to be
+enabled for selected transforms in a multi-transform operation.
+
+6. Introduced a new transform option in the TurboJPEG API (`TJXOPT_COPYNONE` in
+the C API and `TJTransform.OPT_COPYNONE` in the Java API) that allows the
+copying of markers (including EXIF and ICC profile data) to be disabled for a
+particular transform.
+
+7. Added two functions to the TurboJPEG C API (`tjLoadImage()` and
+`tjSaveImage()`) that can be used to load/save a BMP or PPM/PGM image to/from a
+memory buffer with a specified pixel format and layout.  These functions
+replace the project-private (and slow) bmp API, which was previously used by
+TJBench, and they also provide a convenient way for first-time users of
+libjpeg-turbo to quickly develop a complete JPEG compression/decompression
+program.
+
+8. The TurboJPEG C API now includes a new convenience array (`tjAlphaOffset[]`)
+that contains the alpha component index for each pixel format (or -1 if the
+pixel format lacks an alpha component.)  The TurboJPEG Java API now includes a
+new method (`TJ.getAlphaOffset()`) that returns the same value.  In addition,
+the `tjRedOffset[]`, `tjGreenOffset[]`, and `tjBlueOffset[]` arrays-- and the
+corresponding `TJ.getRedOffset()`, `TJ.getGreenOffset()`, and
+`TJ.getBlueOffset()` methods-- now return -1 for `TJPF_GRAY`/`TJ.PF_GRAY`
+rather than 0.  This allows programs to easily determine whether a pixel format
+has red, green, blue, and alpha components.
+
+9. Added a new example (tjexample.c) that demonstrates the basic usage of the
+TurboJPEG C API.  This example mirrors the functionality of TJExample.java.
+Both files are now included in the libjpeg-turbo documentation.
+
+10. Fixed two signed integer overflows in the arithmetic decoder, detected by
 the Clang undefined behavior sanitizer, that could be triggered by attempting
 to decompress a specially-crafted malformed JPEG image.  These issues did not
 pose a security threat, but removing the warnings makes it easier to detect
 actual security issues, should they arise in the future.
 
-2. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
+11. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
 algorithm that caused incorrect dithering in the output image.  This algorithm
 now produces bitwise-identical results to the unmerged algorithms.
 
+12. The SIMD function symbols for x86[-64]/ELF, MIPS/ELF, macOS/x86[-64] (if
+libjpeg-turbo is built with YASM), and iOS/ARM[64] builds are now private.
+This prevents those symbols from being exposed in applications or shared
+libraries that link statically with libjpeg-turbo.
+
+13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
+YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
+chroma upsampling, integer quantization, and slow integer DCT/IDCT algorithms.
+When using the slow integer DCT/IDCT, this speeds up the compression of RGB
+images by approximately 70-100% and the decompression of RGB images by
+approximately 2-3.5x.
+
 
 1.5.3
 =====
diff --git a/LICENSE.md b/LICENSE.md
index 0572390..8dfc045 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -11,7 +11,8 @@
 
 - The Modified (3-clause) BSD License, which is listed below
 
-  This license covers the TurboJPEG API library and associated programs.
+  This license covers the TurboJPEG API library and associated programs, as
+  well as the build system.
 
 - The zlib License, which is listed below
 
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index 1d3d420..0000000
--- a/Makefile.am
+++ /dev/null
@@ -1,794 +0,0 @@
-lib_LTLIBRARIES = libjpeg.la
-libjpeg_la_LDFLAGS = -version-info ${LIBTOOL_CURRENT}:${SO_MINOR_VERSION}:${SO_AGE} -no-undefined
-include_HEADERS = jerror.h jmorecfg.h jpeglib.h
-
-if WITH_TURBOJPEG
-lib_LTLIBRARIES += libturbojpeg.la
-libturbojpeg_la_LDFLAGS = -version-info 1:0:1 -no-undefined
-include_HEADERS += turbojpeg.h
-endif
-
-nodist_include_HEADERS = jconfig.h
-
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = pkgscripts/libjpeg.pc
-if WITH_TURBOJPEG
-pkgconfig_DATA += pkgscripts/libturbojpeg.pc
-endif
-
-HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
-	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
-	jpeg_nbits_table.h
-
-libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
-	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
-	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
-	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
-	jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
-	jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
-	jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
-	jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
-
-if WITH_ARITH
-libjpeg_la_SOURCES += jaricom.c
-endif
-
-if WITH_ARITH_ENC
-libjpeg_la_SOURCES += jcarith.c
-endif
-
-if WITH_ARITH_DEC
-libjpeg_la_SOURCES += jdarith.c
-endif
-
-
-SUBDIRS = java
-
-
-if WITH_TURBOJPEG
-
-libturbojpeg_la_SOURCES = $(libjpeg_la_SOURCES) turbojpeg.c turbojpeg.h \
-	transupp.c transupp.h jdatadst-tj.c jdatasrc-tj.c
-
-if WITH_JAVA
-
-libturbojpeg_la_SOURCES += turbojpeg-jni.c
-libturbojpeg_la_CFLAGS = ${JNI_CFLAGS}
-TJMAPFILE = turbojpeg-mapfile.jni
-
-else
-
-TJMAPFILE = turbojpeg-mapfile
-
-endif
-
-libturbojpeg_la_SOURCES += $(TJMAPFILE)
-
-if VERSION_SCRIPT
-libturbojpeg_la_LDFLAGS += $(VERSION_SCRIPT_FLAG)$(srcdir)/$(TJMAPFILE)
-endif
-
-endif
-
-
-if VERSION_SCRIPT
-libjpeg_la_LDFLAGS += $(VERSION_SCRIPT_FLAG)libjpeg.map
-endif
-
-
-if WITH_SIMD
-
-SUBDIRS += simd
-libjpeg_la_LIBADD = simd/libsimd.la
-libturbojpeg_la_LIBADD = simd/libsimd.la
-
-else
-
-libjpeg_la_SOURCES += jsimd_none.c
-
-endif
-
-
-bin_PROGRAMS = cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-noinst_PROGRAMS = jcstest
-
-
-if WITH_TURBOJPEG
-
-bin_PROGRAMS += tjbench
-
-noinst_PROGRAMS += tjunittest
-
-tjbench_SOURCES = tjbench.c bmp.h bmp.c tjutil.h tjutil.c rdbmp.c rdppm.c \
-	wrbmp.c wrppm.c
-
-tjbench_LDADD = libturbojpeg.la libjpeg.la -lm
-
-tjbench_CFLAGS = -DBMP_SUPPORTED -DPPM_SUPPORTED
-
-tjunittest_SOURCES = tjunittest.c tjutil.h tjutil.c
-
-tjunittest_LDADD = libturbojpeg.la
-
-endif
-
-
-cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdgif.c rdppm.c rdswitch.c
-if WITH_12BIT
-else
-cjpeg_SOURCES += rdbmp.c rdtarga.c
-endif
-
-cjpeg_LDADD = libjpeg.la
-
-cjpeg_CFLAGS = -DGIF_SUPPORTED -DPPM_SUPPORTED
-if WITH_12BIT
-else
-cjpeg_CFLAGS += -DBMP_SUPPORTED -DTARGA_SUPPORTED
-endif
-
-djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
-	wrgif.c wrppm.c
-if WITH_12BIT
-else
-djpeg_SOURCES += wrbmp.c wrtarga.c
-endif
-
-djpeg_LDADD = libjpeg.la
-
-djpeg_CFLAGS = -DGIF_SUPPORTED -DPPM_SUPPORTED
-if WITH_12BIT
-else
-djpeg_CFLAGS += -DBMP_SUPPORTED -DTARGA_SUPPORTED
-endif
-
-jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c transupp.h
-
-jpegtran_LDADD = libjpeg.la
-
-rdjpgcom_SOURCES = rdjpgcom.c
-
-rdjpgcom_LDADD = libjpeg.la
-
-wrjpgcom_SOURCES = wrjpgcom.c
-
-wrjpgcom_LDADD = libjpeg.la
-
-jcstest_SOURCES = jcstest.c
-
-jcstest_LDADD = libjpeg.la
-
-dist_man1_MANS = cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
-
-DOCS= coderules.txt jconfig.txt change.log rdrle.c wrrle.c BUILDING.md \
-	ChangeLog.md
-
-dist_doc_DATA = README.ijg README.md libjpeg.txt structure.txt usage.txt \
-	wizard.txt LICENSE.md
-
-exampledir = $(docdir)
-dist_example_DATA = example.c
-
-
-EXTRA_DIST = win release $(DOCS) testimages CMakeLists.txt \
-	sharedlib/CMakeLists.txt cmakescripts libjpeg.map.in doc doxygen.config \
-	doxygen-extra.css jccolext.c jdcolext.c jdcol565.c jdmrgext.c jdmrg565.c \
-	jstdhuff.c jdcoefct.h jdmainct.h jdmaster.h jdsample.h \
-	md5/CMakeLists.txt Brewfile
-
-dist-hook:
-	rm -rf `find $(distdir) -name .svn`
-
-
-SUBDIRS += md5
-
-if WITH_12BIT
-
-TESTORIG = testorig12.jpg
-MD5_JPEG_RGB_ISLOW = 9620f424569594bb9242b48498ad801f
-MD5_PPM_RGB_ISLOW = f3301d2219783b8b3d942b7239fa50c0
-MD5_JPEG_422_IFAST_OPT = 7322e3bd2f127f7de4b40d4480ce60e4
-MD5_PPM_422_IFAST = 79807fa552899e66a04708f533e16950
-MD5_PPM_422M_IFAST = 07737bfe8a7c1c87aaa393a0098d16b0
-MD5_JPEG_420_IFAST_Q100_PROG = a1da220b5604081863a504297ed59e55
-MD5_PPM_420_Q100_IFAST = 1b3730122709f53d007255e8dfd3305e
-MD5_PPM_420M_Q100_IFAST = 980a1a3c5bf9510022869d30b7d26566
-MD5_JPEG_GRAY_ISLOW = 235c90707b16e2e069f37c888b2636d9
-MD5_PPM_GRAY_ISLOW = 7213c10af507ad467da5578ca5ee1fca
-MD5_PPM_GRAY_ISLOW_RGB = e96ee81c30a6ed422d466338bd3de65d
-MD5_JPEG_420S_IFAST_OPT = 7af8e60be4d9c227ec63ac9b6630855e
-MD5_JPEG_3x2_FLOAT_PROG_SSE = a8c17daf77b457725ec929e215b603f8
-MD5_PPM_3x2_FLOAT_SSE = 42876ab9e5c2f76a87d08db5fbd57956
-MD5_JPEG_3x2_FLOAT_PROG_32BIT = a8c17daf77b457725ec929e215b603f8
-MD5_PPM_3x2_FLOAT_32BIT = 42876ab9e5c2f76a87d08db5fbd57956
-MD5_PPM_3x2_FLOAT_64BIT = d6fbc71153b3d8ded484dbc17c7b9cf4
-MD5_JPEG_3x2_FLOAT_PROG_387 = bc6dbbefac2872f6b9d6c4a0ae60c3c0
-MD5_PPM_3x2_FLOAT_387 = bcc5723c61560463ac60f772e742d092
-MD5_JPEG_3x2_IFAST_PROG = 1396cc2b7185cfe943d408c9d305339e
-MD5_PPM_3x2_IFAST = 3975985ef6eeb0a2cdc58daa651ccc00
-MD5_PPM_420M_ISLOW_2_1 = 4ca6be2a6f326ff9eaab63e70a8259c0
-MD5_PPM_420M_ISLOW_15_8 = 12aa9f9534c1b3d7ba047322226365eb
-MD5_PPM_420M_ISLOW_13_8 = f7e22817c7b25e1393e4ec101e9d4e96
-MD5_PPM_420M_ISLOW_11_8 = 800a16f9f4dc9b293197bfe11be10a82
-MD5_PPM_420M_ISLOW_9_8 = 06b7a92a9bc69f4dc36ec40f1937d55c
-MD5_PPM_420M_ISLOW_7_8 = 3ec444a14a4ab4eab88ffc49c48eca43
-MD5_PPM_420M_ISLOW_3_4 = 3e726b7ea872445b19437d1c1d4f0d93
-MD5_PPM_420M_ISLOW_5_8 = a8a771abdc94301d20ffac119b2caccd
-MD5_PPM_420M_ISLOW_1_2 = b419124dd5568b085787234866102866
-MD5_PPM_420M_ISLOW_3_8 = 343d19015531b7bbe746124127244fa8
-MD5_PPM_420M_ISLOW_1_4 = 35fd59d866e44659edfa3c18db2a3edb
-MD5_PPM_420M_ISLOW_1_8 = ccaed48ac0aedefda5d4abe4013f4ad7
-MD5_PPM_420_ISLOW_SKIP15_31 = 86664cd9dc956536409e44e244d20a97
-MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 452a21656115a163029cfba5c04fa76a
-MD5_PPM_444_ISLOW_SKIP1_6 = ef63901f71ef7a75cd78253fc0914f84
-MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = 15b173fb5872d9575572fbcc1b05956f
-MD5_JPEG_CROP = cdb35ff4b4519392690ea040c56ea99c
-
-else
-
-TESTORIG = testorig.jpg
-MD5_JPEG_RGB_ISLOW = 768e970dd57b340ff1b83c9d3d47c77b
-MD5_PPM_RGB_ISLOW = 00a257f5393fef8821f2b88ac7421291
-MD5_BMP_RGB_ISLOW_565 = f07d2e75073e4bb10f6c6f4d36e2e3be
-MD5_BMP_RGB_ISLOW_565D = 4cfa0928ef3e6bb626d7728c924cfda4
-MD5_JPEG_422_IFAST_OPT = 2540287b79d913f91665e660303ab2c8
-MD5_PPM_422_IFAST = 35bd6b3f833bad23de82acea847129fa
-MD5_PPM_422M_IFAST = 8dbc65323d62cca7c91ba02dd1cfa81d
-MD5_BMP_422M_IFAST_565 = 3294bd4d9a1f2b3d08ea6020d0db7065
-MD5_BMP_422M_IFAST_565D = da98c9c7b6039511be4a79a878a9abc1
-MD5_JPEG_420_IFAST_Q100_PROG = 990cbe0329c882420a2094da7e5adade
-MD5_PPM_420_Q100_IFAST = 5a732542015c278ff43635e473a8a294
-MD5_PPM_420M_Q100_IFAST = ff692ee9323a3b424894862557c092f1
-MD5_JPEG_GRAY_ISLOW = 72b51f894b8f4a10b3ee3066770aa38d
-MD5_PPM_GRAY_ISLOW = 8d3596c56eace32f205deccc229aa5ed
-MD5_PPM_GRAY_ISLOW_RGB = 116424ac07b79e5e801f00508eab48ec
-MD5_BMP_GRAY_ISLOW_565 = 12f78118e56a2f48b966f792fedf23cc
-MD5_BMP_GRAY_ISLOW_565D = bdbbd616441a24354c98553df5dc82db
-MD5_JPEG_420S_IFAST_OPT = 388708217ac46273ca33086b22827ed8
-# See README.md for more details on why this next bit is necessary.
-MD5_JPEG_3x2_FLOAT_PROG_SSE = 343e3f8caf8af5986ebaf0bdc13b5c71
-MD5_PPM_3x2_FLOAT_SSE = 1a75f36e5904d6fc3a85a43da9ad89bb
-MD5_JPEG_3x2_FLOAT_PROG_32BIT = 9bca803d2042bd1eb03819e2bf92b3e5
-MD5_PPM_3x2_FLOAT_32BIT = f6bfab038438ed8f5522fbd33595dcdc
-MD5_PPM_3x2_FLOAT_64BIT = 0e917a34193ef976b679a6b069b1be26
-MD5_JPEG_3x2_FLOAT_PROG_387 = 1657664a410e0822c924b54f6f65e6e9
-MD5_PPM_3x2_FLOAT_387 = cb0a1f027f3d2917c902b5640214e025
-MD5_JPEG_3x2_IFAST_PROG = 1ee5d2c1a77f2da495f993c8c7cceca5
-MD5_PPM_3x2_IFAST = fd283664b3b49127984af0a7f118fccd
-MD5_JPEG_420_ISLOW_ARI = e986fb0a637a8d833d96e8a6d6d84ea1
-MD5_JPEG_444_ISLOW_PROGARI = 0a8f1c8f66e113c3cf635df0a475a617
-MD5_PPM_420M_IFAST_ARI = 72b59a99bcf1de24c5b27d151bde2437
-MD5_JPEG_420_ISLOW = 9a68f56bc76e466aa7e52f415d0f4a5f
-MD5_PPM_420M_ISLOW_2_1 = 9f9de8c0612f8d06869b960b05abf9c9
-MD5_PPM_420M_ISLOW_15_8 = b6875bc070720b899566cc06459b63b7
-MD5_PPM_420M_ISLOW_13_8 = bc3452573c8152f6ae552939ee19f82f
-MD5_PPM_420M_ISLOW_11_8 = d8cc73c0aaacd4556569b59437ba00a5
-MD5_PPM_420M_ISLOW_9_8 = d25e61bc7eac0002f5b393aa223747b6
-MD5_PPM_420M_ISLOW_7_8 = ddb564b7c74a09494016d6cd7502a946
-MD5_PPM_420M_ISLOW_3_4 = 8ed8e68808c3fbc4ea764fc9d2968646
-MD5_PPM_420M_ISLOW_5_8 = a3363274999da2366a024efae6d16c9b
-MD5_PPM_420M_ISLOW_1_2 = e692a315cea26b988c8e8b29a5dbcd81
-MD5_PPM_420M_ISLOW_3_8 = 79eca9175652ced755155c90e785a996
-MD5_PPM_420M_ISLOW_1_4 = 79cd778f8bf1a117690052cacdd54eca
-MD5_PPM_420M_ISLOW_1_8 = 391b3d4aca640c8567d6f8745eb2142f
-MD5_BMP_420_ISLOW_256 = 4980185e3776e89bd931736e1cddeee6
-MD5_BMP_420_ISLOW_565 = bf9d13e16c4923b92e1faa604d7922cb
-MD5_BMP_420_ISLOW_565D = 6bde71526acc44bcff76f696df8638d2
-MD5_BMP_420M_ISLOW_565 = 8dc0185245353cfa32ad97027342216f
-MD5_BMP_420M_ISLOW_565D = ce034037d212bc403330df6f915c161b
-MD5_PPM_420_ISLOW_SKIP15_31 = c4c65c1e43d7275cd50328a61e6534f0
-MD5_PPM_420_ISLOW_ARI_SKIP16_139 = 087c6b123db16ac00cb88c5b590bb74a
-MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 = 26eb36ccc7d1f0cb80cdabb0ac8b5d99
-MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 = 886c6775af22370257122f8b16207e6d
-MD5_PPM_444_ISLOW_SKIP1_6 = 5606f86874cf26b8fcee1117a0a436a6
-MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 = db87dc7ce26bcdc7a6b56239ce2b9d6c
-MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 = cb57b32bd6d03e35432362f7bf184b6d
-MD5_JPEG_CROP = b4197f377e621c4e9b1d20471432610d
-
-endif
-
-.PHONY: test
-test: tjquicktest tjbittest bittest
-
-if CROSS_COMPILING
-tjquicktest: testclean
-else
-tjquicktest: testclean all
-endif
-
-if WITH_TURBOJPEG
-if WITH_JAVA
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -bi
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -noyuvpad
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi
-	$(JAVA) -cp java/turbojpeg.jar -Djava.library.path=.libs TJUnitTest -yuv -bi -noyuvpad
-endif
-	./tjunittest
-	./tjunittest -alloc
-	./tjunittest -yuv
-	./tjunittest -yuv -alloc
-	./tjunittest -yuv -noyuvpad
-endif
-	echo GREAT SUCCESS!
-
-if CROSS_COMPILING
-tjbittest: testclean
-else
-tjbittest: testclean all
-endif
-
-if WITH_TURBOJPEG
-
-MD5_PPM_GRAY_TILE = 89d3ca21213d9d864b50b4e4e7de4ca6
-MD5_PPM_420_8x8_TILE = 847fceab15c5b7b911cb986cf0f71de3
-MD5_PPM_420_16x16_TILE = ca45552a93687e078f7137cc4126a7b0
-MD5_PPM_420_32x32_TILE = d8676f1d6b68df358353bba9844f4a00
-MD5_PPM_420_64x64_TILE = 4e4c1a3d7ea4bace4f868bcbe83b7050
-MD5_PPM_420_128x128_TILE = f24c3429c52265832beab9df72a0ceae
-MD5_PPM_420M_8x8_TILE = bc25320e1f4c31ce2e610e43e9fd173c
-MD5_PPM_420M_TILE = 75ffdf14602258c5c189522af57fa605
-MD5_PPM_422_8x8_TILE = d83dacd9fc73b0a6f10c09acad64eb1e
-MD5_PPM_422_16x16_TILE = 35077fb610d72dd743b1eb0cbcfe10fb
-MD5_PPM_422_32x32_TILE = e6902ed8a449ecc0f0d6f2bf945f65f7
-MD5_PPM_422_64x64_TILE = 2b4502a8f316cedbde1da7bce3d2231e
-MD5_PPM_422_128x128_TILE = f0b5617d578f5e13c8eee215d64d4877
-MD5_PPM_422M_8x8_TILE = 828941d7f41cd6283abd6beffb7fd51d
-MD5_PPM_422M_TILE = e877ae1324c4a280b95376f7f018172f
-MD5_PPM_444_TILE = 7964e41e67cfb8d0a587c0aa4798f9c3
-
-# Test compressing from/decompressing to an arbitrary subregion of a larger
-# image buffer
-	cp $(srcdir)/testimages/testorig.ppm testout_tile.ppm
-	./tjbench testout_tile.ppm 95 -rgb -quiet -tile -benchtime 0.01 -warmup 0 >/dev/null 2>&1
-	for i in 8 16 32 64 128; do \
-		md5/md5cmp $(MD5_PPM_GRAY_TILE) testout_tile_GRAY_Q95_$$i\x$$i.ppm; \
-	done
-	md5/md5cmp $(MD5_PPM_420_8x8_TILE) testout_tile_420_Q95_8x8.ppm
-	md5/md5cmp $(MD5_PPM_420_16x16_TILE) testout_tile_420_Q95_16x16.ppm
-	md5/md5cmp $(MD5_PPM_420_32x32_TILE) testout_tile_420_Q95_32x32.ppm
-	md5/md5cmp $(MD5_PPM_420_64x64_TILE) testout_tile_420_Q95_64x64.ppm
-	md5/md5cmp $(MD5_PPM_420_128x128_TILE) testout_tile_420_Q95_128x128.ppm
-	md5/md5cmp $(MD5_PPM_422_8x8_TILE) testout_tile_422_Q95_8x8.ppm
-	md5/md5cmp $(MD5_PPM_422_16x16_TILE) testout_tile_422_Q95_16x16.ppm
-	md5/md5cmp $(MD5_PPM_422_32x32_TILE) testout_tile_422_Q95_32x32.ppm
-	md5/md5cmp $(MD5_PPM_422_64x64_TILE) testout_tile_422_Q95_64x64.ppm
-	md5/md5cmp $(MD5_PPM_422_128x128_TILE) testout_tile_422_Q95_128x128.ppm
-	for i in 8 16 32 64 128; do \
-		md5/md5cmp $(MD5_PPM_444_TILE) testout_tile_444_Q95_$$i\x$$i.ppm; \
-	done
-	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_*
-
-	./tjbench testout_tile.ppm 95 -rgb -fastupsample -quiet -tile -benchtime 0.01 -warmup 0 >/dev/null 2>&1
-	md5/md5cmp $(MD5_PPM_420M_8x8_TILE) testout_tile_420_Q95_8x8.ppm
-	for i in 16 32 64 128; do \
-		md5/md5cmp $(MD5_PPM_420M_TILE) testout_tile_420_Q95_$$i\x$$i.ppm; \
-	done
-	md5/md5cmp $(MD5_PPM_422M_8x8_TILE) testout_tile_422_Q95_8x8.ppm
-	for i in 16 32 64 128; do \
-		md5/md5cmp $(MD5_PPM_422M_TILE) testout_tile_422_Q95_$$i\x$$i.ppm; \
-	done
-	rm -f testout_tile_GRAY_* testout_tile_420_* testout_tile_422_* testout_tile_444_* testout_tile.ppm
-	echo GREAT SUCCESS!
-
-endif
-
-if CROSS_COMPILING
-bittest: testclean
-else
-bittest: testclean all
-endif
-
-# These tests are carefully crafted to provide full coverage of as many of the
-# underlying algorithms as possible (including all of the SIMD-accelerated
-# ones.)
-
-# CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
-	./cjpeg -rgb -dct int -outfile testout_rgb_islow.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_RGB_ISLOW) testout_rgb_islow.jpg
-# CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -ppm -outfile testout_rgb_islow.ppm testout_rgb_islow.jpg
-	md5/md5cmp $(MD5_PPM_RGB_ISLOW) testout_rgb_islow.ppm
-	rm -f testout_rgb_islow.ppm
-if WITH_12BIT
-	rm -f testout_rgb_islow.jpg
-else
-# CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_rgb_islow_565.bmp testout_rgb_islow.jpg
-	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565) testout_rgb_islow_565.bmp
-	rm -f testout_rgb_islow_565.bmp
-# CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -bmp -outfile testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
-	md5/md5cmp $(MD5_BMP_RGB_ISLOW_565D) testout_rgb_islow_565D.bmp
-	rm -f testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
-endif
-
-# CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
-	./cjpeg -sample 2x1 -dct fast -opt -outfile testout_422_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_422_IFAST_OPT) testout_422_ifast_opt.jpg
-# CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
-	./djpeg -dct fast -outfile testout_422_ifast.ppm testout_422_ifast_opt.jpg
-	md5/md5cmp $(MD5_PPM_422_IFAST) testout_422_ifast.ppm
-	rm -f testout_422_ifast.ppm
-# CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-	./djpeg -dct fast -nosmooth -outfile testout_422m_ifast.ppm testout_422_ifast_opt.jpg
-	md5/md5cmp $(MD5_PPM_422M_IFAST) testout_422m_ifast.ppm
-	rm -f testout_422m_ifast.ppm
-if WITH_12BIT
-	rm -f testout_422_ifast_opt.jpg
-else
-# CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
-	md5/md5cmp $(MD5_BMP_422M_IFAST_565) testout_422m_ifast_565.bmp
-	rm -f testout_422m_ifast_565.bmp
-# CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
-	md5/md5cmp $(MD5_BMP_422M_IFAST_565D) testout_422m_ifast_565D.bmp
-	rm -f testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
-endif
-
-# CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
-	./cjpeg -sample 2x2 -quality 100 -dct fast -prog -outfile testout_420_q100_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_420_IFAST_Q100_PROG) testout_420_q100_ifast_prog.jpg
-# CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
-	./djpeg -dct fast -outfile testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
-	md5/md5cmp $(MD5_PPM_420_Q100_IFAST) testout_420_q100_ifast.ppm
-	rm -f testout_420_q100_ifast.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
-	./djpeg -dct fast -nosmooth -outfile testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
-	md5/md5cmp $(MD5_PPM_420M_Q100_IFAST) testout_420m_q100_ifast.ppm
-	rm -f testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
-
-# CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
-	./cjpeg -gray -dct int -outfile testout_gray_islow.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_GRAY_ISLOW) testout_gray_islow.jpg
-# CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -outfile testout_gray_islow.ppm testout_gray_islow.jpg
-	md5/md5cmp $(MD5_PPM_GRAY_ISLOW) testout_gray_islow.ppm
-	rm -f testout_gray_islow.ppm
-# CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb -outfile testout_gray_islow_rgb.ppm testout_gray_islow.jpg
-	md5/md5cmp $(MD5_PPM_GRAY_ISLOW_RGB) testout_gray_islow_rgb.ppm
-	rm -f testout_gray_islow_rgb.ppm
-if WITH_12BIT
-	rm -f testout_gray_islow.jpg
-else
-# CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_gray_islow_565.bmp testout_gray_islow.jpg
-	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565) testout_gray_islow_565.bmp
-	rm -f testout_gray_islow_565.bmp
-# CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -bmp -outfile testout_gray_islow_565D.bmp testout_gray_islow.jpg
-	md5/md5cmp $(MD5_BMP_GRAY_ISLOW_565D) testout_gray_islow_565D.bmp
-	rm -f testout_gray_islow_565D.bmp testout_gray_islow.jpg
-endif
-
-# CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
-# ENT: 2-pass huff
-	./cjpeg -sample 2x2 -smooth 1 -dct int -opt -outfile testout_420s_ifast_opt.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_420S_IFAST_OPT) testout_420s_ifast_opt.jpg
-	rm -f testout_420s_ifast_opt.jpg
-
-# The output of the floating point tests is not validated by default, because
-# the output differs depending on the type of floating point math used, and
-# this is only deterministic if the DCT/IDCT are implemented using SIMD
-# instructions on a particular platform.  Pass one of the following on the make
-# command line to validate the floating point tests against one of the expected
-# results:
-#
-# FLOATTEST=sse  validate against the expected results from the libjpeg-turbo
-#                SSE SIMD extensions
-# FLOATTEST=32bit  validate against the expected results from the C code
-#                  when running on a 32-bit FPU (or when SSE is being used for
-#                  floating point math, which is generally the default with
-#                  x86-64 compilers)
-# FLOATTEST=64bit  validate against the exepected results from the C code
-#                  when running on a 64-bit FPU
-# FLOATTEST=387  validate against the expected results from the C code when
-#                the 387 FPU is being used for floating point math (which is
-#                generally the default with x86 compilers)
-
-# CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
-	./cjpeg -sample 3x2 -dct float -prog -outfile testout_3x2_float_prog.jpg $(srcdir)/testimages/testorig.ppm
-	if [ "${FLOATTEST}" = "sse" ]; then \
-		md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG_SSE) testout_3x2_float_prog.jpg; \
-	elif [ "${FLOATTEST}" = "32bit" -o "${FLOATTEST}" = "64bit" ]; then \
-		md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG_32BIT) testout_3x2_float_prog.jpg; \
-	elif [ "${FLOATTEST}" = "387" ]; then \
-		md5/md5cmp $(MD5_JPEG_3x2_FLOAT_PROG_387) testout_3x2_float_prog.jpg; \
-	fi
-# CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
-	./djpeg -dct float -outfile testout_3x2_float.ppm testout_3x2_float_prog.jpg
-	if [ "${FLOATTEST}" = "sse" ]; then \
-		md5/md5cmp $(MD5_PPM_3x2_FLOAT_SSE) testout_3x2_float.ppm; \
-	elif [ "${FLOATTEST}" = "32bit" ]; then \
-		md5/md5cmp $(MD5_PPM_3x2_FLOAT_32BIT) testout_3x2_float.ppm; \
-	elif [ "${FLOATTEST}" = "64bit" ]; then \
-		md5/md5cmp $(MD5_PPM_3x2_FLOAT_64BIT) testout_3x2_float.ppm; \
-	elif [ "${FLOATTEST}" = "387" ]; then \
-		md5/md5cmp $(MD5_PPM_3x2_FLOAT_387) testout_3x2_float.ppm; \
-	fi
-	rm -f testout_3x2_float.ppm testout_3x2_float_prog.jpg
-
-# CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
-	./cjpeg -sample 3x2 -dct fast -prog -outfile testout_3x2_ifast_prog.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_3x2_IFAST_PROG) testout_3x2_ifast_prog.jpg
-# CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
-	./djpeg -dct fast -outfile testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
-	md5/md5cmp $(MD5_PPM_3x2_IFAST) testout_3x2_ifast.ppm
-	rm -f testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
-
-if WITH_ARITH_ENC
-# CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
-	./cjpeg -dct int -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
-	rm -f testout_420_islow_ari.jpg
-	./jpegtran -arithmetic -outfile testout_420_islow_ari.jpg $(srcdir)/testimages/testimgint.jpg
-	md5/md5cmp $(MD5_JPEG_420_ISLOW_ARI) testout_420_islow_ari.jpg
-	rm -f testout_420_islow_ari.jpg
-# CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
-	./cjpeg -sample 1x1 -dct int -prog -arithmetic -outfile testout_444_islow_progari.jpg $(srcdir)/testimages/testorig.ppm
-	md5/md5cmp $(MD5_JPEG_444_ISLOW_PROGARI) testout_444_islow_progari.jpg
-	rm -f testout_444_islow_progari.jpg
-endif
-if WITH_ARITH_DEC
-# CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
-	./djpeg -fast -ppm -outfile testout_420m_ifast_ari.ppm $(srcdir)/testimages/testimgari.jpg
-	md5/md5cmp $(MD5_PPM_420M_IFAST_ARI) testout_420m_ifast_ari.ppm
-	rm -f testout_420m_ifast_ari.ppm
-	./jpegtran -outfile testout_420_islow.jpg $(srcdir)/testimages/testimgari.jpg
-	md5/md5cmp $(MD5_JPEG_420_ISLOW) testout_420_islow.jpg
-	rm -f testout_420_islow.jpg
-endif
-
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
-	./djpeg -dct int -scale 2/1 -nosmooth -ppm -outfile testout_420m_islow_2_1.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_2_1) testout_420m_islow_2_1.ppm
-	rm -f testout_420m_islow_2_1.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
-	./djpeg -dct int -scale 15/8 -nosmooth -ppm -outfile testout_420m_islow_15_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_15_8) testout_420m_islow_15_8.ppm
-	rm -f testout_420m_islow_15_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
-	./djpeg -dct int -scale 13/8 -nosmooth -ppm -outfile testout_420m_islow_13_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_13_8) testout_420m_islow_13_8.ppm
-	rm -f testout_420m_islow_13_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
-	./djpeg -dct int -scale 11/8 -nosmooth -ppm -outfile testout_420m_islow_11_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_11_8) testout_420m_islow_11_8.ppm
-	rm -f testout_420m_islow_11_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
-	./djpeg -dct int -scale 9/8 -nosmooth -ppm -outfile testout_420m_islow_9_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_9_8) testout_420m_islow_9_8.ppm
-	rm -f testout_420m_islow_9_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow  ENT: huff
-	./djpeg -dct int -scale 7/8 -nosmooth -ppm -outfile testout_420m_islow_7_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_7_8) testout_420m_islow_7_8.ppm
-	rm -f testout_420m_islow_7_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow  ENT: huff
-	./djpeg -dct int -scale 3/4 -nosmooth -ppm -outfile testout_420m_islow_3_4.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_4) testout_420m_islow_3_4.ppm
-	rm -f testout_420m_islow_3_4.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow  ENT: huff
-	./djpeg -dct int -scale 5/8 -nosmooth -ppm -outfile testout_420m_islow_5_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_5_8) testout_420m_islow_5_8.ppm
-	rm -f testout_420m_islow_5_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow  ENT: huff
-	./djpeg -dct int -scale 1/2 -nosmooth -ppm -outfile testout_420m_islow_1_2.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_2) testout_420m_islow_1_2.ppm
-	rm -f testout_420m_islow_1_2.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow  ENT: huff
-	./djpeg -dct int -scale 3/8 -nosmooth -ppm -outfile testout_420m_islow_3_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_3_8) testout_420m_islow_3_8.ppm
-	rm -f testout_420m_islow_3_8.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow  ENT: huff
-	./djpeg -dct int -scale 1/4 -nosmooth -ppm -outfile testout_420m_islow_1_4.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_4) testout_420m_islow_1_4.ppm
-	rm -f testout_420m_islow_1_4.ppm
-# CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow  ENT: huff
-	./djpeg -dct int -scale 1/8 -nosmooth -ppm -outfile testout_420m_islow_1_8.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420M_ISLOW_1_8) testout_420m_islow_1_8.ppm
-	rm -f testout_420m_islow_1_8.ppm
-if WITH_12BIT
-else
-# CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-	./djpeg -dct int -colors 256 -bmp -outfile testout_420_islow_256.bmp $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_BMP_420_ISLOW_256) testout_420_islow_256.bmp
-	rm -f testout_420_islow_256.bmp
-# CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -dither none -bmp -outfile testout_420_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_BMP_420_ISLOW_565) testout_420_islow_565.bmp
-	rm -f testout_420_islow_565.bmp
-# CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-	./djpeg -dct int -rgb565 -bmp -outfile testout_420_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_BMP_420_ISLOW_565D) testout_420_islow_565D.bmp
-	rm -f testout_420_islow_565D.bmp
-# CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-	./djpeg -dct int -nosmooth -rgb565 -dither none -bmp -outfile testout_420m_islow_565.bmp $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_BMP_420M_ISLOW_565) testout_420m_islow_565.bmp
-	rm -f testout_420m_islow_565.bmp
-# CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-	./djpeg -dct int -nosmooth -rgb565 -bmp -outfile testout_420m_islow_565D.bmp $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_BMP_420M_ISLOW_565D) testout_420m_islow_565D.bmp
-	rm -f testout_420m_islow_565D.bmp
-endif
-
-# Partial decode tests.  These tests are designed to cover all of the possible
-# code paths in jpeg_skip_scanlines().
-
-# Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
-	./djpeg -dct int -skip 15,31 -ppm -outfile testout_420_islow_skip15,31.ppm $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_PPM_420_ISLOW_SKIP15_31) testout_420_islow_skip15,31.ppm
-	rm -f testout_420_islow_skip15,31.ppm
-# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
-if WITH_ARITH_DEC
-	./djpeg -dct int -skip 16,139 -ppm -outfile testout_420_islow_ari_skip16,139.ppm $(srcdir)/testimages/testimgari.jpg
-	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_SKIP16_139) testout_420_islow_ari_skip16,139.ppm
-	rm -f testout_420_islow_ari_skip16,139.ppm
-endif
-# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
-	./cjpeg -dct int -prog -outfile testout_420_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
-	./djpeg -dct int -crop 62x62+71+71 -ppm -outfile testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
-	md5/md5cmp $(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71) testout_420_islow_prog_crop62x62,71,71.ppm
-	rm -f testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
-# Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
-if WITH_ARITH_DEC
-	./djpeg -dct int -crop 53x53+4+4 -ppm -outfile testout_420_islow_ari_crop53x53,4,4.ppm $(srcdir)/testimages/testimgari.jpg
-	md5/md5cmp $(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4) testout_420_islow_ari_crop53x53,4,4.ppm
-	rm -f testout_420_islow_ari_crop53x53,4,4.ppm
-endif
-# Context rows: No   Intra-iMCU row: Yes  ENT: huff
-	./cjpeg -dct int -sample 1x1 -outfile testout_444_islow.jpg $(srcdir)/testimages/testorig.ppm
-	./djpeg -dct int -skip 1,6 -ppm -outfile testout_444_islow_skip1,6.ppm testout_444_islow.jpg
-	md5/md5cmp $(MD5_PPM_444_ISLOW_SKIP1_6) testout_444_islow_skip1,6.ppm
-	rm -f testout_444_islow_skip1,6.ppm testout_444_islow.jpg
-# Context rows: No   Intra-iMCU row: No   ENT: prog huff
-	./cjpeg -dct int -prog -sample 1x1 -outfile testout_444_islow_prog.jpg $(srcdir)/testimages/testorig.ppm
-	./djpeg -dct int -crop 98x98+13+13 -ppm -outfile testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
-	md5/md5cmp $(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13) testout_444_islow_prog_crop98x98,13,13.ppm
-	rm -f testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
-# Context rows: No   Intra-iMCU row: No   ENT: arith
-if WITH_ARITH_ENC
-	./cjpeg -dct int -arithmetic -sample 1x1 -outfile testout_444_islow_ari.jpg $(srcdir)/testimages/testorig.ppm
-if WITH_ARITH_DEC
-	./djpeg -dct int -crop 37x37+0+0 -ppm -outfile testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
-	md5/md5cmp $(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0) testout_444_islow_ari_crop37x37,0,0.ppm
-	rm -f testout_444_islow_ari_crop37x37,0,0.ppm
-endif
-	rm -f testout_444_islow_ari.jpg
-endif
-
-	./jpegtran -crop 120x90+20+50 -transpose -perfect -outfile testout_crop.jpg $(srcdir)/testimages/$(TESTORIG)
-	md5/md5cmp $(MD5_JPEG_CROP) testout_crop.jpg
-	rm -f testout_crop.jpg
-	echo GREAT SUCCESS!
-
-
-testclean:
-	rm -f testout*
-	rm -f *_GRAY_*.bmp
-	rm -f *_GRAY_*.png
-	rm -f *_GRAY_*.ppm
-	rm -f *_GRAY_*.jpg
-	rm -f *_GRAY.yuv
-	rm -f *_420_*.bmp
-	rm -f *_420_*.png
-	rm -f *_420_*.ppm
-	rm -f *_420_*.jpg
-	rm -f *_420.yuv
-	rm -f *_422_*.bmp
-	rm -f *_422_*.png
-	rm -f *_422_*.ppm
-	rm -f *_422_*.jpg
-	rm -f *_422.yuv
-	rm -f *_444_*.bmp
-	rm -f *_444_*.png
-	rm -f *_444_*.ppm
-	rm -f *_444_*.jpg
-	rm -f *_444.yuv
-	rm -f *_440_*.bmp
-	rm -f *_440_*.png
-	rm -f *_440_*.ppm
-	rm -f *_440_*.jpg
-	rm -f *_440.yuv
-	rm -f *_411_*.bmp
-	rm -f *_411_*.png
-	rm -f *_411_*.ppm
-	rm -f *_411_*.jpg
-	rm -f *_411.yuv
-	rm -f tjbenchtest*.log
-	rm -f tjexampletest*.log
-
-
-tjtest:
-	sh ./tjbenchtest
-	sh ./tjbenchtest -alloc
-	sh ./tjbenchtest -yuv
-	sh ./tjbenchtest -yuv -alloc
-if WITH_JAVA
-	sh ./tjbenchtest.java
-	sh ./tjbenchtest.java -yuv
-endif
-
-
-pkgscripts/libjpeg-turbo.spec: pkgscripts/libjpeg-turbo.spec.tmpl
-	cat pkgscripts/libjpeg-turbo.spec.tmpl | sed s@%{__prefix}@$(prefix)@g | \
-		sed s@%{__bindir}@$(bindir)@g | sed s@%{__datadir}@$(datadir)@g | \
-		sed s@%{__docdir}@$(docdir)@g | sed s@%{__includedir}@$(includedir)@g | \
-		sed s@%{__libdir}@$(libdir)@g | sed s@%{__mandir}@$(mandir)@g \
-		> pkgscripts/libjpeg-turbo.spec
-
-rpm: all pkgscripts/libjpeg-turbo.spec
-	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
-	mkdir -p $$TMPDIR/RPMS; \
-	ln -fs `pwd` $$TMPDIR/BUILD; \
-	rm -f ${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
-	rpmbuild -bb --define "_blddir $$TMPDIR/buildroot"  \
-		--define "_topdir $$TMPDIR" \
-		--target ${RPMARCH} pkgscripts/libjpeg-turbo.spec; \
-	cp $$TMPDIR/RPMS/${RPMARCH}/${PKGNAME}-${VERSION}-${BUILD}.${RPMARCH}.rpm \
-		${PKGNAME}-${VERSION}.${RPMARCH}.rpm; \
-	rm -rf $$TMPDIR
-
-srpm: dist-gzip pkgscripts/libjpeg-turbo.spec
-	TMPDIR=`mktemp -d /tmp/${PACKAGE_NAME}-build.XXXXXX`; \
-	mkdir -p $$TMPDIR/RPMS; \
-	mkdir -p $$TMPDIR/SRPMS; \
-	mkdir -p $$TMPDIR/BUILD; \
-	mkdir -p $$TMPDIR/SOURCES; \
-	mkdir -p $$TMPDIR/SPECS; \
-	rm -f ${PKGNAME}-${VERSION}.src.rpm; \
-	cp ${PACKAGE_NAME}-${VERSION}.tar.gz $$TMPDIR/SOURCES; \
-	cat pkgscripts/libjpeg-turbo.spec | sed s/%{_blddir}/%{_tmppath}/g \
-		| sed s/#--\>//g \
-		> $$TMPDIR/SPECS/libjpeg-turbo.spec; \
-	rpmbuild -bs --define "_topdir $$TMPDIR" $$TMPDIR/SPECS/libjpeg-turbo.spec; \
-	cp $$TMPDIR/SRPMS/${PKGNAME}-${VERSION}-${BUILD}.src.rpm \
-		${PKGNAME}-${VERSION}.src.rpm; \
-	rm -rf $$TMPDIR
-
-pkgscripts/makedpkg: pkgscripts/makedpkg.tmpl
-	cat pkgscripts/makedpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
-		sed s@%{__docdir}@$(docdir)@g | sed s@%{__libdir}@$(libdir)@g \
-		> pkgscripts/makedpkg
-
-deb: all pkgscripts/makedpkg
-	sh pkgscripts/makedpkg
-
-pkgscripts/uninstall: pkgscripts/uninstall.tmpl
-	cat pkgscripts/uninstall.tmpl | sed s@%{__prefix}@$(prefix)@g | \
-		sed s@%{__bindir}@$(bindir)@g | sed s@%{__datadir}@$(datadir)@g | \
-		sed s@%{__includedir}@$(includedir)@g | sed s@%{__libdir}@$(libdir)@g | \
-		sed s@%{__mandir}@$(mandir)@g > pkgscripts/uninstall
-
-pkgscripts/makemacpkg: pkgscripts/makemacpkg.tmpl
-	cat pkgscripts/makemacpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
-		sed s@%{__bindir}@$(bindir)@g | sed s@%{__docdir}@$(docdir)@g | \
-		sed s@%{__libdir}@$(libdir)@g > pkgscripts/makemacpkg
-
-if X86_64
-
-udmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32}
-
-iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg -build32 ${BUILDDIR32} -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
-
-else
-
-iosdmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg -buildarmv7 ${BUILDDIRARMV7} -buildarmv7s ${BUILDDIRARMV7S} -buildarmv8 ${BUILDDIRARMV8} -lipo "${LIPO}"
-
-endif
-
-dmg: all pkgscripts/makemacpkg pkgscripts/uninstall
-	sh pkgscripts/makemacpkg
-
-pkgscripts/makecygwinpkg: pkgscripts/makecygwinpkg.tmpl
-	cat pkgscripts/makecygwinpkg.tmpl | sed s@%{__prefix}@$(prefix)@g | \
-		sed s@%{__docdir}@$(docdir)@g | sed s@%{__libdir}@$(libdir)@g \
-		> pkgscripts/makecygwinpkg
-
-cygwinpkg: all pkgscripts/makecygwinpkg
-	sh pkgscripts/makecygwinpkg
diff --git a/README.ijg b/README.ijg
index 9c450ce..ee9fb67 100644
--- a/README.ijg
+++ b/README.ijg
@@ -43,7 +43,7 @@
   change.log        Version-to-version change highlights.
 Programmer and internal documentation:
   libjpeg.txt       How to use the JPEG library in your own programs.
-  example.c         Sample code for calling the JPEG library.
+  example.txt       Sample code for calling the JPEG library.
   structure.txt     Overview of the JPEG library's internal structure.
   coderules.txt     Coding style rules --- please read if you contribute code.
 
@@ -159,12 +159,6 @@
 assumed by the product vendor.
 
 
-The Unix configuration script "configure" was produced with GNU Autoconf.
-It is copyright by the Free Software Foundation but is freely distributable.
-The same holds for its supporting scripts (config.guess, config.sub,
-ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
-but is also freely distributable.
-
 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
 support has been removed altogether, and the GIF writer has been simplified
@@ -185,8 +179,8 @@
 understand the innards of the JPEG software.
 
 The best short technical introduction to the JPEG compression algorithm is
-	Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
-	Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+        Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+        Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PDF file containing a revised version of Wallace's article is
@@ -255,8 +249,8 @@
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
 If you don't have Web or FTP access, send e-mail to mail-server@rtfm.mit.edu
 with body
-	send usenet/news.answers/jpeg-faq/part1
-	send usenet/news.answers/jpeg-faq/part2
+        send usenet/news.answers/jpeg-faq/part1
+        send usenet/news.answers/jpeg-faq/part2
 
 
 FILE FORMAT WARS
diff --git a/README.md b/README.md
index 74e6eac..80ba6c1 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 ==========
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
-x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
+on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
 generally 2-6x as fast as libjpeg, all else being equal.  On other types of
 systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
 virtue of its highly-optimized Huffman coding routines.  In many cases, the
@@ -48,7 +48,9 @@
   straightforward to achieve using the underlying libjpeg API, such as
   generating planar YUV images and performing multiple simultaneous lossless
   transforms on an image.  The Java interface for libjpeg-turbo is written on
-  top of the TurboJPEG API.
+  top of the TurboJPEG API.  The TurboJPEG API is recommended for first-time
+  users of libjpeg-turbo.  Refer to [tjexample.c](tjexample.c) and
+  [TJExample.java](java/TJExample.java) for examples of its usage.
 
 - **libjpeg API**<br>
   This is the de facto industry-standard API for compressing and decompressing
diff --git a/acinclude.m4 b/acinclude.m4
deleted file mode 100644
index 113169f..0000000
--- a/acinclude.m4
+++ /dev/null
@@ -1,287 +0,0 @@
-# AC_PROG_NASM
-# --------------------------
-# Check that NASM exists and determine flags
-AC_DEFUN([AC_PROG_NASM],[
-
-AC_ARG_VAR(NASM, [NASM command (used to build the x86/x86-64 SIMD code)])
-if test "x$NASM" = "x"; then
-  AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
-  test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
-fi
-
-AC_MSG_CHECKING([for object file format of host system])
-case "$host_os" in
-  cygwin* | mingw* | pw32* | interix*)
-    case "$host_cpu" in
-      x86_64)
-        objfmt='Win64-COFF'
-        ;;
-      *)
-        objfmt='Win32-COFF'
-        ;;
-    esac
-  ;;
-  msdosdjgpp* | go32*)
-    objfmt='COFF'
-  ;;
-  os2-emx*)			# not tested
-    objfmt='MSOMF'		# obj
-  ;;
-  linux*coff* | linux*oldld*)
-    objfmt='COFF'		# ???
-  ;;
-  linux*aout*)
-    objfmt='a.out'
-  ;;
-  linux*)
-    case "$host_cpu" in
-      x86_64)
-        objfmt='ELF64'
-        ;;
-      *)
-        objfmt='ELF'
-        ;;
-    esac
-  ;;
-  kfreebsd* | freebsd* | netbsd* | openbsd*)
-    if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
-      objfmt='BSD-a.out'
-    else
-      case "$host_cpu" in
-        x86_64 | amd64)
-          objfmt='ELF64'
-          ;;
-        *)
-          objfmt='ELF'
-          ;;
-      esac
-    fi
-  ;;
-  solaris* | sunos* | sysv* | sco*)
-    case "$host_cpu" in
-      x86_64)
-        objfmt='ELF64'
-        ;;
-      *)
-        objfmt='ELF'
-        ;;
-    esac
-  ;;
-  darwin* | rhapsody* | nextstep* | openstep* | macos*)
-    case "$host_cpu" in
-      x86_64)
-        objfmt='Mach-O64'
-        ;;
-      *)
-        objfmt='Mach-O'
-        ;;
-    esac
-  ;;
-  *)
-    objfmt='ELF ?'
-  ;;
-esac
-
-AC_MSG_RESULT([$objfmt])
-if test "$objfmt" = 'ELF ?'; then
-  objfmt='ELF'
-  AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
-fi
-
-AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
-case "$objfmt" in
-  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
-  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
-  Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';;
-  COFF)       NAFLAGS='-fcoff -DCOFF';;
-  a.out)      NAFLAGS='-faout -DAOUT';;
-  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
-  ELF)        NAFLAGS='-felf -DELF';;
-  ELF64)      NAFLAGS='-felf64 -DELF -D__x86_64__';;
-  RDF)        NAFLAGS='-frdf -DRDF';;
-  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
-  Mach-O64)   NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
-esac
-AC_MSG_RESULT([$NAFLAGS])
-AC_SUBST([NAFLAGS])
-
-AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
-cat > conftest.asm <<EOF
-[%line __oline__ "configure"
-        section .text
-        global  _main,main
-_main:
-main:   xor     eax,eax
-        ret
-]EOF
-try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
-if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
-  AC_MSG_RESULT(yes)
-else
-  echo "configure: failed program was:" >&AC_FD_CC
-  cat conftest.asm >&AC_FD_CC
-  rm -rf conftest*
-  AC_MSG_RESULT(no)
-  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
-fi
-
-AC_MSG_CHECKING([whether the linker accepts assembler output])
-try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
-if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
-  rm -rf conftest*
-  AC_MSG_RESULT(yes)
-else
-  rm -rf conftest*
-  AC_MSG_RESULT(no)
-  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
-fi
-
-])
-
-# AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE
-# --------------------------
-# Test whether the assembler is suitable and supports NEON instructions
-AC_DEFUN([AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE],[
-  ac_good_gnu_arm_assembler=no
-  ac_save_CC="$CC"
-  ac_save_CFLAGS="$CFLAGS"
-  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
-  CC="$CCAS"
-  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-    .text
-    .fpu neon
-    .arch armv7a
-    .object_arch armv4
-    .arm
-    pld [r0]
-    vmovn.u16 d0, q0]])], ac_good_gnu_arm_assembler=yes)
-
-  ac_use_gas_preprocessor=no
-  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
-    CC="gas-preprocessor.pl $CCAS"
-    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-      .text
-      .fpu neon
-      .arch armv7a
-      .object_arch armv4
-      .arm
-      pld [r0]
-      vmovn.u16 d0, q0]])], ac_use_gas_preprocessor=yes)
-  fi
-  CFLAGS="$ac_save_CFLAGS"
-  CC="$ac_save_CC"
-
-  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
-    CCAS="gas-preprocessor.pl $CCAS"
-    AC_SUBST([CCAS])
-    ac_good_gnu_arm_assembler=yes
-  fi
-
-  if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
-    $1
-  else
-    $2
-  fi
-])
-
-# AC_CHECK_COMPATIBLE_MIPSEL_ASSEMBLER_IFELSE
-# --------------------------
-# Test whether the assembler is suitable and supports MIPS instructions
-AC_DEFUN([AC_CHECK_COMPATIBLE_MIPS_ASSEMBLER_IFELSE],[
-  have_mips_dspr2=no
-  ac_save_CFLAGS="$CFLAGS"
-  CFLAGS="$CCASFLAGS -mdspr2"
-
-  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-
-  int main ()
-  {
-    int c = 0, a = 0, b = 0;
-    __asm__ __volatile__ (
-        "precr.qb.ph %[c], %[a], %[b]          \n\t"
-        : [c] "=r" (c)
-        : [a] "r" (a), [b] "r" (b)
-    );
-    return c;
-  }
-  ]])], have_mips_dspr2=yes)
-  CFLAGS=$ac_save_CFLAGS
-
-  if test "x$have_mips_dspr2" = "xyes" ; then
-    $1
-  else
-    $2
-  fi
-])
-
-AC_DEFUN([AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE],[
-  ac_good_gnu_arm_assembler=no
-  ac_save_CC="$CC"
-  ac_save_CFLAGS="$CFLAGS"
-  CFLAGS="$CCASFLAGS -x assembler-with-cpp"
-  CC="$CCAS"
-  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-    .text
-    MYVAR .req x0
-    movi v0.16b, #100
-    mov MYVAR, #100
-    .unreq MYVAR]])], ac_good_gnu_arm_assembler=yes)
-
-  ac_use_gas_preprocessor=no
-  if test "x$ac_good_gnu_arm_assembler" = "xno" ; then
-    CC="gas-preprocessor.pl $CCAS"
-    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-      .text
-      MYVAR .req x0
-      movi v0.16b, #100
-      mov MYVAR, #100
-      .unreq MYVAR]])], ac_use_gas_preprocessor=yes)
-  fi
-  CFLAGS="$ac_save_CFLAGS"
-  CC="$ac_save_CC"
-
-  if test "x$ac_use_gas_preprocessor" = "xyes" ; then
-    CCAS="gas-preprocessor.pl $CCAS"
-    AC_SUBST([CCAS])
-    ac_good_gnu_arm_assembler=yes
-  fi
-
-  if test "x$ac_good_gnu_arm_assembler" = "xyes" ; then
-    $1
-  else
-    $2
-  fi
-])
-
-# AC_CHECK_ALTIVEC
-# ----------------
-# Test whether AltiVec intrinsics are supported
-AC_DEFUN([AC_CHECK_ALTIVEC],[
-  ac_save_CFLAGS="$CFLAGS"
-  CFLAGS="$CFLAGS -maltivec"
-  AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
-    #include <altivec.h>
-    int main(void) {
-      __vector int vi = { 0, 0, 0, 0 };
-      int i[4];
-      vec_st(vi, 0, i);
-      return i[0];
-    }]])], ac_has_altivec=yes)
-  CFLAGS="$ac_save_CFLAGS"
-  if test "x$ac_has_altivec" = "xyes" ; then
-    $1
-  else
-    $2
-  fi
-])
-
-AC_DEFUN([AC_NO_SIMD],[
-  AC_MSG_RESULT([no ("$1")])
-  with_simd=no;
-  if test "x${require_simd}" = "xyes"; then
-    AC_MSG_ERROR([SIMD support not available for this CPU.])
-  else
-    AC_MSG_WARN([SIMD support not available for this CPU.  Performance will\
- suffer.])
-  fi
-])
diff --git a/appveyor.yml b/appveyor.yml
index 4f2d6cc..0acc329 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,11 +16,11 @@
 
       set MSYSTEM=MINGW32
 
-      bash -c "pacman --noconfirm -S autoconf automake libtool zip"
+      bash -c "pacman --noconfirm -S zip"
 
       mklink /d "%ProgramData%\Oracle\Java32" "c:\Program Files (x86)\Java\jdk1.6.0"
 
-      git clone --depth=1 https://github.com/libjpeg-turbo/buildscripts.git c:/buildscripts
+      git clone --depth=1 https://github.com/libjpeg-turbo/buildscripts.git -b %APPVEYOR_REPO_BRANCH% c:/buildscripts
 
 build_script:
   - cmd: >-
diff --git a/bmp.c b/bmp.c
deleted file mode 100644
index 2b8e80c..0000000
--- a/bmp.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Copyright (C)2011, 2015 D. R. Commander.  All Rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * - Redistributions of source code must retain the above copyright notice,
- *   this list of conditions and the following disclaimer.
- * - Redistributions in binary form must reproduce the above copyright notice,
- *   this list of conditions and the following disclaimer in the documentation
- *   and/or other materials provided with the distribution.
- * - Neither the name of the libjpeg-turbo Project nor the names of its
- *   contributors may be used to endorse or promote products derived from this
- *   software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <setjmp.h>
-#include <errno.h>
-#include "cdjpeg.h"
-#include <jpeglib.h>
-#include <jpegint.h>
-#include "tjutil.h"
-#include "bmp.h"
-
-
-/* This duplicates the functionality of the VirtualGL bitmap library using
-   the components from cjpeg and djpeg */
-
-
-/* Error handling (based on example in example.c) */
-
-static char errStr[JMSG_LENGTH_MAX]="No error";
-
-struct my_error_mgr
-{
-	struct jpeg_error_mgr pub;
-	jmp_buf setjmp_buffer;
-};
-typedef struct my_error_mgr *my_error_ptr;
-
-static void my_error_exit(j_common_ptr cinfo)
-{
-	my_error_ptr myerr=(my_error_ptr)cinfo->err;
-	(*cinfo->err->output_message)(cinfo);
-	longjmp(myerr->setjmp_buffer, 1);
-}
-
-/* Based on output_message() in jerror.c */
-
-static void my_output_message(j_common_ptr cinfo)
-{
-	(*cinfo->err->format_message)(cinfo, errStr);
-}
-
-#define _throw(m) {snprintf(errStr, JMSG_LENGTH_MAX, "%s", m);  \
-	retval=-1;  goto bailout;}
-#define _throwunix(m) {snprintf(errStr, JMSG_LENGTH_MAX, "%s\n%s", m,  \
-	strerror(errno));  retval=-1;  goto bailout;}
-
-
-static void pixelconvert(unsigned char *srcbuf, int srcpf, int srcbottomup,
-	unsigned char *dstbuf, int dstpf, int dstbottomup, int w, int h)
-{
-	unsigned char *srcrowptr=srcbuf, *srccolptr;
-	int srcps=tjPixelSize[srcpf];
-	int srcstride=srcbottomup? -w*srcps:w*srcps;
-	unsigned char *dstrowptr=dstbuf, *dstcolptr;
-	int dstps=tjPixelSize[dstpf];
-	int dststride=dstbottomup? -w*dstps:w*dstps;
-	int row, col;
-
-	if(srcbottomup) srcrowptr=&srcbuf[w*srcps*(h-1)];
-	if(dstbottomup) dstrowptr=&dstbuf[w*dstps*(h-1)];
-
-	/* NOTE: These quick & dirty CMYK<->RGB conversion routines are for testing
-	   purposes only.  Properly converting between CMYK and RGB requires a color
-	   management system. */
-
-	if(dstpf==TJPF_CMYK)
-	{
-		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
-		{
-			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
-				col<w; col++, srccolptr+=srcps)
-			{
-				double c=1.0-((double)(srccolptr[tjRedOffset[srcpf]])/255.);
-				double m=1.0-((double)(srccolptr[tjGreenOffset[srcpf]])/255.);
-				double y=1.0-((double)(srccolptr[tjBlueOffset[srcpf]])/255.);
-				double k=min(min(c,m),min(y,1.0));
-				if(k==1.0) c=m=y=0.0;
-				else
-				{
-					c=(c-k)/(1.0-k);
-					m=(m-k)/(1.0-k);
-					y=(y-k)/(1.0-k);
-				}
-				if(c>1.0) c=1.0;
-				if(c<0.) c=0.;
-				if(m>1.0) m=1.0;
-				if(m<0.) m=0.;
-				if(y>1.0) y=1.0;
-				if(y<0.) y=0.;
-				if(k>1.0) k=1.0;
-				if(k<0.) k=0.;
-				*dstcolptr++=(unsigned char)(255.0-c*255.0+0.5);
-				*dstcolptr++=(unsigned char)(255.0-m*255.0+0.5);
-				*dstcolptr++=(unsigned char)(255.0-y*255.0+0.5);
-				*dstcolptr++=(unsigned char)(255.0-k*255.0+0.5);
-			}
-		}
-	}
-	else if(srcpf==TJPF_CMYK)
-	{
-		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
-		{
-			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
-				col<w; col++, dstcolptr+=dstps)
-			{
-				double c=(double)(*srccolptr++);
-				double m=(double)(*srccolptr++);
-				double y=(double)(*srccolptr++);
-				double k=(double)(*srccolptr++);
-				double r=c*k/255.;
-				double g=m*k/255.;
-				double b=y*k/255.;
-				if(r>255.0) r=255.0;
-				if(r<0.) r=0.;
-				if(g>255.0) g=255.0;
-				if(g<0.) g=0.;
-				if(b>255.0) b=255.0;
-				if(b<0.) b=0.;
-				dstcolptr[tjRedOffset[dstpf]]=(unsigned char)(r+0.5);
-				dstcolptr[tjGreenOffset[dstpf]]=(unsigned char)(g+0.5);
-				dstcolptr[tjBlueOffset[dstpf]]=(unsigned char)(b+0.5);
-			}
-		}
-	}
-	else
-	{
-		for(row=0; row<h; row++, srcrowptr+=srcstride, dstrowptr+=dststride)
-		{
-			for(col=0, srccolptr=srcrowptr, dstcolptr=dstrowptr;
-				col<w; col++, srccolptr+=srcps, dstcolptr+=dstps)
-			{
-				dstcolptr[tjRedOffset[dstpf]]=srccolptr[tjRedOffset[srcpf]];
-				dstcolptr[tjGreenOffset[dstpf]]=srccolptr[tjGreenOffset[srcpf]];
-				dstcolptr[tjBlueOffset[dstpf]]=srccolptr[tjBlueOffset[srcpf]];
-			}
-		}
-	}
-}
-
-
-int loadbmp(char *filename, unsigned char **buf, int *w, int *h,
-	int dstpf, int bottomup)
-{
-	int retval=0, dstps, srcpf, tempc;
-	struct jpeg_compress_struct cinfo;
-	struct my_error_mgr jerr;
-	cjpeg_source_ptr src;
-	FILE *file=NULL;
-
-	memset(&cinfo, 0, sizeof(struct jpeg_compress_struct));
-
-	if(!filename || !buf || !w || !h || dstpf<0 || dstpf>=TJ_NUMPF)
-		_throw("loadbmp(): Invalid argument");
-
-	if((file=fopen(filename, "rb"))==NULL)
-		_throwunix("loadbmp(): Cannot open input file");
-
-	cinfo.err=jpeg_std_error(&jerr.pub);
-	jerr.pub.error_exit=my_error_exit;
-	jerr.pub.output_message=my_output_message;
-
-	if(setjmp(jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
-
-	jpeg_create_compress(&cinfo);
-	if((tempc=getc(file))<0 || ungetc(tempc, file)==EOF)
-		_throwunix("loadbmp(): Could not read input file")
-	else if(tempc==EOF) _throw("loadbmp(): Input file contains no data");
-
-	if(tempc=='B')
-	{
-		if((src=jinit_read_bmp(&cinfo))==NULL)
-			_throw("loadbmp(): Could not initialize bitmap loader");
-	}
-	else if(tempc=='P')
-	{
-		if((src=jinit_read_ppm(&cinfo))==NULL)
-			_throw("loadbmp(): Could not initialize bitmap loader");
-	}
-	else _throw("loadbmp(): Unsupported file type");
-
-	src->input_file=file;
-	(*src->start_input)(&cinfo, src);
-	(*cinfo.mem->realize_virt_arrays)((j_common_ptr)&cinfo);
-
-	*w=cinfo.image_width;  *h=cinfo.image_height;
-
-	if(cinfo.input_components==1 && cinfo.in_color_space==JCS_RGB)
-		srcpf=TJPF_GRAY;
-	else srcpf=TJPF_RGB;
-
-	dstps=tjPixelSize[dstpf];
-	if((*buf=(unsigned char *)malloc((*w)*(*h)*dstps))==NULL)
-		_throw("loadbmp(): Memory allocation failure");
-
-	while(cinfo.next_scanline<cinfo.image_height)
-	{
-		int i, nlines=(*src->get_pixel_rows)(&cinfo, src);
-		for(i=0; i<nlines; i++)
-		{
-			unsigned char *outbuf;  int row;
-			row=cinfo.next_scanline+i;
-			if(bottomup) outbuf=&(*buf)[((*h)-row-1)*(*w)*dstps];
-			else outbuf=&(*buf)[row*(*w)*dstps];
-			pixelconvert(src->buffer[i], srcpf, 0, outbuf, dstpf, bottomup, *w,
-				nlines);
-		}
-		cinfo.next_scanline+=nlines;
-	}
-
-	(*src->finish_input)(&cinfo, src);
-
-	bailout:
-	jpeg_destroy_compress(&cinfo);
-	if(file) fclose(file);
-	if(retval<0 && buf && *buf) {free(*buf);  *buf=NULL;}
-	return retval;
-}
-
-
-int savebmp(char *filename, unsigned char *buf, int w, int h, int srcpf,
-	int bottomup)
-{
-	int retval=0, srcps, dstpf;
-	struct jpeg_decompress_struct dinfo;
-	struct my_error_mgr jerr;
-	djpeg_dest_ptr dst;
-	FILE *file=NULL;
-	char *ptr=NULL;
-
-	memset(&dinfo, 0, sizeof(struct jpeg_decompress_struct));
-
-	if(!filename || !buf || w<1 || h<1 || srcpf<0 || srcpf>=TJ_NUMPF)
-		_throw("savebmp(): Invalid argument");
-
-	if((file=fopen(filename, "wb"))==NULL)
-		_throwunix("savebmp(): Cannot open output file");
-
-	dinfo.err=jpeg_std_error(&jerr.pub);
-	jerr.pub.error_exit=my_error_exit;
-	jerr.pub.output_message=my_output_message;
-
-	if(setjmp(jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
-
-	jpeg_create_decompress(&dinfo);
-	if(srcpf==TJPF_GRAY)
-	{
-		dinfo.out_color_components=dinfo.output_components=1;
-		dinfo.out_color_space=JCS_GRAYSCALE;
-	}
-	else
-	{
-		dinfo.out_color_components=dinfo.output_components=3;
-		dinfo.out_color_space=JCS_RGB;
-	}
-	dinfo.image_width=w;  dinfo.image_height=h;
-	dinfo.global_state=DSTATE_READY;
-	dinfo.scale_num=dinfo.scale_denom=1;
-
-	ptr=strrchr(filename, '.');
-	if(ptr && !strcasecmp(ptr, ".bmp"))
-	{
-		if((dst=jinit_write_bmp(&dinfo, 0))==NULL)
-			_throw("savebmp(): Could not initialize bitmap writer");
-	}
-	else
-	{
-		if((dst=jinit_write_ppm(&dinfo))==NULL)
-			_throw("savebmp(): Could not initialize PPM writer");
-	}
-
-	dst->output_file=file;
-	(*dst->start_output)(&dinfo, dst);
-	(*dinfo.mem->realize_virt_arrays)((j_common_ptr)&dinfo);
-
-	if(srcpf==TJPF_GRAY) dstpf=srcpf;
-	else dstpf=TJPF_RGB;
-	srcps=tjPixelSize[srcpf];
-
-	while(dinfo.output_scanline<dinfo.output_height)
-	{
-		int i, nlines=dst->buffer_height;
-		for(i=0; i<nlines; i++)
-		{
-			unsigned char *inbuf;  int row;
-			row=dinfo.output_scanline+i;
-			if(bottomup) inbuf=&buf[(h-row-1)*w*srcps];
-			else inbuf=&buf[row*w*srcps];
-			pixelconvert(inbuf, srcpf, bottomup, dst->buffer[i], dstpf, 0, w,
-				nlines);
-		}
-		(*dst->put_pixel_rows)(&dinfo, dst, nlines);
-		dinfo.output_scanline+=nlines;
-	}
-
-	(*dst->finish_output)(&dinfo, dst);
-
-	bailout:
-	jpeg_destroy_decompress(&dinfo);
-	if(file) fclose(file);
-	return retval;
-}
-
-const char *bmpgeterr(void)
-{
-	return errStr;
-}
diff --git a/bmp.h b/bmp.h
deleted file mode 100644
index c50c260..0000000
--- a/bmp.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * - Redistributions of source code must retain the above copyright notice,
- *   this list of conditions and the following disclaimer.
- * - Redistributions in binary form must reproduce the above copyright notice,
- *   this list of conditions and the following disclaimer in the documentation
- *   and/or other materials provided with the distribution.
- * - Neither the name of the libjpeg-turbo Project nor the names of its
- *   contributors may be used to endorse or promote products derived from this
- *   software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BMP_H__
-#define __BMP_H__
-
-#include "./turbojpeg.h"
-
-int loadbmp(char *filename, unsigned char **buf, int *w, int *h, int pf,
-	int bottomup);
-
-int savebmp(char *filename, unsigned char *buf, int w, int h, int pf,
-	int bottomup);
-
-const char *bmpgeterr(void);
-
-#endif
diff --git a/cderror.h b/cderror.h
index 63de498..37034a9 100644
--- a/cderror.h
+++ b/cderror.h
@@ -26,7 +26,7 @@
 #define JMAKE_ENUM_LIST
 #else
 /* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
-#define JMESSAGE(code,string)
+#define JMESSAGE(code, string)
 #endif /* CDERROR_H */
 #endif /* JMESSAGE */
 
@@ -34,11 +34,11 @@
 
 typedef enum {
 
-#define JMESSAGE(code,string)   code ,
+#define JMESSAGE(code, string)  code,
 
 #endif /* JMAKE_ENUM_LIST */
 
-JMESSAGE(JMSG_FIRSTADDONCODE=1000, NULL) /* Must be first entry! */
+JMESSAGE(JMSG_FIRSTADDONCODE = 1000, NULL) /* Must be first entry! */
 
 #ifdef BMP_SUPPORTED
 JMESSAGE(JERR_BMP_BADCMAP, "Unsupported BMP colormap format")
diff --git a/cdjpeg.c b/cdjpeg.c
index 441d671..e0e382d 100644
--- a/cdjpeg.c
+++ b/cdjpeg.c
@@ -28,11 +28,12 @@
 #ifdef PROGRESS_REPORT
 
 METHODDEF(void)
-progress_monitor (j_common_ptr cinfo)
+progress_monitor(j_common_ptr cinfo)
 {
-  cd_progress_ptr prog = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr prog = (cd_progress_ptr)cinfo->progress;
   int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done = (int) (prog->pub.pass_counter*100L/prog->pub.pass_limit);
+  int percent_done =
+    (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
 
   if (percent_done != prog->percent_done) {
     prog->percent_done = percent_done;
@@ -49,7 +50,7 @@
 
 
 GLOBAL(void)
-start_progress_monitor (j_common_ptr cinfo, cd_progress_ptr progress)
+start_progress_monitor(j_common_ptr cinfo, cd_progress_ptr progress)
 {
   /* Enable progress display, unless trace output is on */
   if (cinfo->err->trace_level == 0) {
@@ -63,7 +64,7 @@
 
 
 GLOBAL(void)
-end_progress_monitor (j_common_ptr cinfo)
+end_progress_monitor(j_common_ptr cinfo)
 {
   /* Clear away progress display */
   if (cinfo->err->trace_level == 0) {
@@ -82,7 +83,7 @@
  */
 
 GLOBAL(boolean)
-keymatch (char *arg, const char *keyword, int minchars)
+keymatch(char *arg, const char *keyword, int minchars)
 {
   register int ca, ck;
   register int nmatched = 0;
@@ -109,9 +110,9 @@
  */
 
 GLOBAL(FILE *)
-read_stdin (void)
+read_stdin(void)
 {
-  FILE * input_file = stdin;
+  FILE *input_file = stdin;
 
 #ifdef USE_SETMODE              /* need to hack file mode? */
   setmode(fileno(stdin), O_BINARY);
@@ -127,9 +128,9 @@
 
 
 GLOBAL(FILE *)
-write_stdout (void)
+write_stdout(void)
 {
-  FILE * output_file = stdout;
+  FILE *output_file = stdout;
 
 #ifdef USE_SETMODE              /* need to hack file mode? */
   setmode(fileno(stdout), O_BINARY);
diff --git a/cdjpeg.h b/cdjpeg.h
index bb49fbf..9868a0b 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -96,41 +96,42 @@
 
 /* Module selection routines for I/O modules. */
 
-EXTERN(cjpeg_source_ptr) jinit_read_bmp (j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_bmp (j_decompress_ptr cinfo,
-                                        boolean is_os2);
-EXTERN(cjpeg_source_ptr) jinit_read_gif (j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_gif (j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_ppm (j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_ppm (j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_rle (j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_rle (j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_targa (j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_targa (j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_bmp(j_compress_ptr cinfo,
+                                        boolean use_inversion_array);
+EXTERN(djpeg_dest_ptr) jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
+                                       boolean use_inversion_array);
+EXTERN(cjpeg_source_ptr) jinit_read_gif(j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_ppm(j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_ppm(j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_rle(j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_rle(j_decompress_ptr cinfo);
+EXTERN(cjpeg_source_ptr) jinit_read_targa(j_compress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_targa(j_decompress_ptr cinfo);
 
 /* cjpeg support routines (in rdswitch.c) */
 
-EXTERN(boolean) read_quant_tables (j_compress_ptr cinfo, char *filename,
-                                   boolean force_baseline);
-EXTERN(boolean) read_scan_script (j_compress_ptr cinfo, char *filename);
-EXTERN(boolean) set_quality_ratings (j_compress_ptr cinfo, char *arg,
-                                     boolean force_baseline);
-EXTERN(boolean) set_quant_slots (j_compress_ptr cinfo, char *arg);
-EXTERN(boolean) set_sample_factors (j_compress_ptr cinfo, char *arg);
+EXTERN(boolean) read_quant_tables(j_compress_ptr cinfo, char *filename,
+                                  boolean force_baseline);
+EXTERN(boolean) read_scan_script(j_compress_ptr cinfo, char *filename);
+EXTERN(boolean) set_quality_ratings(j_compress_ptr cinfo, char *arg,
+                                    boolean force_baseline);
+EXTERN(boolean) set_quant_slots(j_compress_ptr cinfo, char *arg);
+EXTERN(boolean) set_sample_factors(j_compress_ptr cinfo, char *arg);
 
 /* djpeg support routines (in rdcolmap.c) */
 
-EXTERN(void) read_color_map (j_decompress_ptr cinfo, FILE *infile);
+EXTERN(void) read_color_map(j_decompress_ptr cinfo, FILE *infile);
 
 /* common support routines (in cdjpeg.c) */
 
-EXTERN(void) enable_signal_catcher (j_common_ptr cinfo);
-EXTERN(void) start_progress_monitor (j_common_ptr cinfo,
-                                     cd_progress_ptr progress);
-EXTERN(void) end_progress_monitor (j_common_ptr cinfo);
-EXTERN(boolean) keymatch (char *arg, const char *keyword, int minchars);
-EXTERN(FILE *) read_stdin (void);
-EXTERN(FILE *) write_stdout (void);
+EXTERN(void) enable_signal_catcher(j_common_ptr cinfo);
+EXTERN(void) start_progress_monitor(j_common_ptr cinfo,
+                                    cd_progress_ptr progress);
+EXTERN(void) end_progress_monitor(j_common_ptr cinfo);
+EXTERN(boolean) keymatch(char *arg, const char *keyword, int minchars);
+EXTERN(FILE *) read_stdin(void);
+EXTERN(FILE *) write_stdout(void);
 
 /* miscellaneous useful macros */
 
@@ -151,3 +152,7 @@
 #ifndef EXIT_WARNING
 #define EXIT_WARNING  2
 #endif
+
+/* True if cs is JCS_RGB or lies in the range JCS_EXT_RGB..JCS_EXT_ARGB. */
+#define IsExtRGB(cs) \
+  ((cs) == JCS_RGB || ((cs) >= JCS_EXT_RGB && (cs) <= JCS_EXT_ARGB))
diff --git a/cjpeg.1 b/cjpeg.1
index 283fc81..a3e47ba 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -46,7 +46,7 @@
 .B cjpeg
 isn't bright enough to notice whether a BMP file uses only shades of gray.
 By saying
-.BR \-grayscale ,
+.BR \-grayscale ,
 you'll get a smaller JPEG file that takes less time to process.
 .TP
 .B \-rgb
@@ -187,6 +187,9 @@
 roundoff behavior, whereas the integer methods should give the same results on
 all machines.
 .TP
+.BI \-icc " file"
+Embed ICC color management profile contained in the specified file.
+.TP
 .BI \-restart " N"
 Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
 attached to the number.
diff --git a/cjpeg.c b/cjpeg.c
index 9d282b8..07e7db1 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -31,6 +31,11 @@
 #include "jversion.h"           /* for version message */
 #include "jconfigint.h"
 
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
+extern void *malloc(size_t size);
+extern void free(void *ptr);
+#endif
+
 #ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
 #ifdef __MWERKS__
 #include <SIOUX.h>              /* Metrowerks needs this */
@@ -44,7 +49,7 @@
 
 /* Create the add-on message string table. */
 
-#define JMESSAGE(code,string)   string ,
+#define JMESSAGE(code, string)  string,
 
 static const char * const cdjpeg_message_table[] = {
 #include "cderror.h"
@@ -82,7 +87,7 @@
 
 
 LOCAL(cjpeg_source_ptr)
-select_file_type (j_compress_ptr cinfo, FILE *infile)
+select_file_type(j_compress_ptr cinfo, FILE *infile)
 {
   int c;
 
@@ -102,7 +107,7 @@
   switch (c) {
 #ifdef BMP_SUPPORTED
   case 'B':
-    return jinit_read_bmp(cinfo);
+    return jinit_read_bmp(cinfo, TRUE);
 #endif
 #ifdef GIF_SUPPORTED
   case 'G':
@@ -139,12 +144,13 @@
 
 
 static const char *progname;    /* program name for error messages */
+static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memdst;                 /* for -memdst switch */
 
 
 LOCAL(void)
-usage (void)
+usage(void)
 /* complain about bad command line */
 {
   fprintf(stderr, "usage: %s [switches] ", progname);
@@ -184,6 +190,7 @@
   fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
           (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
 #endif
+  fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
 #ifdef INPUT_SMOOTHING_SUPPORTED
   fprintf(stderr, "  -smooth N      Smooth dithered input (N=1..100 is strength)\n");
@@ -208,8 +215,8 @@
 
 
 LOCAL(int)
-parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-                int last_file_arg_seen, boolean for_real)
+parse_switches(j_compress_ptr cinfo, int argc, char **argv,
+               int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -234,6 +241,7 @@
   force_baseline = FALSE;       /* by default, allow 16-bit quantizers */
   simple_progressive = FALSE;
   is_targa = FALSE;
+  icc_filename = NULL;
   outfilename = NULL;
   memdst = FALSE;
   cinfo->err->trace_level = 0;
@@ -284,7 +292,7 @@
       /* On first -d, print version identification */
       static boolean printed_version = FALSE;
 
-      if (! printed_version) {
+      if (!printed_version) {
         fprintf(stderr, "%s version %s (build %s)\n",
                 PACKAGE_NAME, VERSION, BUILD);
         fprintf(stderr, "%s\n\n", JCOPYRIGHT);
@@ -299,7 +307,8 @@
               PACKAGE_NAME, VERSION, BUILD);
       exit(EXIT_SUCCESS);
 
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+    } else if (keymatch(arg, "grayscale", 2) ||
+               keymatch(arg, "greyscale", 2)) {
       /* Force a monochrome JPEG file to be generated. */
       jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
 
@@ -307,6 +316,12 @@
       /* Force an RGB JPEG file to be generated. */
       jpeg_set_colorspace(cinfo, JCS_RGB);
 
+    } else if (keymatch(arg, "icc", 1)) {
+      /* Set ICC filename. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      icc_filename = argv[argn];
+
     } else if (keymatch(arg, "maxmemory", 3)) {
       /* Maximum memory in Kb (or Mb with 'm'). */
       long lval;
@@ -392,10 +407,10 @@
       if (lval < 0 || lval > 65535L)
         usage();
       if (ch == 'b' || ch == 'B') {
-        cinfo->restart_interval = (unsigned int) lval;
+        cinfo->restart_interval = (unsigned int)lval;
         cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
       } else {
-        cinfo->restart_in_rows = (int) lval;
+        cinfo->restart_in_rows = (int)lval;
         /* restart_interval will be computed during startup */
       }
 
@@ -450,19 +465,19 @@
     /* Set quantization tables for selected quality. */
     /* Some or all may be overridden if -qtables is present. */
     if (qualityarg != NULL)     /* process -quality if it was present */
-      if (! set_quality_ratings(cinfo, qualityarg, force_baseline))
+      if (!set_quality_ratings(cinfo, qualityarg, force_baseline))
         usage();
 
     if (qtablefile != NULL)     /* process -qtables if it was present */
-      if (! read_quant_tables(cinfo, qtablefile, force_baseline))
+      if (!read_quant_tables(cinfo, qtablefile, force_baseline))
         usage();
 
     if (qslotsarg != NULL)      /* process -qslots if it was present */
-      if (! set_quant_slots(cinfo, qslotsarg))
+      if (!set_quant_slots(cinfo, qslotsarg))
         usage();
 
     if (samplearg != NULL)      /* process -sample if it was present */
-      if (! set_sample_factors(cinfo, samplearg))
+      if (!set_sample_factors(cinfo, samplearg))
         usage();
 
 #ifdef C_PROGRESSIVE_SUPPORTED
@@ -472,7 +487,7 @@
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
     if (scansarg != NULL)       /* process -scans if it was present */
-      if (! read_scan_script(cinfo, scansarg))
+      if (!read_scan_script(cinfo, scansarg))
         usage();
 #endif
   }
@@ -486,7 +501,7 @@
  */
 
 int
-main (int argc, char **argv)
+main(int argc, char **argv)
 {
   struct jpeg_compress_struct cinfo;
   struct jpeg_error_mgr jerr;
@@ -496,6 +511,9 @@
   int file_index;
   cjpeg_source_ptr src_mgr;
   FILE *input_file;
+  FILE *icc_file;
+  JOCTET *icc_profile = NULL;
+  long icc_len = 0;
   FILE *output_file = NULL;
   unsigned char *outbuffer = NULL;
   unsigned long outsize = 0;
@@ -539,14 +557,14 @@
   if (!memdst) {
     /* Must have either -outfile switch or explicit output file name */
     if (outfilename == NULL) {
-      if (file_index != argc-2) {
+      if (file_index != argc - 2) {
         fprintf(stderr, "%s: must name one input and one output file\n",
                 progname);
         usage();
       }
-      outfilename = argv[file_index+1];
+      outfilename = argv[file_index + 1];
     } else {
-      if (file_index != argc-1) {
+      if (file_index != argc - 1) {
         fprintf(stderr, "%s: must name one input and one output file\n",
                 progname);
         usage();
@@ -555,7 +573,7 @@
   }
 #else
   /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
+  if (file_index < argc - 1) {
     fprintf(stderr, "%s: only one input file\n", progname);
     usage();
   }
@@ -583,8 +601,35 @@
     output_file = write_stdout();
   }
 
+  if (icc_filename != NULL) {
+    if ((icc_file = fopen(icc_filename, READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if (fseek(icc_file, 0, SEEK_END) < 0 ||
+        (icc_len = ftell(icc_file)) < 1 ||
+        fseek(icc_file, 0, SEEK_SET) < 0) {
+      fprintf(stderr, "%s: can't determine size of %s\n", progname,
+              icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if ((icc_profile = (JOCTET *)malloc(icc_len)) == NULL) {
+      fprintf(stderr, "%s: can't allocate memory for ICC profile\n", progname);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    if (fread(icc_profile, icc_len, 1, icc_file) < 1) {
+      fprintf(stderr, "%s: can't read ICC profile from %s\n", progname,
+              icc_filename);
+      free(icc_profile);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    fclose(icc_file);
+  }
+
 #ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+  start_progress_monitor((j_common_ptr)&cinfo, &progress);
 #endif
 
   /* Figure out the input file format, and set up to read it. */
@@ -611,10 +656,13 @@
   /* Start compressor */
   jpeg_start_compress(&cinfo, TRUE);
 
+  if (icc_profile != NULL)
+    jpeg_write_icc_profile(&cinfo, icc_profile, (unsigned int)icc_len);
+
   /* Process data */
   while (cinfo.next_scanline < cinfo.image_height) {
     num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
-    (void) jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
+    (void)jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
   }
 
   /* Finish compression and release memory */
@@ -629,7 +677,7 @@
     fclose(output_file);
 
 #ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
+  end_progress_monitor((j_common_ptr)&cinfo);
 #endif
 
   if (memdst) {
@@ -638,6 +686,9 @@
       free(outbuffer);
   }
 
+  if (icc_profile != NULL)
+    free(icc_profile);
+
   /* All done. */
   exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
   return 0;                     /* suppress no-return-value warnings */
diff --git a/cmakescripts/BuildPackages.cmake b/cmakescripts/BuildPackages.cmake
new file mode 100644
index 0000000..57f0672
--- /dev/null
+++ b/cmakescripts/BuildPackages.cmake
@@ -0,0 +1,177 @@
+# This file is included from the top-level CMakeLists.txt.  We just store it
+# here to avoid cluttering up that file.
+
+set(PKGNAME ${CMAKE_PROJECT_NAME} CACHE STRING
+  "Distribution package name (default: ${CMAKE_PROJECT_NAME})")
+set(PKGVENDOR "The ${CMAKE_PROJECT_NAME} Project" CACHE STRING
+  "Vendor name to be included in distribution package descriptions (default: The ${CMAKE_PROJECT_NAME} Project)")
+set(PKGURL "http://www.${CMAKE_PROJECT_NAME}.org" CACHE STRING
+  "URL of project web site to be included in distribution package descriptions (default: http://www.${CMAKE_PROJECT_NAME}.org)")
+set(PKGEMAIL "information@${CMAKE_PROJECT_NAME}.org" CACHE STRING
+  "E-mail of project maintainer to be included in distribution package descriptions (default: information@${CMAKE_PROJECT_NAME}.org)")
+set(PKGID "com.${CMAKE_PROJECT_NAME}.${PKGNAME}" CACHE STRING
+  "Globally unique package identifier (reverse DNS notation) (default: com.${CMAKE_PROJECT_NAME}.${PKGNAME})")
+
+
+###############################################################################
+# Linux RPM and DEB
+###############################################################################
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+
+set(RPMARCH ${CMAKE_SYSTEM_PROCESSOR})
+if(CPU_TYPE STREQUAL "x86_64")
+  set(DEBARCH amd64)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7*")
+  set(DEBARCH armhf)
+elseif(CPU_TYPE STREQUAL "arm64")
+  set(DEBARCH ${CPU_TYPE})
+elseif(CPU_TYPE STREQUAL "arm")
+  set(DEBARCH armel)
+elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "ppc64le")
+  set(DEBARCH ppc64el)
+elseif(CPU_TYPE STREQUAL "powerpc" AND BITS EQUAL 32)
+  set(RPMARCH ppc)
+  set(DEBARCH ppc)
+else()
+  set(DEBARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+message(STATUS "RPM architecture = ${RPMARCH}, DEB architecture = ${DEBARCH}")
+
+# Re-set CMAKE_POSITION_INDEPENDENT_CODE so that the RPM spec file works
+# properly
+boolean_number(CMAKE_POSITION_INDEPENDENT_CODE)
+
+configure_file(release/makerpm.in pkgscripts/makerpm)
+configure_file(release/rpm.spec.in pkgscripts/rpm.spec @ONLY)
+
+add_custom_target(rpm sh pkgscripts/makerpm
+  SOURCES pkgscripts/makerpm)
+
+configure_file(release/makesrpm.in pkgscripts/makesrpm)
+
+add_custom_target(srpm sh pkgscripts/makesrpm
+  SOURCES pkgscripts/makesrpm
+  DEPENDS dist)
+
+configure_file(release/makedpkg.in pkgscripts/makedpkg)
+configure_file(release/deb-control.in pkgscripts/deb-control)
+
+add_custom_target(deb sh pkgscripts/makedpkg
+  SOURCES pkgscripts/makedpkg)
+
+endif() # Linux
+
+
+###############################################################################
+# Windows installer (NullSoft Installer)
+###############################################################################
+
+if(WIN32)
+
+if(MSVC)
+  set(INST_PLATFORM "Visual C++")
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
+elseif(MINGW)
+  set(INST_PLATFORM GCC)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
+  set(INST_DEFS -DGCC)
+endif()
+
+if(BITS EQUAL 64)  # Append a 64-bit suffix to each installer name variable
+  set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
+  set(INST_NAME ${INST_NAME}64)
+  set(INST_REG_NAME ${INST_REG_NAME}64)  # INST_DIR is not assigned until later
+  set(INST_DEFS ${INST_DEFS} -DWIN64)
+endif()
+
+if(WITH_JAVA)
+  set(INST_DEFS ${INST_DEFS} -DJAVA)
+endif()
+
+if(MSVC_IDE)
+  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=${CMAKE_CFG_INTDIR}\\")
+else()
+  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=")
+endif()
+
+string(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
+
+configure_file(release/installer.nsi.in installer.nsi @ONLY)
+
+if(WITH_JAVA)
+  set(JAVA_DEPEND turbojpeg-java)
+endif()
+add_custom_target(installer
+  makensis -nocd ${INST_DEFS} installer.nsi
+  DEPENDS jpeg jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom
+    cjpeg djpeg jpegtran tjbench ${JAVA_DEPEND}
+  SOURCES installer.nsi)
+
+endif() # WIN32
+
+
+###############################################################################
+# Cygwin Package
+###############################################################################
+
+if(CYGWIN)
+
+configure_file(release/makecygwinpkg.in pkgscripts/makecygwinpkg)
+
+add_custom_target(cygwinpkg sh pkgscripts/makecygwinpkg)
+
+endif() # CYGWIN
+
+
+###############################################################################
+# Mac DMG
+###############################################################################
+
+if(APPLE)
+
+set(DEFAULT_OSX_32BIT_BUILD ${CMAKE_SOURCE_DIR}/osxx86)
+set(OSX_32BIT_BUILD ${DEFAULT_OSX_32BIT_BUILD} CACHE PATH
+  "Directory containing 32-bit (i386) Mac build to include in universal binaries (default: ${DEFAULT_OSX_32BIT_BUILD})")
+set(DEFAULT_IOS_ARMV7_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7)
+set(IOS_ARMV7_BUILD ${DEFAULT_IOS_ARMV7_BUILD} CACHE PATH
+  "Directory containing ARMv7 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7_BUILD})")
+set(DEFAULT_IOS_ARMV7S_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7s)
+set(IOS_ARMV7S_BUILD ${DEFAULT_IOS_ARMV7S_BUILD} CACHE PATH
+  "Directory containing ARMv7s iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7S_BUILD})")
+set(DEFAULT_IOS_ARMV8_BUILD ${CMAKE_SOURCE_DIR}/iosarmv8)
+set(IOS_ARMV8_BUILD ${DEFAULT_IOS_ARMV8_BUILD} CACHE PATH
+  "Directory containing ARMv8 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV8_BUILD})")
+
+configure_file(release/makemacpkg.in pkgscripts/makemacpkg)
+configure_file(release/Distribution.xml.in pkgscripts/Distribution.xml)
+configure_file(release/uninstall.in pkgscripts/uninstall)
+
+add_custom_target(dmg sh pkgscripts/makemacpkg
+  SOURCES pkgscripts/makemacpkg)
+
+add_custom_target(udmg sh pkgscripts/makemacpkg universal
+  SOURCES pkgscripts/makemacpkg)
+
+endif() # APPLE
+
+
+###############################################################################
+# Generic
+###############################################################################
+
+add_custom_target(dist
+  COMMAND git archive --prefix=${CMAKE_PROJECT_NAME}-${VERSION}/ HEAD |
+    gzip > ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-${VERSION}.tar.gz
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
+configure_file(release/maketarball.in pkgscripts/maketarball)
+
+add_custom_target(tarball sh pkgscripts/maketarball
+  SOURCES pkgscripts/maketarball)
+
+configure_file(release/libjpeg.pc.in pkgscripts/libjpeg.pc @ONLY)
+
+configure_file(release/libturbojpeg.pc.in pkgscripts/libturbojpeg.pc @ONLY)
diff --git a/cmakescripts/GNUInstallDirs.cmake b/cmakescripts/GNUInstallDirs.cmake
new file mode 100644
index 0000000..ef564bb
--- /dev/null
+++ b/cmakescripts/GNUInstallDirs.cmake
@@ -0,0 +1,416 @@
+#.rst:
+# GNUInstallDirs
+# --------------
+#
+# Define GNU standard installation directories
+#
+# Provides install directory variables as defined by the
+# `GNU Coding Standards`_.
+#
+# .. _`GNU Coding Standards`: https://www.gnu.org/prep/standards/html_node/Directory-Variables.html
+#
+# Result Variables
+# ^^^^^^^^^^^^^^^^
+#
+# Inclusion of this module defines the following variables:
+#
+# ``CMAKE_INSTALL_<dir>``
+#
+#   Destination for files of a given type.  This value may be passed to
+#   the ``DESTINATION`` options of :command:`install` commands for the
+#   corresponding file type.
+#
+# ``CMAKE_INSTALL_FULL_<dir>``
+#
+#   The absolute path generated from the corresponding ``CMAKE_INSTALL_<dir>``
+#   value.  If the value is not already an absolute path, an absolute path
+#   is constructed typically by prepending the value of the
+#   :variable:`CMAKE_INSTALL_PREFIX` variable.  However, there are some
+#   `special cases`_ as documented below.
+#
+# where ``<dir>`` is one of:
+#
+# ``BINDIR``
+#   user executables (``bin``)
+# ``SBINDIR``
+#   system admin executables (``sbin``)
+# ``LIBEXECDIR``
+#   program executables (``libexec``)
+# ``SYSCONFDIR``
+#   read-only single-machine data (``etc``)
+# ``SHAREDSTATEDIR``
+#   modifiable architecture-independent data (``com``)
+# ``LOCALSTATEDIR``
+#   modifiable single-machine data (``var``)
+# ``LIBDIR``
+#   object code libraries (``lib`` or ``lib64``
+#   or ``lib/<multiarch-tuple>`` on Debian)
+# ``INCLUDEDIR``
+#   C header files (``include``)
+# ``OLDINCLUDEDIR``
+#   C header files for non-gcc (``/usr/include``)
+# ``DATAROOTDIR``
+#   read-only architecture-independent data root (``share``)
+# ``DATADIR``
+#   read-only architecture-independent data (``DATAROOTDIR``)
+# ``INFODIR``
+#   info documentation (``DATAROOTDIR/info``)
+# ``LOCALEDIR``
+#   locale-dependent data (``DATAROOTDIR/locale``)
+# ``MANDIR``
+#   man documentation (``DATAROOTDIR/man``)
+# ``DOCDIR``
+#   documentation root (``DATAROOTDIR/doc/PROJECT_NAME``)
+#
+# If the includer does not define a value the above-shown default will be
+# used and the value will appear in the cache for editing by the user.
+#
+# Special Cases
+# ^^^^^^^^^^^^^
+#
+# The following values of :variable:`CMAKE_INSTALL_PREFIX` are special:
+#
+# ``/``
+#
+#   For ``<dir>`` other than the ``SYSCONFDIR`` and ``LOCALSTATEDIR``,
+#   the value of ``CMAKE_INSTALL_<dir>`` is prefixed with ``usr/`` if
+#   it is not user-specified as an absolute path.  For example, the
+#   ``INCLUDEDIR`` value ``include`` becomes ``usr/include``.
+#   This is required by the `GNU Coding Standards`_, which state:
+#
+#     When building the complete GNU system, the prefix will be empty
+#     and ``/usr`` will be a symbolic link to ``/``.
+#
+# ``/usr``
+#
+#   For ``<dir>`` equal to ``SYSCONFDIR`` or ``LOCALSTATEDIR``, the
+#   ``CMAKE_INSTALL_FULL_<dir>`` is computed by prepending just ``/``
+#   to the value of ``CMAKE_INSTALL_<dir>`` if it is not user-specified
+#   as an absolute path.  For example, the ``SYSCONFDIR`` value ``etc``
+#   becomes ``/etc``.  This is required by the `GNU Coding Standards`_.
+#
+# ``/opt/...``
+#
+#   For ``<dir>`` equal to ``SYSCONFDIR`` or ``LOCALSTATEDIR``, the
+#   ``CMAKE_INSTALL_FULL_<dir>`` is computed by *appending* the prefix
+#   to the value of ``CMAKE_INSTALL_<dir>`` if it is not user-specified
+#   as an absolute path.  For example, the ``SYSCONFDIR`` value ``etc``
+#   becomes ``/etc/opt/...``.  This is defined by the
+#   `Filesystem Hierarchy Standard`_.
+#
+# .. _`Filesystem Hierarchy Standard`: https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html
+#
+# Macros
+# ^^^^^^
+#
+# .. command:: GNUInstallDirs_get_absolute_install_dir
+#
+#   ::
+#
+#     GNUInstallDirs_get_absolute_install_dir(absvar var)
+#
+#   Set the given variable ``absvar`` to the absolute path contained
+#   within the variable ``var``.  This is to allow the computation of an
+#   absolute path, accounting for all the special cases documented
+#   above.  While this macro is used to compute the various
+#   ``CMAKE_INSTALL_FULL_<dir>`` variables, it is exposed publicly to
+#   allow users who create additional path variables to also compute
+#   absolute paths where necessary, using the same logic.
+
+#=============================================================================
+# Copyright 2016 D. R. Commander
+# Copyright 2016 Dmitry Marakasov
+# Copyright 2016 Roger Leigh
+# Copyright 2015 Alex Turbov
+# Copyright 2014 Rolf Eike Beer
+# Copyright 2014 Daniele E. Domenichelli
+# Copyright 2013 Dimitri John Ledkov
+# Copyright 2011 Alex Neundorf
+# Copyright 2011 Eric NOULARD
+# Copyright 2011, 2013-2015 Kitware, Inc.
+# Copyright 2011 Nikita Krupen'ko
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+#
+# * Neither the names of Kitware, Inc., the Insight Software Consortium,
+#   nor the names of their contributors may be used to endorse or promote
+#   products derived from this software without specific prior written
+#   permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+
+# Installation directories
+#
+
+macro(GNUInstallDirs_set_install_dir var docstring)
+  # Create cache entry CMAKE_INSTALL_${var} from CMAKE_INSTALL_DEFAULT_${var}.
+  # If that default changed since the last run (e.g. with the prefix) and the
+  # entry still held the old default, FORCE the entry to the new default.
+
+  set(_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var} "")
+  if(NOT DEFINED CMAKE_INSTALL_${var})
+    set(_GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} 1 CACHE INTERNAL
+      "CMAKE_INSTALL_${var} has default value")
+  elseif(DEFINED _GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var} AND
+    NOT "${_GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var}}" STREQUAL
+      "${CMAKE_INSTALL_DEFAULT_${var}}" AND
+    _GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} AND
+    "${_GNUInstallDirs_CMAKE_INSTALL_LAST_${var}}" STREQUAL
+      "${CMAKE_INSTALL_${var}}")
+    set(_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var} "FORCE")
+  endif()
+
+  set(CMAKE_INSTALL_${var} "${CMAKE_INSTALL_DEFAULT_${var}}" CACHE PATH
+    "${docstring} (Default: ${CMAKE_INSTALL_DEFAULT_${var}})"
+    ${_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var}})
+
+  if(NOT "${CMAKE_INSTALL_${var}}" STREQUAL "${CMAKE_INSTALL_DEFAULT_${var}}")
+    unset(_GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} CACHE)
+  endif()
+
+  # Record the current value and default so the next run can detect changes
+  set(_GNUInstallDirs_CMAKE_INSTALL_LAST_${var} "${CMAKE_INSTALL_${var}}"
+    CACHE INTERNAL "CMAKE_INSTALL_${var} during last run")
+  set(_GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var}
+    "${CMAKE_INSTALL_DEFAULT_${var}}" CACHE INTERNAL
+    "CMAKE_INSTALL_DEFAULT_${var} during last run")
+endmacro()
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_BINDIR)
+  set(CMAKE_INSTALL_DEFAULT_BINDIR "bin")
+endif()
+GNUInstallDirs_set_install_dir(BINDIR
+  "Directory into which user executables should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SBINDIR)
+  set(CMAKE_INSTALL_DEFAULT_SBINDIR "sbin")
+endif()
+GNUInstallDirs_set_install_dir(SBINDIR
+  "Directory into which system admin executables should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LIBEXECDIR)
+  set(CMAKE_INSTALL_DEFAULT_LIBEXECDIR "libexec")
+endif()
+GNUInstallDirs_set_install_dir(LIBEXECDIR
+  "Directory under which executables run by other programs should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SYSCONFDIR)
+  set(CMAKE_INSTALL_DEFAULT_SYSCONFDIR "etc")
+endif()
+GNUInstallDirs_set_install_dir(SYSCONFDIR
+  "Directory into which machine-specific read-only ASCII data and configuration files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SHAREDSTATEDIR)
+  set(CMAKE_INSTALL_DEFAULT_SHAREDSTATEDIR "com")
+endif()
+GNUInstallDirs_set_install_dir(SHAREDSTATEDIR
+  "Directory into which architecture-independent run-time-modifiable data files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LOCALSTATEDIR)
+  set(CMAKE_INSTALL_DEFAULT_LOCALSTATEDIR "var")
+endif()
+GNUInstallDirs_set_install_dir(LOCALSTATEDIR
+  "Directory into which machine-specific run-time-modifiable data files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LIBDIR)
+  set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib")
+  # Override this default 'lib' with 'lib64' iff:
+  #  - we are on Linux system but NOT cross-compiling
+  #  - we are NOT on debian
+  #  - we are on a 64 bits system
+  # reason is: amd64 ABI: http://www.x86-64.org/documentation/abi.pdf
+  # For Debian with multiarch, use 'lib/${CMAKE_LIBRARY_ARCHITECTURE}' if
+  # CMAKE_LIBRARY_ARCHITECTURE is set (it contains e.g. "i386-linux-gnu")
+  # and CMAKE_INSTALL_PREFIX is "/usr".
+  # See http://wiki.debian.org/Multiarch
+  if(CMAKE_SYSTEM_NAME MATCHES "^(Linux|kFreeBSD|GNU)$"
+      AND NOT CMAKE_CROSSCOMPILING)
+    if (EXISTS "/etc/debian_version") # is this a debian system ?
+      if(CMAKE_LIBRARY_ARCHITECTURE)
+        if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/?$")
+          set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
+        endif()
+      endif()
+    else() # not debian, rely on CMAKE_SIZEOF_VOID_P:
+      if(NOT DEFINED CMAKE_SIZEOF_VOID_P)
+        message(AUTHOR_WARNING
+          "Unable to determine default CMAKE_INSTALL_LIBDIR directory because no target architecture is known. "
+          "Please enable at least one language before including GNUInstallDirs.")
+      else()
+        if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+          set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+        endif()
+      endif()
+    endif()
+  endif()
+endif()
+GNUInstallDirs_set_install_dir(LIBDIR
+  "Directory into which object files and object code libraries should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_INCLUDEDIR)
+  set(CMAKE_INSTALL_DEFAULT_INCLUDEDIR "include")
+endif()
+GNUInstallDirs_set_install_dir(INCLUDEDIR
+  "Directory into which C header files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_OLDINCLUDEDIR)
+  set(CMAKE_INSTALL_DEFAULT_OLDINCLUDEDIR "/usr/include")  # absolute default
+endif()
+GNUInstallDirs_set_install_dir(OLDINCLUDEDIR
+  "Directory into which C header files for non-GCC compilers should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DATAROOTDIR)
+  set(CMAKE_INSTALL_DEFAULT_DATAROOTDIR "share")
+endif()
+GNUInstallDirs_set_install_dir(DATAROOTDIR
+  "The root of the directory tree for read-only architecture-independent data files")
+
+#-----------------------------------------------------------------------------
+# Values whose defaults are relative to DATAROOTDIR.  The defaults contain the
+# placeholder <CMAKE_INSTALL_DATAROOTDIR>, which is expanded when the absolute
+# CMAKE_INSTALL_FULL_* paths are computed, so they follow DATAROOTDIR changes.
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DATADIR)
+  set(CMAKE_INSTALL_DEFAULT_DATADIR "<CMAKE_INSTALL_DATAROOTDIR>")
+endif()
+GNUInstallDirs_set_install_dir(DATADIR
+  "The directory under which read-only architecture-independent data files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_INFODIR)
+  if(CMAKE_SYSTEM_NAME MATCHES "^(.*BSD|DragonFly)$")
+    set(CMAKE_INSTALL_DEFAULT_INFODIR "info")
+  else()
+    set(CMAKE_INSTALL_DEFAULT_INFODIR "<CMAKE_INSTALL_DATAROOTDIR>/info")
+  endif()
+endif()
+GNUInstallDirs_set_install_dir(INFODIR
+  "The directory into which info documentation files should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_MANDIR)
+  if(CMAKE_SYSTEM_NAME MATCHES "^(.*BSD|DragonFly)$")
+    set(CMAKE_INSTALL_DEFAULT_MANDIR "man")
+  else()
+    set(CMAKE_INSTALL_DEFAULT_MANDIR "<CMAKE_INSTALL_DATAROOTDIR>/man")
+  endif()
+endif()
+GNUInstallDirs_set_install_dir(MANDIR
+  "The directory under which man pages should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LOCALEDIR)
+  set(CMAKE_INSTALL_DEFAULT_LOCALEDIR "<CMAKE_INSTALL_DATAROOTDIR>/locale")
+endif()
+GNUInstallDirs_set_install_dir(LOCALEDIR
+  "The directory under which locale-specific message catalogs should be installed")
+
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DOCDIR)
+  set(CMAKE_INSTALL_DEFAULT_DOCDIR "<CMAKE_INSTALL_DATAROOTDIR>/doc/${PROJECT_NAME}")
+endif()
+GNUInstallDirs_set_install_dir(DOCDIR
+  "The directory into which documentation files (other than info files) should be installed")
+
+#-----------------------------------------------------------------------------
+
+mark_as_advanced(
+  CMAKE_INSTALL_BINDIR
+  CMAKE_INSTALL_SBINDIR
+  CMAKE_INSTALL_LIBEXECDIR
+  CMAKE_INSTALL_SYSCONFDIR
+  CMAKE_INSTALL_SHAREDSTATEDIR
+  CMAKE_INSTALL_LOCALSTATEDIR
+  CMAKE_INSTALL_LIBDIR
+  CMAKE_INSTALL_INCLUDEDIR
+  CMAKE_INSTALL_OLDINCLUDEDIR
+  CMAKE_INSTALL_DATAROOTDIR
+  CMAKE_INSTALL_DATADIR
+  CMAKE_INSTALL_INFODIR
+  CMAKE_INSTALL_LOCALEDIR
+  CMAKE_INSTALL_MANDIR
+  CMAKE_INSTALL_DOCDIR
+  )
+# Store the absolute form of <var> in <absvar>; <var> itself is rewritten too.
+macro(GNUInstallDirs_get_absolute_install_dir absvar var)
+  string(REGEX REPLACE "[<>]" "@" ${var} "${${var}}")
+  # With an empty DATAROOTDIR, strip the leading "/" left by the expansion
+  if(NOT CMAKE_INSTALL_DATAROOTDIR AND
+    ${var} MATCHES "\@CMAKE_INSTALL_DATAROOTDIR\@/")
+    string(CONFIGURE "${${var}}" ${var} @ONLY)
+    string(REGEX REPLACE "^/" "" ${var} "${${var}}")
+  else()
+    string(CONFIGURE "${${var}}" ${var} @ONLY)
+  endif()
+  if(NOT IS_ABSOLUTE "${${var}}")
+    # Special-case the prefixes /, /usr and /opt/... as handled below.
+    # NOTE: ${dir} is not an argument of this macro.  It is expected to be the
+    # loop variable of the calling foreach(); if it is unset, SYSCONFDIR and
+    # LOCALSTATEDIR are treated like every other directory.
+    if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/")
+      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
+        set(${absvar} "/${${var}}")
+      else()
+        if (NOT "${${var}}" MATCHES "^usr/")
+          set(${var} "usr/${${var}}")
+        endif()
+        set(${absvar} "/${${var}}")
+      endif()
+    elseif("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/?$")
+      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
+        set(${absvar} "/${${var}}")
+      else()
+        set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+      endif()
+    elseif("${CMAKE_INSTALL_PREFIX}" MATCHES "^/opt/.*")
+      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
+        set(${absvar} "/${${var}}${CMAKE_INSTALL_PREFIX}")
+      else()
+        set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+      endif()
+    else()
+      set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+    endif()
+  else()
+    set(${absvar} "${${var}}")
+  endif()
+  string(REGEX REPLACE "/$" "" ${absvar} "${${absvar}}")
+endmacro()
+
+# Result directories: define CMAKE_INSTALL_FULL_<dir> for each directory.
+# The loop variable must be named "dir" because the macro reads ${dir}.
+foreach(dir
+    BINDIR
+    SBINDIR
+    LIBEXECDIR
+    SYSCONFDIR
+    SHAREDSTATEDIR
+    LOCALSTATEDIR
+    LIBDIR
+    INCLUDEDIR
+    OLDINCLUDEDIR
+    DATAROOTDIR
+    DATADIR
+    INFODIR
+    LOCALEDIR
+    MANDIR
+    DOCDIR
+    )
+  GNUInstallDirs_get_absolute_install_dir(CMAKE_INSTALL_FULL_${dir} CMAKE_INSTALL_${dir})
+endforeach()
diff --git a/cmakescripts/cmake_uninstall.cmake.in b/cmakescripts/cmake_uninstall.cmake.in
index b35d100..6726a0d 100644
--- a/cmakescripts/cmake_uninstall.cmake.in
+++ b/cmakescripts/cmake_uninstall.cmake.in
@@ -1,10 +1,10 @@
 # This code is from the CMake FAQ
 
-if (NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
-  message(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"")
-endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+if (NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_BINARY_DIR@/install_manifest.txt\"")
+endif(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
 
-file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files)
 string(REGEX REPLACE "\n" ";" files "${files}")
 list(REVERSE files)
 foreach (file ${files})
diff --git a/cmakescripts/testclean.cmake b/cmakescripts/testclean.cmake
index 38bb03b..fc3fc25 100644
--- a/cmakescripts/testclean.cmake
+++ b/cmakescripts/testclean.cmake
@@ -29,7 +29,9 @@
   *_411_*.png
   *_411_*.ppm
   *_411_*.jpg
-  *_411.yuv)
+  *_411.yuv
+  tjbenchtest*.log
+  tjexampletest*.log)
 
 if(NOT FILES STREQUAL "")
   message(STATUS "Removing test files")
diff --git a/cmyk.h b/cmyk.h
new file mode 100644
index 0000000..48187a8
--- /dev/null
+++ b/cmyk.h
@@ -0,0 +1,61 @@
+/*
+ * cmyk.h
+ *
+ * Copyright (C) 2017-2018, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains convenience functions for performing quick & dirty
+ * CMYK<->RGB conversion.  This algorithm is suitable for testing purposes
+ * only.  Properly converting between CMYK and RGB requires a color management
+ * system.
+ */
+
+#ifndef CMYK_H
+#define CMYK_H
+
+#include <jinclude.h>
+#define JPEG_INTERNALS
+#include <jpeglib.h>
+#include "jconfigint.h"
+
+
+/* Fully reversible.  Outputs are stored inverted: MAXJSAMPLE means no ink. */
+
+INLINE
+LOCAL(void)
+rgb_to_cmyk(JSAMPLE r, JSAMPLE g, JSAMPLE b, JSAMPLE *c, JSAMPLE *m,
+            JSAMPLE *y, JSAMPLE *k)
+{
+  const double maxval = (double)MAXJSAMPLE;  /* sample full scale, not 255 */
+  double ctmp = 1.0 - ((double)r / maxval);
+  double mtmp = 1.0 - ((double)g / maxval);
+  double ytmp = 1.0 - ((double)b / maxval);
+  double ktmp = MIN(MIN(ctmp, mtmp), ytmp);
+  if (ktmp == 1.0) ctmp = mtmp = ytmp = 0.0;  /* pure black: avoid 0/0 */
+  else {
+    ctmp = (ctmp - ktmp) / (1.0 - ktmp);
+    mtmp = (mtmp - ktmp) / (1.0 - ktmp);
+    ytmp = (ytmp - ktmp) / (1.0 - ktmp);
+  }
+  *c = (JSAMPLE)(maxval - ctmp * maxval + 0.5);
+  *m = (JSAMPLE)(maxval - mtmp * maxval + 0.5);
+  *y = (JSAMPLE)(maxval - ytmp * maxval + 0.5);
+  *k = (JSAMPLE)(maxval - ktmp * maxval + 0.5);
+}
+
+
+/* Fully reversible only for C/M/Y/K values generated with rgb_to_cmyk().
+ * Inputs are inverted CMYK; the product is rescaled by MAXJSAMPLE, not 255. */
+INLINE
+LOCAL(void)
+cmyk_to_rgb(JSAMPLE c, JSAMPLE m, JSAMPLE y, JSAMPLE k, JSAMPLE *r, JSAMPLE *g,
+            JSAMPLE *b)
+{
+  *r = (JSAMPLE)((double)c * (double)k / (double)MAXJSAMPLE + 0.5);
+  *g = (JSAMPLE)((double)m * (double)k / (double)MAXJSAMPLE + 0.5);
+  *b = (JSAMPLE)((double)y * (double)k / (double)MAXJSAMPLE + 0.5);
+}
+
+
+#endif /* CMYK_H */
diff --git a/configure.ac b/configure.ac
deleted file mode 100644
index 6424140..0000000
--- a/configure.ac
+++ /dev/null
@@ -1,616 +0,0 @@
-#                                               -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-
-AC_PREREQ([2.56])
-AC_INIT([libjpeg-turbo], [1.5.4])
-
-AM_INIT_AUTOMAKE([-Wall foreign dist-bzip2])
-AC_PREFIX_DEFAULT(/opt/libjpeg-turbo)
-
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-
-# Checks for programs.
-SAVED_CFLAGS=${CFLAGS}
-SAVED_CPPFLAGS=${CPPFLAGS}
-AC_PROG_CPP
-AC_PROG_CC
-m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
-AM_PROG_AS
-AM_PROG_CC_C_O
-AC_PROG_INSTALL
-AC_PROG_LIBTOOL
-AC_PROG_LN_S
-
-AC_ARG_WITH([build-date], [Use custom build string to enable reproducible builds (default: YYMMDD)],
-  [BUILD="$with_build_date"],
-  [BUILD=`date +%Y%m%d`])
-
-PKG_PROG_PKG_CONFIG
-
-# When the prefix is /opt/libjpeg-turbo, we assume that an "official" binary is
-# being created, and thus we install things into specific locations.
-
-old_prefix=${prefix}
-if test "x$prefix" = "xNONE" -a "x$ac_default_prefix" != "x"; then
-  prefix=$ac_default_prefix
-fi
-DATADIR=`eval echo ${datadir}`
-DATADIR=`eval echo $DATADIR`
-if test "$DATADIR" = "/opt/libjpeg-turbo/share"; then
-  datadir='${prefix}'
-fi
-DATADIR=`eval echo ${datarootdir}`
-DATADIR=`eval echo $DATADIR`
-if test "$DATADIR" = "/opt/libjpeg-turbo/share"; then
-  datarootdir='${prefix}'
-fi
-DOCDIR=`eval echo ${docdir}`
-DOCDIR=`eval echo $DOCDIR`
-if test "$DOCDIR" = "/opt/libjpeg-turbo/doc/libjpeg-turbo"; then
-  docdir='${datadir}/doc'
-fi
-
-old_exec_prefix=${exec_prefix}
-if test "x$exec_prefix" = "xNONE"; then
-  exec_prefix=${prefix}
-fi
-
-AC_CHECK_SIZEOF(size_t)
-
-if test "x${libdir}" = 'x${exec_prefix}/lib' -o "x${libdir}" = 'x${prefix}/lib'; then
-  LIBDIR=`eval echo ${libdir}`
-  LIBDIR=`eval echo $LIBDIR`
-  if test "$LIBDIR" = "/opt/libjpeg-turbo/lib"; then
-    case $host_os in
-      darwin*)
-        ;;
-      *)
-        if test "${ac_cv_sizeof_size_t}" = "8"; then
-          libdir='${exec_prefix}/lib64'
-        elif test "${ac_cv_sizeof_size_t}" = "4"; then
-          libdir='${exec_prefix}/lib32'
-        fi
-        ;;
-    esac
-  fi
-fi
-exec_prefix=${old_exec_prefix}
-prefix=${old_prefix}
-
-# Check whether compiler supports pointers to undefined structures
-AC_MSG_CHECKING(whether compiler supports pointers to undefined structures)
-AC_TRY_COMPILE([ typedef struct undefined_structure *undef_struct_ptr; ], ,
-  AC_MSG_RESULT(yes),
-  [AC_MSG_RESULT(no)
-   AC_DEFINE([INCOMPLETE_TYPES_BROKEN], [1],
-     [Compiler does not support pointers to undefined structures.])])
-
-if test "x${GCC}" = "xyes"; then
-  if test "x${SAVED_CFLAGS}" = "x"; then
-    CFLAGS=-O3
-  fi
-  if test "x${SAVED_CPPFLAGS}" = "x"; then
-    CPPFLAGS=-Wall
-  fi
-fi
-
-AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
-if test "x${SUNCC}" = "xyes"; then
-  if test "x${SAVED_CFLAGS}" = "x"; then
-    CFLAGS=-xO5
-  fi
-fi
-
-# Checks for libraries.
-
-# Checks for header files.
-AC_HEADER_STDC
-AC_CHECK_HEADERS([stddef.h stdlib.h locale.h string.h])
-AC_CHECK_HEADER([sys/types.h],
-  AC_DEFINE([NEED_SYS_TYPES_H], 1, [Define if you need to include <sys/types.h> to get size_t.]))
-
-# Checks for typedefs, structures, and compiler characteristics.
-AC_C_CONST
-AC_C_CHAR_UNSIGNED
-AC_C_INLINE
-AC_TYPE_SIZE_T
-AC_CHECK_TYPES([unsigned char, unsigned short])
-
-AC_MSG_CHECKING([if right shift is signed])
-AC_TRY_RUN(
-  [#include <stdio.h>
-   int is_shifting_signed (long arg) {
-     long res = arg >> 4;
-
-     if (res == -0x7F7E80CL)
-       return 1; /* right shift is signed */
-
-     /* see if unsigned-shift hack will fix it. */
-     /* we can't just test exact value since it depends on width of long... */
-     res |= (~0L) << (32-4);
-     if (res == -0x7F7E80CL)
-       return 0; /* right shift is unsigned */
-
-     printf("Right shift isn't acting as I expect it to.\n");
-     printf("I fear the JPEG software will not work at all.\n\n");
-     return 0; /* try it with unsigned anyway */
-   }
-   int main (void) {
-     exit(is_shifting_signed(-0x7F7E80B1L));
-   }],
-  [AC_MSG_RESULT(no)
-   AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED], 1,
-     [Define if your (broken) compiler shifts signed values as if they were unsigned.])],
-  [AC_MSG_RESULT(yes)],
-  [AC_MSG_RESULT(Assuming that right shift is signed on target machine.)])
-
-# Checks for library functions.
-AC_CHECK_FUNCS([memset memcpy], [],
-  [AC_DEFINE([NEED_BSD_STRINGS], 1,
-     [Define if you have BSD-like bzero and bcopy in <strings.h> rather than memset/memcpy in <string.h>.])])
-
-AC_MSG_CHECKING([libjpeg API version])
-AC_ARG_VAR(JPEG_LIB_VERSION, [libjpeg API version (62, 70, or 80)])
-if test "x$JPEG_LIB_VERSION" = "x"; then
-  AC_ARG_WITH([jpeg7],
-    AC_HELP_STRING([--with-jpeg7],
-      [Emulate libjpeg v7 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b.)]))
-  AC_ARG_WITH([jpeg8],
-    AC_HELP_STRING([--with-jpeg8],
-      [Emulate libjpeg v8 API/ABI (this makes libjpeg-turbo backward incompatible with libjpeg v6b.)]))
-  if test "x${with_jpeg8}" = "xyes"; then
-    JPEG_LIB_VERSION=80
-  else
-    if test "x${with_jpeg7}" = "xyes"; then
-      JPEG_LIB_VERSION=70
-    else
-      JPEG_LIB_VERSION=62
-    fi
-  fi
-fi
-JPEG_LIB_VERSION_DECIMAL=`expr $JPEG_LIB_VERSION / 10`.`expr $JPEG_LIB_VERSION % 10`
-AC_SUBST(JPEG_LIB_VERSION_DECIMAL)
-AC_MSG_RESULT([$JPEG_LIB_VERSION_DECIMAL])
-AC_DEFINE_UNQUOTED(JPEG_LIB_VERSION, [$JPEG_LIB_VERSION],
-  [libjpeg API version])
-
-AC_ARG_VAR(SO_MAJOR_VERSION,
-  [Major version of the libjpeg-turbo shared library (default is determined by the API version)])
-AC_ARG_VAR(SO_MINOR_VERSION,
-  [Minor version of the libjpeg-turbo shared library (default is determined by the API version)])
-if test "x$SO_MAJOR_VERSION" = "x"; then
-  case "$JPEG_LIB_VERSION" in
-    62)  SO_MAJOR_VERSION=$JPEG_LIB_VERSION ;;
-    *)   SO_MAJOR_VERSION=`expr $JPEG_LIB_VERSION / 10` ;;
-  esac
-fi
-if test "x$SO_MINOR_VERSION" = "x"; then
-  case "$JPEG_LIB_VERSION" in
-    80)  SO_MINOR_VERSION=2 ;;
-    *)   SO_MINOR_VERSION=0 ;;
-  esac
-fi
-
-RPM_CONFIG_ARGS=
-
-# Memory source/destination managers
-SO_AGE=1
-MEM_SRCDST_FUNCTIONS=
-if test "x${with_jpeg8}" != "xyes"; then
-  AC_MSG_CHECKING([whether to include in-memory source/destination managers])
-  AC_ARG_WITH([mem-srcdst],
-    AC_HELP_STRING([--without-mem-srcdst],
-      [Do not include in-memory source/destination manager functions when emulating the libjpeg v6b or v7 API/ABI]))
-  if test "x$with_mem_srcdst" != "xno"; then
-    AC_MSG_RESULT(yes)
-    AC_DEFINE([MEM_SRCDST_SUPPORTED], [1],
-      [Support in-memory source/destination managers])
-    SO_AGE=2
-    MEM_SRCDST_FUNCTIONS="global:  jpeg_mem_dest;  jpeg_mem_src;";
-  else
-    AC_MSG_RESULT(no)
-    RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-mem-srcdst"
-  fi
-fi
-
-AC_MSG_CHECKING([libjpeg shared library version])
-AC_MSG_RESULT([$SO_MAJOR_VERSION.$SO_AGE.$SO_MINOR_VERSION])
-LIBTOOL_CURRENT=`expr $SO_MAJOR_VERSION + $SO_AGE`
-AC_SUBST(LIBTOOL_CURRENT)
-AC_SUBST(SO_MAJOR_VERSION)
-AC_SUBST(SO_MINOR_VERSION)
-AC_SUBST(SO_AGE)
-AC_SUBST(MEM_SRCDST_FUNCTIONS)
-
-AC_DEFINE_UNQUOTED(LIBJPEG_TURBO_VERSION, [$VERSION], [libjpeg-turbo version])
-
-m4_define(version_triplet,m4_split(AC_PACKAGE_VERSION,[[.]]))
-m4_define(version_major,m4_car(m4_shiftn(1,[],version_triplet)))
-m4_define(version_minor,m4_car(m4_shiftn(2,[],version_triplet)))
-m4_define(version_revision,m4_car(m4_shiftn(3,[],version_triplet)))
-VERSION_MAJOR=version_major
-VERSION_MINOR=version_minor
-VERSION_REVISION=version_revision
-LIBJPEG_TURBO_VERSION_NUMBER=`printf "%d%03d%03d" $VERSION_MAJOR $VERSION_MINOR $VERSION_REVISION`
-AC_DEFINE_UNQUOTED(LIBJPEG_TURBO_VERSION_NUMBER, [$LIBJPEG_TURBO_VERSION_NUMBER], [libjpeg-turbo version in integer form])
-
-VERSION_SCRIPT=yes
-AC_ARG_ENABLE([ld-version-script],
-  AS_HELP_STRING([--disable-ld-version-script],
-    [Disable linker version script for libjpeg-turbo (default is to use linker version script if the linker supports it)]),
-  [VERSION_SCRIPT=$enableval], [])
-
-AC_MSG_CHECKING([whether the linker supports version scripts])
-SAVED_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS -Wl,--version-script,conftest.map"
-cat > conftest.map <<EOF
-VERS_1 {
-  global: *;
-};
-EOF
-AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])],
-  [VERSION_SCRIPT_FLAG=-Wl,--version-script,;
-   AC_MSG_RESULT([yes (GNU style)])],
-  [])
-if test "x$VERSION_SCRIPT_FLAG" = "x"; then
-  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map"
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([], [])],
-    [VERSION_SCRIPT_FLAG=-Wl,-M,;
-     AC_MSG_RESULT([yes (Sun style)])],
-    [])
-fi
-if test "x$VERSION_SCRIPT_FLAG" = "x"; then
-  VERSION_SCRIPT=no
-  AC_MSG_RESULT(no)
-fi
-LDFLAGS="$SAVED_LDFLAGS"
-
-AC_MSG_CHECKING([whether to use version script when building libjpeg-turbo])
-AC_MSG_RESULT($VERSION_SCRIPT)
-
-AM_CONDITIONAL(VERSION_SCRIPT, test "x$VERSION_SCRIPT" = "xyes")
-AC_SUBST(VERSION_SCRIPT_FLAG)
-
-# Check for non-broken inline under various spellings
-AC_MSG_CHECKING(for inline)
-ljt_cv_inline=""
-AC_TRY_COMPILE(, [} inline __attribute__((always_inline)) int foo() { return 0; }
-int bar() { return foo();], ljt_cv_inline="inline __attribute__((always_inline))",
-AC_TRY_COMPILE(, [} __inline__ int foo() { return 0; }
-int bar() { return foo();], ljt_cv_inline="__inline__",
-AC_TRY_COMPILE(, [} __inline int foo() { return 0; }
-int bar() { return foo();], ljt_cv_inline="__inline",
-AC_TRY_COMPILE(, [} inline int foo() { return 0; }
-int bar() { return foo();], ljt_cv_inline="inline"))))
-AC_MSG_RESULT($ljt_cv_inline)
-AC_DEFINE_UNQUOTED([INLINE],[$ljt_cv_inline],[How to obtain function inlining.])
-
-# Arithmetic coding support
-AC_MSG_CHECKING([whether to include arithmetic encoding support])
-AC_ARG_WITH([arith-enc],
-  AC_HELP_STRING([--without-arith-enc],
-    [Do not include arithmetic encoding support when emulating the libjpeg v6b API/ABI]))
-if test "x$with_12bit" = "xyes"; then
-  with_arith_enc=no
-fi
-if test "x${with_jpeg8}" = "xyes" -o "x${with_jpeg7}" = "xyes"; then
-  with_arith_enc=yes
-fi
-if test "x$with_arith_enc" = "xno"; then
-  AC_MSG_RESULT(no)
-  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-enc"
-else
-  AC_DEFINE([C_ARITH_CODING_SUPPORTED], [1], [Support arithmetic encoding])
-  AC_MSG_RESULT(yes)
-fi
-AM_CONDITIONAL([WITH_ARITH_ENC], [test "x$with_arith_enc" != "xno"])
-
-AC_MSG_CHECKING([whether to include arithmetic decoding support])
-AC_ARG_WITH([arith-dec],
-  AC_HELP_STRING([--without-arith-dec],
-    [Do not include arithmetic decoding support when emulating the libjpeg v6b API/ABI]))
-if test "x$with_12bit" = "xyes"; then
-  with_arith_dec=no
-fi
-if test "x${with_jpeg8}" = "xyes" -o "x${with_jpeg7}" = "xyes"; then
-  with_arith_dec=yes
-fi
-if test "x$with_arith_dec" = "xno"; then
-  AC_MSG_RESULT(no)
-  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-arith-dec"
-else
-  AC_DEFINE([D_ARITH_CODING_SUPPORTED], [1], [Support arithmetic decoding])
-  AC_MSG_RESULT(yes)
-fi
-AM_CONDITIONAL([WITH_ARITH_DEC], [test "x$with_arith_dec" != "xno"])
-
-AM_CONDITIONAL([WITH_ARITH],
-  [test "x$with_arith_dec" != "xno" -o "x$with_arith_enc" != "xno"])
-
-# 12-bit component support
-AC_MSG_CHECKING([whether to use 12-bit samples])
-AC_ARG_WITH([12bit],
-  AC_HELP_STRING([--with-12bit], [Encode/decode JPEG images with 12-bit samples (implies --without-simd --without-turbojpeg --without-arith-dec --without-arith-enc)]))
-if test "x$with_12bit" = "xyes"; then
-  AC_DEFINE([BITS_IN_JSAMPLE], [12], [use 8 or 12])
-  AC_MSG_RESULT(yes)
-else
-  AC_MSG_RESULT(no)
-fi
-AM_CONDITIONAL([WITH_12BIT], [test "x$with_12bit" = "xyes"])
-
-# TurboJPEG support
-AC_MSG_CHECKING([whether to build TurboJPEG C wrapper])
-AC_ARG_WITH([turbojpeg],
-  AC_HELP_STRING([--without-turbojpeg],
-    [Do not include the TurboJPEG wrapper library and associated test programs]))
-if test "x$with_12bit" = "xyes"; then
-  with_turbojpeg=no
-fi
-if test "x$with_turbojpeg" = "xno"; then
-  AC_MSG_RESULT(no)
-  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-turbojpeg"
-else
-  AC_MSG_RESULT(yes)
-fi
-
-# Java support
-AC_ARG_VAR(JAVAC, [Java compiler command (default: javac)])
-if test "x$JAVAC" = "x"; then
-  JAVAC=javac
-fi
-AC_SUBST(JAVAC)
-AC_ARG_VAR(JAVACFLAGS, [Java compiler flags])
-JAVACFLAGS="$JAVACFLAGS -J-Dfile.encoding=UTF8"
-AC_SUBST(JAVACFLAGS)
-AC_ARG_VAR(JAR, [Java archive command (default: jar)])
-if test "x$JAR" = "x"; then
-  JAR=jar
-fi
-AC_SUBST(JAR)
-AC_ARG_VAR(JAVA, [Java runtime command (default: java)])
-if test "x$JAVA" = "x"; then
-  JAVA=java
-fi
-AC_SUBST(JAVA)
-AC_ARG_VAR(JNI_CFLAGS,
-  [C compiler flags needed to include jni.h (default: -I/System/Library/Frameworks/JavaVM.framework/Headers on OS X, '-I/usr/java/include -I/usr/java/include/solaris' on Solaris, and '-I/usr/java/default/include -I/usr/java/default/include/linux' on Linux)])
-
-AC_MSG_CHECKING([whether to build TurboJPEG Java wrapper])
-AC_ARG_WITH([java],
-  AC_HELP_STRING([--with-java], [Build Java wrapper for the TurboJPEG library]))
-if test "x$with_12bit" = "xyes" -o "x$with_turbojpeg" = "xno"; then
-  with_java=no
-fi
-
-WITH_JAVA=0
-if test "x$with_java" = "xyes"; then
-  AC_MSG_RESULT(yes)
-
-  case $host_os in
-    darwin*)
-      DEFAULT_JNI_CFLAGS=-I/System/Library/Frameworks/JavaVM.framework/Headers
-      ;;
-    solaris*)
-      DEFAULT_JNI_CFLAGS='-I/usr/java/include -I/usr/java/include/solaris'
-      ;;
-    linux*)
-      DEFAULT_JNI_CFLAGS='-I/usr/java/default/include -I/usr/java/default/include/linux'
-      ;;
-  esac
-  if test "x$JNI_CFLAGS" = "x"; then
-    JNI_CFLAGS=$DEFAULT_JNI_CFLAGS
-  fi
-
-  SAVE_CPPFLAGS=${CPPFLAGS}
-  CPPFLAGS="${CPPFLAGS} ${JNI_CFLAGS}"
-  AC_CHECK_HEADERS([jni.h], [DUMMY=1],
-    [AC_MSG_ERROR([Could not find JNI header file])])
-  CPPFLAGS=${SAVE_CPPFLAGS}
-  AC_SUBST(JNI_CFLAGS)
-
-  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --with-java"
-  JAVA_RPM_CONTENTS_1='%dir %{_datadir}/classes'
-  JAVA_RPM_CONTENTS_2=%{_datadir}/classes/turbojpeg.jar
-  WITH_JAVA=1
-else
-  AC_MSG_RESULT(no)
-fi
-AM_CONDITIONAL([WITH_JAVA], [test "x$with_java" = "xyes"])
-AC_SUBST(WITH_JAVA)
-AC_SUBST(JAVA_RPM_CONTENTS_1)
-AC_SUBST(JAVA_RPM_CONTENTS_2)
-
-# optionally force using gas-preprocessor.pl for compatibility testing
-AC_ARG_WITH([gas-preprocessor],
-  AC_HELP_STRING([--with-gas-preprocessor],
-    [Force using gas-preprocessor.pl on ARM.]))
-if test "x${with_gas_preprocessor}" = "xyes"; then
-  case $host_os in
-    darwin*)
-      CCAS="gas-preprocessor.pl -fix-unreq $CC"
-      ;;
-    *)
-      CCAS="gas-preprocessor.pl -no-fix-unreq $CC"
-      ;;
-  esac
-  AC_SUBST([CCAS])
-fi
-
-# SIMD is optional
-AC_ARG_WITH([simd],
-  AC_HELP_STRING([--without-simd], [Do not include SIMD extensions]))
-if test "x$with_12bit" = "xyes"; then
-  with_simd=no
-fi
-if test "x${with_simd}" != "xno"; then
-  require_simd=no
-  if test "x${with_simd}" = "xyes"; then
-    require_simd=yes
-  fi
-  # Check if we're on a supported CPU
-  AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
-  case "$host_cpu" in
-    x86_64 | amd64)
-      AC_MSG_RESULT([yes (x86_64)])
-      AC_PROG_NASM
-      simd_arch=x86_64
-      ;;
-    i*86 | x86 | ia32)
-      AC_MSG_RESULT([yes (i386)])
-      AC_PROG_NASM
-      simd_arch=i386
-      ;;
-    arm*)
-      AC_MSG_RESULT([yes (arm)])
-      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
-      AC_CHECK_COMPATIBLE_ARM_ASSEMBLER_IFELSE(
-        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
-           AC_MSG_RESULT([yes (with gas-preprocessor)])
-         else
-           AC_MSG_RESULT([yes])
-         fi
-         simd_arch=arm],
-        [AC_MSG_RESULT([no])
-         with_simd=no])
-      if test "x${with_simd}" = "xno"; then
-        if test "x${require_simd}" = "xyes"; then
-          AC_MSG_ERROR([SIMD support can't be enabled.])
-        else
-          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
-        fi
-      fi
-      ;;
-    aarch64*)
-      AC_MSG_RESULT([yes (arm64)])
-      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
-      AC_CHECK_COMPATIBLE_ARM64_ASSEMBLER_IFELSE(
-        [if test "x$ac_use_gas_preprocessor" = "xyes"; then
-           AC_MSG_RESULT([yes (with gas-preprocessor)])
-         else
-           AC_MSG_RESULT([yes])
-         fi
-         simd_arch=aarch64],
-        [AC_MSG_RESULT([no])
-         with_simd=no])
-      if test "x${with_simd}" = "xno"; then
-        if test "x${require_simd}" = "xyes"; then
-          AC_MSG_ERROR([SIMD support can't be enabled.])
-        else
-          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
-        fi
-      fi
-      ;;
-    mips*)
-      AC_MSG_RESULT([yes (mips)])
-      AC_MSG_CHECKING([if the assembler is GNU-compatible and can be used])
-      AC_CHECK_COMPATIBLE_MIPS_ASSEMBLER_IFELSE(
-        [AC_MSG_RESULT([yes])
-         simd_arch=mips],
-        [AC_MSG_RESULT([no])
-         with_simd=no])
-      if test "x${with_simd}" = "xno"; then
-        if test "x${require_simd}" = "xyes"; then
-          AC_MSG_ERROR([SIMD support can't be enabled.])
-        else
-          AC_MSG_WARN([SIMD support can't be enabled.  Performance will suffer.])
-        fi
-      fi
-      ;;
-    powerpc*)
-      AC_CHECK_ALTIVEC(
-        [AC_MSG_RESULT([yes (powerpc)])
-         simd_arch=powerpc],
-        [AC_NO_SIMD(PowerPC SPE)])
-      ;;
-    *)
-      AC_NO_SIMD($host_cpu)
-      ;;
-  esac
-
-  if test "x${with_simd}" != "xno"; then
-    AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
-  fi
-else
-  RPM_CONFIG_ARGS="$RPM_CONFIG_ARGS --without-simd"
-fi
-
-AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
-AM_CONDITIONAL([WITH_SSE_FLOAT_DCT], [test "x$simd_arch" = "xx86_64" -o "x$simd_arch" = "xi386"])
-AM_CONDITIONAL([SIMD_I386], [test "x$simd_arch" = "xi386"])
-AM_CONDITIONAL([SIMD_X86_64], [test "x$simd_arch" = "xx86_64"])
-AM_CONDITIONAL([SIMD_ARM], [test "x$simd_arch" = "xarm"])
-AM_CONDITIONAL([SIMD_ARM_64], [test "x$simd_arch" = "xaarch64"])
-AM_CONDITIONAL([SIMD_MIPS], [test "x$simd_arch" = "xmips"])
-AM_CONDITIONAL([SIMD_POWERPC], [test "x$simd_arch" = "xpowerpc"])
-AM_CONDITIONAL([X86_64], [test "x$host_cpu" = "xx86_64" -o "x$host_cpu" = "xamd64"])
-AM_CONDITIONAL([WITH_TURBOJPEG], [test "x$with_turbojpeg" != "xno"])
-AM_CONDITIONAL([CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
-
-AC_ARG_VAR(PKGNAME, [distribution package name (default: libjpeg-turbo)])
-if test "x$PKGNAME" = "x"; then
-  PKGNAME=$PACKAGE_NAME
-fi
-AC_SUBST(PKGNAME)
-
-case "$host_cpu" in
-  x86_64)
-    RPMARCH=x86_64
-    DEBARCH=amd64
-    ;;
-  i*86 | x86 | ia32)
-    RPMARCH=i386
-    DEBARCH=i386
-    ;;
-  powerpc64le)
-    RPMARCH=`uname -m`
-    DEBARCH=ppc64el
-    ;;
-  powerpc)
-    RPMARCH=ppc
-    DEBARCH=ppc
-    ;;
-  *)
-    RPMARCH=`uname -m`
-    DEBARCH=$RPMARCH
-    ;;
-esac
-
-if test "${docdir}" = ""; then
-  docdir=${datadir}/doc
-  AC_SUBST(docdir)
-fi
-
-AC_SUBST(RPMARCH)
-AC_SUBST(RPM_CONFIG_ARGS)
-AC_SUBST(DEBARCH)
-AC_SUBST(BUILD)
-AC_DEFINE_UNQUOTED([BUILD], "$BUILD", [libjpeg-turbo build number])
-
-# NOTE: autoheader automatically modifies the input file of the first
-# invocation of AC_CONFIG_HEADERS, so we put config.h first to prevent
-# jconfig.h.in from being clobbered.  config.h is used only internally, whereas
-# jconfig.h contains macros that are relevant to external programs (macros that
-# specify which features were built into the library.)
-AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_HEADERS([jconfig.h])
-AC_CONFIG_HEADERS([jconfigint.h])
-AC_CONFIG_FILES([pkgscripts/libjpeg-turbo.spec.tmpl:release/libjpeg-turbo.spec.in])
-AC_CONFIG_FILES([pkgscripts/makecygwinpkg.tmpl:release/makecygwinpkg.in])
-AC_CONFIG_FILES([pkgscripts/makedpkg.tmpl:release/makedpkg.in])
-AC_CONFIG_FILES([pkgscripts/makemacpkg.tmpl:release/makemacpkg.in])
-AC_CONFIG_FILES([pkgscripts/uninstall.tmpl:release/uninstall.in])
-AC_CONFIG_FILES([pkgscripts/libjpeg.pc:release/libjpeg.pc.in])
-AC_CONFIG_FILES([pkgscripts/libturbojpeg.pc:release/libturbojpeg.pc.in])
-if test "x$with_turbojpeg" != "xno"; then
-  AC_CONFIG_FILES([tjbenchtest])
-fi
-if test "x$with_java" = "xyes"; then
-  AC_CONFIG_FILES([tjbenchtest.java])
-  AC_CONFIG_FILES([tjexampletest])
-fi
-AC_CONFIG_FILES([libjpeg.map])
-AC_CONFIG_FILES([Makefile simd/Makefile])
-AC_CONFIG_FILES([java/Makefile])
-AC_CONFIG_FILES([md5/Makefile])
-AC_OUTPUT
diff --git a/djpeg.1 b/djpeg.1
index 0a89927..e4204b2 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -157,6 +157,9 @@
 .B \-onepass
 mode.
 .TP
+.BI \-icc " file"
+Extract ICC color management profile to the specified file.
+.TP
 .BI \-map " file"
 Quantize to the colors used in the specified image file.  This is useful for
 producing multiple files with identical color maps, or for forcing a
diff --git a/djpeg.c b/djpeg.c
index 96db401..920e90d 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -32,6 +32,10 @@
 #include "jversion.h"           /* for version message */
 #include "jconfigint.h"
 
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare free() */
+extern void free(void *ptr);
+#endif
+
 #include <ctype.h>              /* to declare isprint() */
 
 #ifdef USE_CCOMMAND             /* command-line reader for Macintosh */
@@ -47,7 +51,7 @@
 
 /* Create the add-on message string table. */
 
-#define JMESSAGE(code,string)   string ,
+#define JMESSAGE(code, string)  string,
 
 static const char * const cdjpeg_message_table[] = {
 #include "cderror.h"
@@ -63,13 +67,13 @@
  */
 
 typedef enum {
-        FMT_BMP,                /* BMP format (Windows flavor) */
-        FMT_GIF,                /* GIF format */
-        FMT_OS2,                /* BMP format (OS/2 flavor) */
-        FMT_PPM,                /* PPM/PGM (PBMPLUS formats) */
-        FMT_RLE,                /* RLE format */
-        FMT_TARGA,              /* Targa format */
-        FMT_TIFF                /* TIFF format */
+  FMT_BMP,                      /* BMP format (Windows flavor) */
+  FMT_GIF,                      /* GIF format */
+  FMT_OS2,                      /* BMP format (OS/2 flavor) */
+  FMT_PPM,                      /* PPM/PGM (PBMPLUS formats) */
+  FMT_RLE,                      /* RLE format */
+  FMT_TARGA,                    /* Targa format */
+  FMT_TIFF                      /* TIFF format */
 } IMAGE_FORMATS;
 
 #ifndef DEFAULT_FMT             /* so can override from CFLAGS in Makefile */
@@ -89,6 +93,7 @@
 
 
 static const char *progname;    /* program name for error messages */
+static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memsrc;                 /* for -memsrc switch */
 boolean skip, crop;
@@ -98,7 +103,7 @@
 
 
 LOCAL(void)
-usage (void)
+usage(void)
 /* complain about bad command line */
 {
   fprintf(stderr, "usage: %s [switches] ", progname);
@@ -157,6 +162,7 @@
   fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
   fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
   fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
+  fprintf(stderr, "  -icc FILE      Extract ICC profile to FILE\n");
 #ifdef QUANT_2PASS_SUPPORTED
   fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
 #endif
@@ -180,8 +186,8 @@
 
 
 LOCAL(int)
-parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
-                int last_file_arg_seen, boolean for_real)
+parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
+               int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -196,6 +202,7 @@
 
   /* Set up default JPEG parameters. */
   requested_fmt = DEFAULT_FMT;  /* set default output file format */
+  icc_filename = NULL;
   outfilename = NULL;
   memsrc = FALSE;
   skip = FALSE;
@@ -263,7 +270,7 @@
       /* On first -d, print version identification */
       static boolean printed_version = FALSE;
 
-      if (! printed_version) {
+      if (!printed_version) {
         fprintf(stderr, "%s version %s (build %s)\n",
                 PACKAGE_NAME, VERSION, BUILD);
         fprintf(stderr, "%s\n\n", JCOPYRIGHT);
@@ -282,7 +289,7 @@
       /* Select recommended processing options for quick-and-dirty output. */
       cinfo->two_pass_quantize = FALSE;
       cinfo->dither_mode = JDITHER_ORDERED;
-      if (! cinfo->quantize_colors) /* don't override an earlier -colors */
+      if (!cinfo->quantize_colors) /* don't override an earlier -colors */
         cinfo->desired_number_of_colors = 216;
       cinfo->dct_method = JDCT_FASTEST;
       cinfo->do_fancy_upsampling = FALSE;
@@ -291,7 +298,8 @@
       /* GIF output format. */
       requested_fmt = FMT_GIF;
 
-    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+    } else if (keymatch(arg, "grayscale", 2) ||
+               keymatch(arg, "greyscale", 2)) {
       /* Force monochrome output. */
       cinfo->out_color_space = JCS_GRAYSCALE;
 
@@ -303,6 +311,13 @@
       /* Force RGB565 output. */
       cinfo->out_color_space = JCS_RGB565;
 
+    } else if (keymatch(arg, "icc", 1)) {
+      /* Set ICC filename. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      icc_filename = argv[argn];
+      jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
+
     } else if (keymatch(arg, "map", 3)) {
       /* Quantize to a color map taken from an input file. */
       if (++argn >= argc)       /* advance to next argument */
@@ -419,13 +434,13 @@
  */
 
 LOCAL(unsigned int)
-jpeg_getc (j_decompress_ptr cinfo)
+jpeg_getc(j_decompress_ptr cinfo)
 /* Read next byte */
 {
   struct jpeg_source_mgr *datasrc = cinfo->src;
 
   if (datasrc->bytes_in_buffer == 0) {
-    if (! (*datasrc->fill_input_buffer) (cinfo))
+    if (!(*datasrc->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
   datasrc->bytes_in_buffer--;
@@ -434,7 +449,7 @@
 
 
 METHODDEF(boolean)
-print_text_marker (j_decompress_ptr cinfo)
+print_text_marker(j_decompress_ptr cinfo)
 {
   boolean traceit = (cinfo->err->trace_level >= 1);
   long length;
@@ -447,10 +462,10 @@
 
   if (traceit) {
     if (cinfo->unread_marker == JPEG_COM)
-      fprintf(stderr, "Comment, length %ld:\n", (long) length);
+      fprintf(stderr, "Comment, length %ld:\n", (long)length);
     else                        /* assume it is an APPn otherwise */
       fprintf(stderr, "APP%d, length %ld:\n",
-              cinfo->unread_marker - JPEG_APP0, (long) length);
+              cinfo->unread_marker - JPEG_APP0, (long)length);
   }
 
   while (--length >= 0) {
@@ -489,7 +504,7 @@
  */
 
 int
-main (int argc, char **argv)
+main(int argc, char **argv)
 {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
@@ -528,7 +543,7 @@
    * but don't try to override APP0 or APP14 this way (see libjpeg.txt).
    */
   jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
-  jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
+  jpeg_set_marker_processor(&cinfo, JPEG_APP0 + 12, print_text_marker);
 
   /* Scan command line to find file names. */
   /* It is convenient to use just one switch-parsing routine, but the switch
@@ -543,14 +558,14 @@
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
-    if (file_index != argc-2) {
+    if (file_index != argc - 2) {
       fprintf(stderr, "%s: must name one input and one output file\n",
               progname);
       usage();
     }
-    outfilename = argv[file_index+1];
+    outfilename = argv[file_index + 1];
   } else {
-    if (file_index != argc-1) {
+    if (file_index != argc - 1) {
       fprintf(stderr, "%s: must name one input and one output file\n",
               progname);
       usage();
@@ -558,7 +573,7 @@
   }
 #else
   /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
+  if (file_index < argc - 1) {
     fprintf(stderr, "%s: only one input file\n", progname);
     usage();
   }
@@ -587,7 +602,7 @@
   }
 
 #ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+  start_progress_monitor((j_common_ptr)&cinfo, &progress);
 #endif
 
   /* Specify data source for decompression */
@@ -617,7 +632,7 @@
     jpeg_stdio_src(&cinfo, input_file);
 
   /* Read file header, set default decompression parameters */
-  (void) jpeg_read_header(&cinfo, TRUE);
+  (void)jpeg_read_header(&cinfo, TRUE);
 
   /* Adjust default decompression parameters by re-parsing the options */
   file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
@@ -628,10 +643,10 @@
   switch (requested_fmt) {
 #ifdef BMP_SUPPORTED
   case FMT_BMP:
-    dest_mgr = jinit_write_bmp(&cinfo, FALSE);
+    dest_mgr = jinit_write_bmp(&cinfo, FALSE, TRUE);
     break;
   case FMT_OS2:
-    dest_mgr = jinit_write_bmp(&cinfo, TRUE);
+    dest_mgr = jinit_write_bmp(&cinfo, TRUE, TRUE);
     break;
 #endif
 #ifdef GIF_SUPPORTED
@@ -661,7 +676,7 @@
   dest_mgr->output_file = output_file;
 
   /* Start decompressor */
-  (void) jpeg_start_decompress(&cinfo);
+  (void)jpeg_start_decompress(&cinfo);
 
   /* Skip rows */
   if (skip) {
@@ -755,12 +770,35 @@
   progress.pub.completed_passes = progress.pub.total_passes;
 #endif
 
+  if (icc_filename != NULL) {   /* -icc given: save the embedded profile */
+    FILE *icc_file;
+    JOCTET *icc_profile;        /* released with free() once written */
+    unsigned int icc_len;
+
+    if ((icc_file = fopen(icc_filename, WRITE_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if (jpeg_read_icc_profile(&cinfo, &icc_profile, &icc_len)) {
+      if (fwrite(icc_profile, icc_len, 1, icc_file) < 1) {
+        fprintf(stderr, "%s: can't write ICC profile to %s\n", progname,
+                icc_filename);
+        free(icc_profile);
+        fclose(icc_file);
+        exit(EXIT_FAILURE);
+      }
+      free(icc_profile);
+    } else if (cinfo.err->msg_code != JWRN_BOGUS_ICC)
+      fprintf(stderr, "%s: no ICC profile data in JPEG file\n", progname);
+    fclose(icc_file);           /* close on every path, profile or not */
+  }
+
   /* Finish decompression and release memory.
    * I must do it in this order because output module has allocated memory
    * of lifespan JPOOL_IMAGE; it needs to finish before releasing memory.
    */
   (*dest_mgr->finish_output) (&cinfo, dest_mgr);
-  (void) jpeg_finish_decompress(&cinfo);
+  (void)jpeg_finish_decompress(&cinfo);
   jpeg_destroy_decompress(&cinfo);
 
   /* Close files, if we opened them */
@@ -770,7 +808,7 @@
     fclose(output_file);
 
 #ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &cinfo);
+  end_progress_monitor((j_common_ptr)&cinfo);
 #endif
 
   if (memsrc && inbuffer != NULL)
diff --git a/doc/html/annotated.html b/doc/html/annotated.html
index d0b0e1e..8172f55 100644
--- a/doc/html/annotated.html
+++ b/doc/html/annotated.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/classes.html b/doc/html/classes.html
index 275e96d..bc23e51 100644
--- a/doc/html/classes.html
+++ b/doc/html/classes.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions.html b/doc/html/functions.html
index 31d78f5..570debd 100644
--- a/doc/html/functions.html
+++ b/doc/html/functions.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/functions_vars.html b/doc/html/functions_vars.html
index 8373eac..7d45f87 100644
--- a/doc/html/functions_vars.html
+++ b/doc/html/functions_vars.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/group___turbo_j_p_e_g.html b/doc/html/group___turbo_j_p_e_g.html
index 89780d4..40f8b28 100644
--- a/doc/html/group___turbo_j_p_e_g.html
+++ b/doc/html/group___turbo_j_p_e_g.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
@@ -128,6 +128,15 @@
 <tr class="memitem:gacb233cfd722d66d1ccbf48a7de81f0e0"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gacb233cfd722d66d1ccbf48a7de81f0e0">TJFLAG_ACCURATEDCT</a></td></tr>
 <tr class="memdesc:gacb233cfd722d66d1ccbf48a7de81f0e0"><td class="mdescLeft">&#160;</td><td class="mdescRight">Use the most accurate DCT/IDCT algorithm available in the underlying codec.  <a href="#gacb233cfd722d66d1ccbf48a7de81f0e0">More...</a><br/></td></tr>
 <tr class="separator:gacb233cfd722d66d1ccbf48a7de81f0e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga519cfa4ef6c18d9e5b455fdf59306a3a"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga519cfa4ef6c18d9e5b455fdf59306a3a">TJFLAG_STOPONWARNING</a></td></tr>
+<tr class="memdesc:ga519cfa4ef6c18d9e5b455fdf59306a3a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Immediately discontinue the current compression/decompression/transform operation if the underlying codec throws a warning (non-fatal error).  <a href="#ga519cfa4ef6c18d9e5b455fdf59306a3a">More...</a><br/></td></tr>
+<tr class="separator:ga519cfa4ef6c18d9e5b455fdf59306a3a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga43b426750b46190a25d34a67ef76df1b"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga43b426750b46190a25d34a67ef76df1b">TJFLAG_PROGRESSIVE</a></td></tr>
+<tr class="memdesc:ga43b426750b46190a25d34a67ef76df1b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Use progressive entropy coding in JPEG images generated by the compression and transform functions.  <a href="#ga43b426750b46190a25d34a67ef76df1b">More...</a><br/></td></tr>
+<tr class="separator:ga43b426750b46190a25d34a67ef76df1b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga79bde1b4a3e2351e00887e47781b966e"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga79bde1b4a3e2351e00887e47781b966e">TJ_NUMERR</a></td></tr>
+<tr class="memdesc:ga79bde1b4a3e2351e00887e47781b966e"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of error codes.  <a href="#ga79bde1b4a3e2351e00887e47781b966e">More...</a><br/></td></tr>
+<tr class="separator:ga79bde1b4a3e2351e00887e47781b966e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga0f6dbd18adf38b7d46ac547f0f4d562c"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0f6dbd18adf38b7d46ac547f0f4d562c">TJ_NUMXOP</a></td></tr>
 <tr class="memdesc:ga0f6dbd18adf38b7d46ac547f0f4d562c"><td class="mdescLeft">&#160;</td><td class="mdescRight">The number of transform operations.  <a href="#ga0f6dbd18adf38b7d46ac547f0f4d562c">More...</a><br/></td></tr>
 <tr class="separator:ga0f6dbd18adf38b7d46ac547f0f4d562c"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -146,6 +155,12 @@
 <tr class="memitem:gafbf992bbf6e006705886333703ffab31"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gafbf992bbf6e006705886333703ffab31">TJXOPT_NOOUTPUT</a></td></tr>
 <tr class="memdesc:gafbf992bbf6e006705886333703ffab31"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from outputting a JPEG image for this particular transform (this can be used in conjunction with a custom filter to capture the transformed DCT coefficients without transcoding them.)  <a href="#gafbf992bbf6e006705886333703ffab31">More...</a><br/></td></tr>
 <tr class="separator:gafbf992bbf6e006705886333703ffab31"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gad2371c80674584ecc1a7d75e564cf026"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad2371c80674584ecc1a7d75e564cf026">TJXOPT_PROGRESSIVE</a></td></tr>
+<tr class="memdesc:gad2371c80674584ecc1a7d75e564cf026"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will enable progressive entropy coding in the output image generated by this particular transform.  <a href="#gad2371c80674584ecc1a7d75e564cf026">More...</a><br/></td></tr>
+<tr class="separator:gad2371c80674584ecc1a7d75e564cf026"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga153b468cfb905d0de61706c838986fe8"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga153b468cfb905d0de61706c838986fe8">TJXOPT_COPYNONE</a></td></tr>
+<tr class="memdesc:ga153b468cfb905d0de61706c838986fe8"><td class="mdescLeft">&#160;</td><td class="mdescRight">This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from copying any extra markers (including EXIF and ICC profile data) from the source image to the output image.  <a href="#ga153b468cfb905d0de61706c838986fe8">More...</a><br/></td></tr>
+<tr class="separator:ga153b468cfb905d0de61706c838986fe8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga0aba955473315e405295d978f0c16511"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511">TJPAD</a>(width)</td></tr>
 <tr class="memdesc:ga0aba955473315e405295d978f0c16511"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pad the given width to the nearest 32-bit boundary.  <a href="#ga0aba955473315e405295d978f0c16511">More...</a><br/></td></tr>
 <tr class="separator:ga0aba955473315e405295d978f0c16511"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -190,7 +205,9 @@
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aac037ff1845cf9b74bb81a3659c2b9fb4">TJPF_BGRA</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081">TJPF_ABGR</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c">TJPF_ARGB</a>, 
-<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a>
+<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a>, 
+<br/>
+&#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa84c1a6cead7952998e2fb895844a21ed">TJPF_UNKNOWN</a>
 <br/>
  }</td></tr>
 <tr class="memdesc:gac916144e26c3817ac514e64ae5d12e2a"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pixel formats.  <a href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">More...</a><br/></td></tr>
@@ -206,6 +223,11 @@
  }</td></tr>
 <tr class="memdesc:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="mdescLeft">&#160;</td><td class="mdescRight">JPEG colorspaces.  <a href="group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720">More...</a><br/></td></tr>
 <tr class="separator:ga4f83ad3368e0e29d1957be0efa7c3720"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:gafbc17cfa57d0d5d11fea35ac025950fe"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe">TJERR</a> { <a class="el" href="group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950fea342dd6e2aedb47bb257b4e7568329b59">TJERR_WARNING</a>, 
+<a class="el" href="group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950feafc9cceeada13122b09e4851e3788039a">TJERR_FATAL</a>
+ }</td></tr>
+<tr class="memdesc:gafbc17cfa57d0d5d11fea35ac025950fe"><td class="mdescLeft">&#160;</td><td class="mdescRight">Error codes.  <a href="group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe">More...</a><br/></td></tr>
+<tr class="separator:gafbc17cfa57d0d5d11fea35ac025950fe"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga2de531af4e7e6c4f124908376b354866"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866">TJXOP</a> { <br/>
 &#160;&#160;<a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27">TJXOP_NONE</a>, 
 <a class="el" href="group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce">TJXOP_HFLIP</a>, 
@@ -292,12 +314,21 @@
 <tr class="memitem:ga5c9234bda6d993cdaffdd89bf81a00ff"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned char *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff">tjAlloc</a> (int bytes)</td></tr>
 <tr class="memdesc:ga5c9234bda6d993cdaffdd89bf81a00ff"><td class="mdescLeft">&#160;</td><td class="mdescRight">Allocate an image buffer for use with TurboJPEG.  <a href="#ga5c9234bda6d993cdaffdd89bf81a00ff">More...</a><br/></td></tr>
 <tr class="separator:ga5c9234bda6d993cdaffdd89bf81a00ff"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga144b981d6b281ecca4cbb4709de75749"><td class="memItemLeft" align="right" valign="top">DLLEXPORT unsigned char *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga144b981d6b281ecca4cbb4709de75749">tjLoadImage</a> (const char *filename, int *width, int align, int *height, int *pixelFormat, int flags)</td></tr>
+<tr class="memdesc:ga144b981d6b281ecca4cbb4709de75749"><td class="mdescLeft">&#160;</td><td class="mdescRight">Load an uncompressed image from disk into memory.  <a href="#ga144b981d6b281ecca4cbb4709de75749">More...</a><br/></td></tr>
+<tr class="separator:ga144b981d6b281ecca4cbb4709de75749"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga2e78b7b79796e74584028da880a6a29c"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga2e78b7b79796e74584028da880a6a29c">tjSaveImage</a> (const char *filename, unsigned char *buffer, int width, int pitch, int height, int pixelFormat, int flags)</td></tr>
+<tr class="memdesc:ga2e78b7b79796e74584028da880a6a29c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Save an uncompressed image from memory to disk.  <a href="#ga2e78b7b79796e74584028da880a6a29c">More...</a><br/></td></tr>
+<tr class="separator:ga2e78b7b79796e74584028da880a6a29c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga8c4a1231dc06a450514c835f6471f137"><td class="memItemLeft" align="right" valign="top">DLLEXPORT void DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137">tjFree</a> (unsigned char *buffer)</td></tr>
 <tr class="memdesc:ga8c4a1231dc06a450514c835f6471f137"><td class="mdescLeft">&#160;</td><td class="mdescRight">Free an image buffer previously allocated by TurboJPEG.  <a href="#ga8c4a1231dc06a450514c835f6471f137">More...</a><br/></td></tr>
 <tr class="separator:ga8c4a1231dc06a450514c835f6471f137"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga9af79c908ec131b1ae8d52fe40375abf"><td class="memItemLeft" align="right" valign="top">DLLEXPORT char *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf">tjGetErrorStr</a> (void)</td></tr>
-<tr class="memdesc:ga9af79c908ec131b1ae8d52fe40375abf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a descriptive error message explaining why the last command failed.  <a href="#ga9af79c908ec131b1ae8d52fe40375abf">More...</a><br/></td></tr>
-<tr class="separator:ga9af79c908ec131b1ae8d52fe40375abf"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga94a235bd4f1088f61ad87b4eadb64c9c"><td class="memItemLeft" align="right" valign="top">DLLEXPORT char *DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c">tjGetErrorStr2</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle)</td></tr>
+<tr class="memdesc:ga94a235bd4f1088f61ad87b4eadb64c9c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a descriptive error message explaining why the last command failed.  <a href="#ga94a235bd4f1088f61ad87b4eadb64c9c">More...</a><br/></td></tr>
+<tr class="separator:ga94a235bd4f1088f61ad87b4eadb64c9c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga0be00a62bd1be897f170fa1fed5fb4cb"><td class="memItemLeft" align="right" valign="top">DLLEXPORT int DLLCALL&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb">tjGetErrorCode</a> (<a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a> handle)</td></tr>
+<tr class="memdesc:ga0be00a62bd1be897f170fa1fed5fb4cb"><td class="mdescLeft">&#160;</td><td class="mdescRight">Returns a code indicating the severity of the last error.  <a href="#ga0be00a62bd1be897f170fa1fed5fb4cb">More...</a><br/></td></tr>
+<tr class="separator:ga0be00a62bd1be897f170fa1fed5fb4cb"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="var-members"></a>
 Variables</h2></td></tr>
@@ -316,6 +347,9 @@
 <tr class="memitem:ga84e2e35d3f08025f976ec1ec53693dea"><td class="memItemLeft" align="right" valign="top">static const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea">tjBlueOffset</a> [<a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a>]</td></tr>
 <tr class="memdesc:ga84e2e35d3f08025f976ec1ec53693dea"><td class="mdescLeft">&#160;</td><td class="mdescRight">Blue offset (in bytes) for a given pixel format.  <a href="#ga84e2e35d3f08025f976ec1ec53693dea">More...</a><br/></td></tr>
 <tr class="separator:ga84e2e35d3f08025f976ec1ec53693dea"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga5af0ab065feefd526debf1e20c43e837"><td class="memItemLeft" align="right" valign="top">static const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#ga5af0ab065feefd526debf1e20c43e837">tjAlphaOffset</a> [<a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a>]</td></tr>
+<tr class="memdesc:ga5af0ab065feefd526debf1e20c43e837"><td class="mdescLeft">&#160;</td><td class="mdescRight">Alpha offset (in bytes) for a given pixel format.  <a href="#ga5af0ab065feefd526debf1e20c43e837">More...</a><br/></td></tr>
+<tr class="separator:ga5af0ab065feefd526debf1e20c43e837"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:gad77cf8fe5b2bfd3cb3f53098146abb4c"><td class="memItemLeft" align="right" valign="top">static const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c">tjPixelSize</a> [<a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a>]</td></tr>
 <tr class="memdesc:gad77cf8fe5b2bfd3cb3f53098146abb4c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Pixel size (in bytes) for a given pixel format.  <a href="#gad77cf8fe5b2bfd3cb3f53098146abb4c">More...</a><br/></td></tr>
 <tr class="separator:gad77cf8fe5b2bfd3cb3f53098146abb4c"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -343,6 +377,20 @@
 
 </div>
 </div>
+<a class="anchor" id="ga79bde1b4a3e2351e00887e47781b966e"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJ_NUMERR</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>The number of error codes. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga7010a4402f54a45ba822ad8675a4655e"></a>
 <div class="memitem">
 <div class="memproto">
@@ -459,6 +507,36 @@
 
 </div>
 </div>
+<a class="anchor" id="ga43b426750b46190a25d34a67ef76df1b"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJFLAG_PROGRESSIVE</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Use progressive entropy coding in JPEG images generated by the compression and transform functions. </p>
+<p>Progressive entropy coding will generally improve compression relative to baseline entropy coding (the default), but it will reduce compression and decompression performance considerably. </p>
+
+</div>
+</div>
+<a class="anchor" id="ga519cfa4ef6c18d9e5b455fdf59306a3a"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJFLAG_STOPONWARNING</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Immediately discontinue the current compression/decompression/transform operation if the underlying codec throws a warning (non-fatal error). </p>
+<p>The default behavior is to allow the operation to complete unless a fatal error is encountered. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga0aba955473315e405295d978f0c16511"></a>
 <div class="memitem">
 <div class="memproto">
@@ -506,6 +584,20 @@
 
 </div>
 </div>
+<a class="anchor" id="ga153b468cfb905d0de61706c838986fe8"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJXOPT_COPYNONE</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>This option will prevent <a class="el" href="group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a" title="Losslessly transform a JPEG image into another JPEG image.">tjTransform()</a> from copying any extra markers (including EXIF and ICC profile data) from the source image to the output image. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga9c771a757fc1294add611906b89ab2d2"></a>
 <div class="memitem">
 <div class="memproto">
@@ -564,6 +656,21 @@
 
 </div>
 </div>
+<a class="anchor" id="gad2371c80674584ecc1a7d75e564cf026"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define TJXOPT_PROGRESSIVE</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>This option will enable progressive entropy coding in the output image generated by this particular transform. </p>
+<p>Progressive entropy coding will generally improve compression relative to baseline entropy coding (the default), but it will reduce compression and decompression performance considerably. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga319826b7eb1583c0595bbe7b95428709"></a>
 <div class="memitem">
 <div class="memproto">
@@ -644,6 +751,28 @@
 
 </div>
 </div>
+<a class="anchor" id="gafbc17cfa57d0d5d11fea35ac025950fe"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">enum <a class="el" href="group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe">TJERR</a></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Error codes. </p>
+<table class="fieldtable">
+<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><em><a class="anchor" id="ggafbc17cfa57d0d5d11fea35ac025950fea342dd6e2aedb47bb257b4e7568329b59"></a>TJERR_WARNING</em>&nbsp;</td><td class="fielddoc">
+<p>The error was non-fatal and recoverable, but the image may still be corrupt. </p>
+</td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="ggafbc17cfa57d0d5d11fea35ac025950feafc9cceeada13122b09e4851e3788039a"></a>TJERR_FATAL</em>&nbsp;</td><td class="fielddoc">
+<p>The error was fatal and non-recoverable. </p>
+</td></tr>
+</table>
+
+</div>
+</div>
 <a class="anchor" id="gac916144e26c3817ac514e64ae5d12e2a"></a>
 <div class="memitem">
 <div class="memproto">
@@ -704,6 +833,10 @@
 <p>CMYK pixel format. </p>
 <p>Unlike RGB, which is an additive color model used primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive color model used primarily for printing. In the CMYK color model, the value of each color component typically corresponds to an amount of cyan, magenta, yellow, or black ink that is applied to a white background. In order to convert between CMYK and RGB, it is necessary to use a color management system (CMS.) A CMS will attempt to map colors within the printer's gamut to perceptually similar colors in the display's gamut and vice versa, but the mapping is typically not 1:1 or reversible, nor can it be defined with a simple formula. Thus, such a conversion is out of scope for a codec library. However, the TurboJPEG API allows for compressing CMYK pixels into a YCCK JPEG image (see <a class="el" href="group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e" title="YCCK colorspace.">TJCS_YCCK</a>) and decompressing YCCK JPEG images into CMYK pixels. </p>
 </td></tr>
+<tr><td class="fieldname"><em><a class="anchor" id="ggac916144e26c3817ac514e64ae5d12e2aa84c1a6cead7952998e2fb895844a21ed"></a>TJPF_UNKNOWN</em>&nbsp;</td><td class="fielddoc">
+<p>Unknown pixel format. </p>
+<p>Currently this is only used by <a class="el" href="group___turbo_j_p_e_g.html#ga144b981d6b281ecca4cbb4709de75749" title="Load an uncompressed image from disk into memory.">tjLoadImage()</a>. </p>
+</td></tr>
 </table>
 
 </div>
@@ -1018,7 +1151,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1115,7 +1248,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1212,7 +1345,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1305,7 +1438,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1398,7 +1531,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1483,7 +1616,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1554,7 +1687,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1633,7 +1766,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1712,7 +1845,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1737,7 +1870,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1830,7 +1963,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1923,7 +2056,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -1953,21 +2086,53 @@
 
 </div>
 </div>
-<a class="anchor" id="ga9af79c908ec131b1ae8d52fe40375abf"></a>
+<a class="anchor" id="ga0be00a62bd1be897f170fa1fed5fb4cb"></a>
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">DLLEXPORT char* DLLCALL tjGetErrorStr </td>
+          <td class="memname">DLLEXPORT int DLLCALL tjGetErrorCode </td>
           <td>(</td>
-          <td class="paramtype">void&#160;</td>
-          <td class="paramname"></td><td>)</td>
+          <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
+          <td class="paramname"><em>handle</em></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Returns a code indicating the severity of the last error. </p>
+<p>See <a class="el" href="group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe">Error codes</a>.</p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor, decompressor, or transformer instance</td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>a code indicating the severity of the last error. See <a class="el" href="group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe">Error codes</a>. </dd></dl>
+
+</div>
+</div>
+<a class="anchor" id="ga94a235bd4f1088f61ad87b4eadb64c9c"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">DLLEXPORT char* DLLCALL tjGetErrorStr2 </td>
+          <td>(</td>
+          <td class="paramtype"><a class="el" href="group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763">tjhandle</a>&#160;</td>
+          <td class="paramname"><em>handle</em></td><td>)</td>
           <td></td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 <p>Returns a descriptive error message explaining why the last command failed. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">handle</td><td>a handle to a TurboJPEG compressor, decompressor, or transformer instance, or NULL if the error was generated by a global function (but note that retrieving the error message for a global function is not thread-safe.)</td></tr>
+  </table>
+  </dd>
+</dl>
 <dl class="section return"><dt>Returns</dt><dd>a descriptive error message explaining why the last command failed. </dd></dl>
 
 </div>
@@ -1993,7 +2158,7 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>a pointer to a list of fractional scaling factors, or NULL if an error is encountered (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to a list of fractional scaling factors, or NULL if an error is encountered (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -2012,7 +2177,7 @@
 </div><div class="memdoc">
 
 <p>Create a TurboJPEG compressor instance. </p>
-<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -2031,7 +2196,7 @@
 </div><div class="memdoc">
 
 <p>Create a TurboJPEG decompressor instance. </p>
-<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
 
 </div>
 </div>
@@ -2050,7 +2215,77 @@
 </div><div class="memdoc">
 
 <p>Create a new TurboJPEG transformer instance. </p>
-<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>a handle to the newly-created instance, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
+
+</div>
+</div>
+<a class="anchor" id="ga144b981d6b281ecca4cbb4709de75749"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">DLLEXPORT unsigned char* DLLCALL tjLoadImage </td>
+          <td>(</td>
+          <td class="paramtype">const char *&#160;</td>
+          <td class="paramname"><em>filename</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int *&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>align</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int *&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int *&#160;</td>
+          <td class="paramname"><em>pixelFormat</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>flags</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Load an uncompressed image from disk into memory. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">filename</td><td>name of a file containing an uncompressed image in Windows BMP or PBMPLUS (PPM/PGM) format</td></tr>
+    <tr><td class="paramname">width</td><td>pointer to an integer variable that will receive the width (in pixels) of the uncompressed image</td></tr>
+    <tr><td class="paramname">align</td><td>row alignment of the image buffer to be returned (must be a power of 2.) For instance, setting this parameter to 4 will cause all rows in the image buffer to be padded to the nearest 32-bit boundary, and setting this parameter to 1 will cause all rows in the image buffer to be unpadded.</td></tr>
+    <tr><td class="paramname">height</td><td>pointer to an integer variable that will receive the height (in pixels) of the uncompressed image</td></tr>
+    <tr><td class="paramname">pixelFormat</td><td>pointer to an integer variable that specifies or will receive the pixel format of the uncompressed image buffer. The behavior of <a class="el" href="group___turbo_j_p_e_g.html#ga144b981d6b281ecca4cbb4709de75749" title="Load an uncompressed image from disk into memory.">tjLoadImage()</a> will vary depending on the value of <code>*pixelFormat</code> passed to the function:<ul>
+<li><a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa84c1a6cead7952998e2fb895844a21ed">TJPF_UNKNOWN</a> : The uncompressed image buffer returned by the function will use the optimal pixel format for the file type, and <code>*pixelFormat</code> will contain the ID of this pixel format upon successful return from the function.</li>
+<li><a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a">TJPF_GRAY</a> : Only PGM files and 8-bit BMP files with a grayscale colormap can be loaded.</li>
+<li><a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a> : The RGB or grayscale pixels stored in the file will be converted using a quick &amp; dirty algorithm that is suitable only for testing purposes (proper conversion between CMYK and other formats requires a color management system.)</li>
+<li>Other <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">pixel formats</a> : The uncompressed image buffer will use the specified pixel format, and pixel format conversion will be performed if necessary.</li>
+</ul>
+</td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>a pointer to a newly-allocated buffer containing the uncompressed image, converted to the chosen pixel format and with the chosen row alignment, or NULL if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) This buffer should be freed using <a class="el" href="group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137" title="Free an image buffer previously allocated by TurboJPEG.">tjFree()</a>. </dd></dl>
 
 </div>
 </div>
@@ -2199,6 +2434,77 @@
 
 </div>
 </div>
+<a class="anchor" id="ga2e78b7b79796e74584028da880a6a29c"></a>
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">DLLEXPORT int DLLCALL tjSaveImage </td>
+          <td>(</td>
+          <td class="paramtype">const char *&#160;</td>
+          <td class="paramname"><em>filename</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">unsigned char *&#160;</td>
+          <td class="paramname"><em>buffer</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>width</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pitch</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>height</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>pixelFormat</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int&#160;</td>
+          <td class="paramname"><em>flags</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Save an uncompressed image from memory to disk. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramname">filename</td><td>name of a file to which to save the uncompressed image. The image will be stored in Windows BMP or PBMPLUS (PPM/PGM) format, depending on the file extension.</td></tr>
+    <tr><td class="paramname">buffer</td><td>pointer to an image buffer containing RGB, grayscale, or CMYK pixels to be saved</td></tr>
+    <tr><td class="paramname">width</td><td>width (in pixels) of the uncompressed image</td></tr>
+    <tr><td class="paramname">pitch</td><td>bytes per line in the image buffer. Setting this parameter to 0 is the equivalent of setting it to <code>width * <a class="el" href="group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c" title="Pixel size (in bytes) for a given pixel format.">tjPixelSize</a>[pixelFormat]</code>.</td></tr>
+    <tr><td class="paramname">height</td><td>height (in pixels) of the uncompressed image</td></tr>
+    <tr><td class="paramname">pixelFormat</td><td>pixel format of the image buffer (see <a class="el" href="group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a">Pixel formats</a>.) If this parameter is set to <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa5431b54b015337705f13118073711a1a">TJPF_GRAY</a>, then the image will be stored in PGM or 8-bit (indexed color) BMP format. Otherwise, the image will be stored in PPM or 24-bit BMP format. If this parameter is set to <a class="el" href="group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7f5100ec44c91994e243f1cf55553f8b">TJPF_CMYK</a>, then the CMYK pixels will be converted to RGB using a quick &amp; dirty algorithm that is suitable only for testing (proper conversion between CMYK and other formats requires a color management system.)</td></tr>
+    <tr><td class="paramname">flags</td><td>the bitwise OR of one or more of the <a class="el" href="group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec">flags</a>.</td></tr>
+  </table>
+  </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a>.) </dd></dl>
+
+</div>
+</div>
 <a class="anchor" id="gad02cd42b69f193a0623a9c801788df3a"></a>
 <div class="memitem">
 <div class="memproto">
@@ -2279,11 +2585,34 @@
   </table>
   </dd>
 </dl>
-<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr()</a>.) </dd></dl>
+<dl class="section return"><dt>Returns</dt><dd>0 if successful, or -1 if an error occurred (see <a class="el" href="group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c" title="Returns a descriptive error message explaining why the last command failed.">tjGetErrorStr2()</a> and <a class="el" href="group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb" title="Returns a code indicating the severity of the last error.">tjGetErrorCode()</a>.) </dd></dl>
 
 </div>
 </div>
 <h2 class="groupheader">Variable Documentation</h2>
+<a class="anchor" id="ga5af0ab065feefd526debf1e20c43e837"></a>
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">const int tjAlphaOffset[<a class="el" href="group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e">TJ_NUMPF</a>]</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Alpha offset (in bytes) for a given pixel format. </p>
+<p>This specifies the number of bytes that the alpha component is offset from the start of the pixel. For instance, if a pixel of format TJPF_BGRA is stored in <code>char pixel[]</code>, then the alpha component will be <code>pixel[tjAlphaOffset[TJPF_BGRA]]</code>. This will be -1 if the pixel format does not have an alpha component. </p>
+
+</div>
+</div>
 <a class="anchor" id="ga84e2e35d3f08025f976ec1ec53693dea"></a>
 <div class="memitem">
 <div class="memproto">
@@ -2303,7 +2632,7 @@
 </div><div class="memdoc">
 
 <p>Blue offset (in bytes) for a given pixel format. </p>
-<p>This specifies the number of bytes that the Blue component is offset from the start of the pixel. For instance, if a pixel of format TJ_BGRX is stored in <code>char pixel[]</code>, then the blue component will be <code>pixel[tjBlueOffset[TJ_BGRX]]</code>. </p>
+<p>This specifies the number of bytes that the blue component is offset from the start of the pixel. For instance, if a pixel of format TJPF_BGRX is stored in <code>char pixel[]</code>, then the blue component will be <code>pixel[tjBlueOffset[TJPF_BGRX]]</code>. This will be -1 if the pixel format does not have a blue component. </p>
 
 </div>
 </div>
@@ -2326,7 +2655,7 @@
 </div><div class="memdoc">
 
 <p>Green offset (in bytes) for a given pixel format. </p>
-<p>This specifies the number of bytes that the green component is offset from the start of the pixel. For instance, if a pixel of format TJ_BGRX is stored in <code>char pixel[]</code>, then the green component will be <code>pixel[tjGreenOffset[TJ_BGRX]]</code>. </p>
+<p>This specifies the number of bytes that the green component is offset from the start of the pixel. For instance, if a pixel of format TJPF_BGRX is stored in <code>char pixel[]</code>, then the green component will be <code>pixel[tjGreenOffset[TJPF_BGRX]]</code>. This will be -1 if the pixel format does not have a green component. </p>
 
 </div>
 </div>
@@ -2431,7 +2760,7 @@
 </div><div class="memdoc">
 
 <p>Red offset (in bytes) for a given pixel format. </p>
-<p>This specifies the number of bytes that the red component is offset from the start of the pixel. For instance, if a pixel of format TJ_BGRX is stored in <code>char pixel[]</code>, then the red component will be <code>pixel[tjRedOffset[TJ_BGRX]]</code>. </p>
+<p>This specifies the number of bytes that the red component is offset from the start of the pixel. For instance, if a pixel of format TJPF_BGRX is stored in <code>char pixel[]</code>, then the red component will be <code>pixel[tjRedOffset[TJPF_BGRX]]</code>. This will be -1 if the pixel format does not have a red component. </p>
 
 </div>
 </div>
diff --git a/doc/html/index.html b/doc/html/index.html
index 3cc1b3e..6b27f31 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/modules.html b/doc/html/modules.html
index 8e6f815..8b38151 100644
--- a/doc/html/modules.html
+++ b/doc/html/modules.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/search/all_74.js b/doc/html/search/all_74.js
index 444aaef..fe7e697 100644
--- a/doc/html/search/all_74.js
+++ b/doc/html/search/all_74.js
@@ -1,10 +1,12 @@
 var searchData=
 [
   ['tj_5fnumcs',['TJ_NUMCS',['../group___turbo_j_p_e_g.html#ga39f57a6fb02d9cf32e7b6890099b5a71',1,'turbojpeg.h']]],
+  ['tj_5fnumerr',['TJ_NUMERR',['../group___turbo_j_p_e_g.html#ga79bde1b4a3e2351e00887e47781b966e',1,'turbojpeg.h']]],
   ['tj_5fnumpf',['TJ_NUMPF',['../group___turbo_j_p_e_g.html#ga7010a4402f54a45ba822ad8675a4655e',1,'turbojpeg.h']]],
   ['tj_5fnumsamp',['TJ_NUMSAMP',['../group___turbo_j_p_e_g.html#ga5ef3d169162ce77ce348e292a0b7477c',1,'turbojpeg.h']]],
   ['tj_5fnumxop',['TJ_NUMXOP',['../group___turbo_j_p_e_g.html#ga0f6dbd18adf38b7d46ac547f0f4d562c',1,'turbojpeg.h']]],
   ['tjalloc',['tjAlloc',['../group___turbo_j_p_e_g.html#ga5c9234bda6d993cdaffdd89bf81a00ff',1,'turbojpeg.h']]],
+  ['tjalphaoffset',['tjAlphaOffset',['../group___turbo_j_p_e_g.html#ga5af0ab065feefd526debf1e20c43e837',1,'turbojpeg.h']]],
   ['tjblueoffset',['tjBlueOffset',['../group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea',1,'turbojpeg.h']]],
   ['tjbufsize',['tjBufSize',['../group___turbo_j_p_e_g.html#gaccc5bca7f12fcdcc302e6e1c6d4b311b',1,'turbojpeg.h']]],
   ['tjbufsizeyuv2',['tjBufSizeYUV2',['../group___turbo_j_p_e_g.html#gaf451664a62c1f6c7cc5a6401f32908c9',1,'turbojpeg.h']]],
@@ -26,19 +28,26 @@
   ['tjdestroy',['tjDestroy',['../group___turbo_j_p_e_g.html#ga674adee917b95ad4a896f1ba39e12540',1,'turbojpeg.h']]],
   ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#gaabe05acd734990053ad1294b5ef239aa',1,'turbojpeg.h']]],
   ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga8a65ed3bd12df57c219d46afbc9008f1',1,'turbojpeg.h']]],
+  ['tjerr',['TJERR',['../group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe',1,'turbojpeg.h']]],
+  ['tjerr_5ffatal',['TJERR_FATAL',['../group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950feafc9cceeada13122b09e4851e3788039a',1,'turbojpeg.h']]],
+  ['tjerr_5fwarning',['TJERR_WARNING',['../group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950fea342dd6e2aedb47bb257b4e7568329b59',1,'turbojpeg.h']]],
   ['tjflag_5faccuratedct',['TJFLAG_ACCURATEDCT',['../group___turbo_j_p_e_g.html#gacb233cfd722d66d1ccbf48a7de81f0e0',1,'turbojpeg.h']]],
   ['tjflag_5fbottomup',['TJFLAG_BOTTOMUP',['../group___turbo_j_p_e_g.html#ga72ecf4ebe6eb702d3c6f5ca27455e1ec',1,'turbojpeg.h']]],
   ['tjflag_5ffastdct',['TJFLAG_FASTDCT',['../group___turbo_j_p_e_g.html#gaabce235db80d3f698b27f36cbd453da2',1,'turbojpeg.h']]],
   ['tjflag_5ffastupsample',['TJFLAG_FASTUPSAMPLE',['../group___turbo_j_p_e_g.html#ga4ee4506c81177a06f77e2504a22efd2d',1,'turbojpeg.h']]],
   ['tjflag_5fnorealloc',['TJFLAG_NOREALLOC',['../group___turbo_j_p_e_g.html#ga8808d403c68b62aaa58a4c1e58e98963',1,'turbojpeg.h']]],
+  ['tjflag_5fprogressive',['TJFLAG_PROGRESSIVE',['../group___turbo_j_p_e_g.html#ga43b426750b46190a25d34a67ef76df1b',1,'turbojpeg.h']]],
+  ['tjflag_5fstoponwarning',['TJFLAG_STOPONWARNING',['../group___turbo_j_p_e_g.html#ga519cfa4ef6c18d9e5b455fdf59306a3a',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
-  ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
+  ['tjgeterrorcode',['tjGetErrorCode',['../group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb',1,'turbojpeg.h']]],
+  ['tjgeterrorstr2',['tjGetErrorStr2',['../group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c',1,'turbojpeg.h']]],
   ['tjgetscalingfactors',['tjGetScalingFactors',['../group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8',1,'turbojpeg.h']]],
   ['tjgreenoffset',['tjGreenOffset',['../group___turbo_j_p_e_g.html#ga82d6e35da441112a411da41923c0ba2f',1,'turbojpeg.h']]],
   ['tjhandle',['tjhandle',['../group___turbo_j_p_e_g.html#ga758d2634ecb4949de7815cba621f5763',1,'turbojpeg.h']]],
   ['tjinitcompress',['tjInitCompress',['../group___turbo_j_p_e_g.html#ga3d10c47fbe4a2489a2b30c931551d01a',1,'turbojpeg.h']]],
   ['tjinitdecompress',['tjInitDecompress',['../group___turbo_j_p_e_g.html#gae5408179d041e2a2f7199c8283cf649e',1,'turbojpeg.h']]],
   ['tjinittransform',['tjInitTransform',['../group___turbo_j_p_e_g.html#ga3155b775bfbac9dbba869b95a0367902',1,'turbojpeg.h']]],
+  ['tjloadimage',['tjLoadImage',['../group___turbo_j_p_e_g.html#ga144b981d6b281ecca4cbb4709de75749',1,'turbojpeg.h']]],
   ['tjmcuheight',['tjMCUHeight',['../group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf',1,'turbojpeg.h']]],
   ['tjmcuwidth',['tjMCUWidth',['../group___turbo_j_p_e_g.html#ga9e61e7cd47a15a173283ba94e781308c',1,'turbojpeg.h']]],
   ['tjpad',['TJPAD',['../group___turbo_j_p_e_g.html#ga0aba955473315e405295d978f0c16511',1,'turbojpeg.h']]],
@@ -53,6 +62,7 @@
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
   ['tjpf_5frgbx',['TJPF_RGBX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa83973bebb7e2dc6fa8bae89ff3f42e01',1,'turbojpeg.h']]],
+  ['tjpf_5funknown',['TJPF_UNKNOWN',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa84c1a6cead7952998e2fb895844a21ed',1,'turbojpeg.h']]],
   ['tjpf_5fxbgr',['TJPF_XBGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aaf6603b27147de47e212e75dac027b2af',1,'turbojpeg.h']]],
   ['tjpf_5fxrgb',['TJPF_XRGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84',1,'turbojpeg.h']]],
   ['tjpixelsize',['tjPixelSize',['../group___turbo_j_p_e_g.html#gad77cf8fe5b2bfd3cb3f53098146abb4c',1,'turbojpeg.h']]],
@@ -68,9 +78,10 @@
   ['tjsamp_5f440',['TJSAMP_440',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074accf740e6f3aa6ba20ba922cad13cb974',1,'turbojpeg.h']]],
   ['tjsamp_5f444',['TJSAMP_444',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074afb8da4f44197837bdec0a4f593dacae3',1,'turbojpeg.h']]],
   ['tjsamp_5fgray',['TJSAMP_GRAY',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a3f1c9504842ddc7a48d0f690754b6248',1,'turbojpeg.h']]],
+  ['tjsaveimage',['tjSaveImage',['../group___turbo_j_p_e_g.html#ga2e78b7b79796e74584028da880a6a29c',1,'turbojpeg.h']]],
   ['tjscaled',['TJSCALED',['../group___turbo_j_p_e_g.html#ga84878bb65404204743aa18cac02781df',1,'turbojpeg.h']]],
   ['tjscalingfactor',['tjscalingfactor',['../structtjscalingfactor.html',1,'']]],
-  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a',1,'tjTransform(tjhandle handle, const unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h']]],
+  ['tjtransform',['tjtransform',['../structtjtransform.html',1,'tjtransform'],['../group___turbo_j_p_e_g.html#gaa29f3189c41be12ec5dee7caec318a31',1,'tjtransform():&#160;turbojpeg.h'],['../group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a',1,'tjTransform(tjhandle handle, const unsigned char *jpegBuf, unsigned long jpegSize, int n, unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms, int flags):&#160;turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]],
   ['tjxop_5fhflip',['TJXOP_HFLIP',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aa0df69776caa30f0fa28e26332d311ce',1,'turbojpeg.h']]],
   ['tjxop_5fnone',['TJXOP_NONE',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866aad88c0366cd3f7d0eac9d7a3fa1c2c27',1,'turbojpeg.h']]],
@@ -80,10 +91,12 @@
   ['tjxop_5ftranspose',['TJXOP_TRANSPOSE',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866a31060aed199f886afdd417f80499c32d',1,'turbojpeg.h']]],
   ['tjxop_5ftransverse',['TJXOP_TRANSVERSE',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866af3b14d488aea6ece9e5b3df73a74d6a4',1,'turbojpeg.h']]],
   ['tjxop_5fvflip',['TJXOP_VFLIP',['../group___turbo_j_p_e_g.html#gga2de531af4e7e6c4f124908376b354866a324eddfbec53b7e691f61e56929d0d5d',1,'turbojpeg.h']]],
+  ['tjxopt_5fcopynone',['TJXOPT_COPYNONE',['../group___turbo_j_p_e_g.html#ga153b468cfb905d0de61706c838986fe8',1,'turbojpeg.h']]],
   ['tjxopt_5fcrop',['TJXOPT_CROP',['../group___turbo_j_p_e_g.html#ga9c771a757fc1294add611906b89ab2d2',1,'turbojpeg.h']]],
   ['tjxopt_5fgray',['TJXOPT_GRAY',['../group___turbo_j_p_e_g.html#ga3acee7b48ade1b99e5588736007c2589',1,'turbojpeg.h']]],
   ['tjxopt_5fnooutput',['TJXOPT_NOOUTPUT',['../group___turbo_j_p_e_g.html#gafbf992bbf6e006705886333703ffab31',1,'turbojpeg.h']]],
   ['tjxopt_5fperfect',['TJXOPT_PERFECT',['../group___turbo_j_p_e_g.html#ga50e03cb5ed115330e212417429600b00',1,'turbojpeg.h']]],
+  ['tjxopt_5fprogressive',['TJXOPT_PROGRESSIVE',['../group___turbo_j_p_e_g.html#gad2371c80674584ecc1a7d75e564cf026',1,'turbojpeg.h']]],
   ['tjxopt_5ftrim',['TJXOPT_TRIM',['../group___turbo_j_p_e_g.html#ga319826b7eb1583c0595bbe7b95428709',1,'turbojpeg.h']]],
   ['turbojpeg',['TurboJPEG',['../group___turbo_j_p_e_g.html',1,'']]]
 ];
diff --git a/doc/html/search/enums_74.js b/doc/html/search/enums_74.js
index 276aa24..19c20cf 100644
--- a/doc/html/search/enums_74.js
+++ b/doc/html/search/enums_74.js
@@ -1,6 +1,7 @@
 var searchData=
 [
   ['tjcs',['TJCS',['../group___turbo_j_p_e_g.html#ga4f83ad3368e0e29d1957be0efa7c3720',1,'turbojpeg.h']]],
+  ['tjerr',['TJERR',['../group___turbo_j_p_e_g.html#gafbc17cfa57d0d5d11fea35ac025950fe',1,'turbojpeg.h']]],
   ['tjpf',['TJPF',['../group___turbo_j_p_e_g.html#gac916144e26c3817ac514e64ae5d12e2a',1,'turbojpeg.h']]],
   ['tjsamp',['TJSAMP',['../group___turbo_j_p_e_g.html#ga1d047060ea80bb9820d540bb928e9074',1,'turbojpeg.h']]],
   ['tjxop',['TJXOP',['../group___turbo_j_p_e_g.html#ga2de531af4e7e6c4f124908376b354866',1,'turbojpeg.h']]]
diff --git a/doc/html/search/enumvalues_74.js b/doc/html/search/enumvalues_74.js
index 7dc2f8d..e683856 100644
--- a/doc/html/search/enumvalues_74.js
+++ b/doc/html/search/enumvalues_74.js
@@ -5,6 +5,8 @@
   ['tjcs_5frgb',['TJCS_RGB',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a677cb7ccb85c4038ac41964a2e09e555',1,'turbojpeg.h']]],
   ['tjcs_5fycbcr',['TJCS_YCbCr',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a7389b8f65bb387ffedce3efd0d78ec75',1,'turbojpeg.h']]],
   ['tjcs_5fycck',['TJCS_YCCK',['../group___turbo_j_p_e_g.html#gga4f83ad3368e0e29d1957be0efa7c3720a53839e0fe867b76b58d16b0a1a7c598e',1,'turbojpeg.h']]],
+  ['tjerr_5ffatal',['TJERR_FATAL',['../group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950feafc9cceeada13122b09e4851e3788039a',1,'turbojpeg.h']]],
+  ['tjerr_5fwarning',['TJERR_WARNING',['../group___turbo_j_p_e_g.html#ggafbc17cfa57d0d5d11fea35ac025950fea342dd6e2aedb47bb257b4e7568329b59',1,'turbojpeg.h']]],
   ['tjpf_5fabgr',['TJPF_ABGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa1ba1a7f1631dbeaa49a0a85fc4a40081',1,'turbojpeg.h']]],
   ['tjpf_5fargb',['TJPF_ARGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aae8f846ed9d9de99b6e1dfe448848765c',1,'turbojpeg.h']]],
   ['tjpf_5fbgr',['TJPF_BGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aab10624437fb8ef495a0b153e65749839',1,'turbojpeg.h']]],
@@ -15,6 +17,7 @@
   ['tjpf_5frgb',['TJPF_RGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa7ce93230bff449518ce387c17e6ed37c',1,'turbojpeg.h']]],
   ['tjpf_5frgba',['TJPF_RGBA',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa88d2e88fab67f6503cf972e14851cc12',1,'turbojpeg.h']]],
   ['tjpf_5frgbx',['TJPF_RGBX',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa83973bebb7e2dc6fa8bae89ff3f42e01',1,'turbojpeg.h']]],
+  ['tjpf_5funknown',['TJPF_UNKNOWN',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aa84c1a6cead7952998e2fb895844a21ed',1,'turbojpeg.h']]],
   ['tjpf_5fxbgr',['TJPF_XBGR',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aaf6603b27147de47e212e75dac027b2af',1,'turbojpeg.h']]],
   ['tjpf_5fxrgb',['TJPF_XRGB',['../group___turbo_j_p_e_g.html#ggac916144e26c3817ac514e64ae5d12e2aadae996905efcfa3b42a0bb3bea7f9d84',1,'turbojpeg.h']]],
   ['tjsamp_5f411',['TJSAMP_411',['../group___turbo_j_p_e_g.html#gga1d047060ea80bb9820d540bb928e9074a28ec62575e5ea295c3fde3001dc628e2',1,'turbojpeg.h']]],
diff --git a/doc/html/search/functions_74.js b/doc/html/search/functions_74.js
index 69410b0..0c9a3c1 100644
--- a/doc/html/search/functions_74.js
+++ b/doc/html/search/functions_74.js
@@ -16,13 +16,16 @@
   ['tjencodeyuv3',['tjEncodeYUV3',['../group___turbo_j_p_e_g.html#gaabe05acd734990053ad1294b5ef239aa',1,'turbojpeg.h']]],
   ['tjencodeyuvplanes',['tjEncodeYUVPlanes',['../group___turbo_j_p_e_g.html#ga8a65ed3bd12df57c219d46afbc9008f1',1,'turbojpeg.h']]],
   ['tjfree',['tjFree',['../group___turbo_j_p_e_g.html#ga8c4a1231dc06a450514c835f6471f137',1,'turbojpeg.h']]],
-  ['tjgeterrorstr',['tjGetErrorStr',['../group___turbo_j_p_e_g.html#ga9af79c908ec131b1ae8d52fe40375abf',1,'turbojpeg.h']]],
+  ['tjgeterrorcode',['tjGetErrorCode',['../group___turbo_j_p_e_g.html#ga0be00a62bd1be897f170fa1fed5fb4cb',1,'turbojpeg.h']]],
+  ['tjgeterrorstr2',['tjGetErrorStr2',['../group___turbo_j_p_e_g.html#ga94a235bd4f1088f61ad87b4eadb64c9c',1,'turbojpeg.h']]],
   ['tjgetscalingfactors',['tjGetScalingFactors',['../group___turbo_j_p_e_g.html#ga6449044b9af402999ccf52f401333be8',1,'turbojpeg.h']]],
   ['tjinitcompress',['tjInitCompress',['../group___turbo_j_p_e_g.html#ga3d10c47fbe4a2489a2b30c931551d01a',1,'turbojpeg.h']]],
   ['tjinitdecompress',['tjInitDecompress',['../group___turbo_j_p_e_g.html#gae5408179d041e2a2f7199c8283cf649e',1,'turbojpeg.h']]],
   ['tjinittransform',['tjInitTransform',['../group___turbo_j_p_e_g.html#ga3155b775bfbac9dbba869b95a0367902',1,'turbojpeg.h']]],
+  ['tjloadimage',['tjLoadImage',['../group___turbo_j_p_e_g.html#ga144b981d6b281ecca4cbb4709de75749',1,'turbojpeg.h']]],
   ['tjplaneheight',['tjPlaneHeight',['../group___turbo_j_p_e_g.html#ga1a209696c6a80748f20e134b3c64789f',1,'turbojpeg.h']]],
   ['tjplanesizeyuv',['tjPlaneSizeYUV',['../group___turbo_j_p_e_g.html#ga6f98d977bfa9d167c97172e876ba61e2',1,'turbojpeg.h']]],
   ['tjplanewidth',['tjPlaneWidth',['../group___turbo_j_p_e_g.html#ga63fb66bb1e36c74008c4634360becbb1',1,'turbojpeg.h']]],
+  ['tjsaveimage',['tjSaveImage',['../group___turbo_j_p_e_g.html#ga2e78b7b79796e74584028da880a6a29c',1,'turbojpeg.h']]],
   ['tjtransform',['tjTransform',['../group___turbo_j_p_e_g.html#gad02cd42b69f193a0623a9c801788df3a',1,'turbojpeg.h']]]
 ];
diff --git a/doc/html/search/variables_74.js b/doc/html/search/variables_74.js
index 13a056e..2d20942 100644
--- a/doc/html/search/variables_74.js
+++ b/doc/html/search/variables_74.js
@@ -1,5 +1,6 @@
 var searchData=
 [
+  ['tjalphaoffset',['tjAlphaOffset',['../group___turbo_j_p_e_g.html#ga5af0ab065feefd526debf1e20c43e837',1,'turbojpeg.h']]],
   ['tjblueoffset',['tjBlueOffset',['../group___turbo_j_p_e_g.html#ga84e2e35d3f08025f976ec1ec53693dea',1,'turbojpeg.h']]],
   ['tjgreenoffset',['tjGreenOffset',['../group___turbo_j_p_e_g.html#ga82d6e35da441112a411da41923c0ba2f',1,'turbojpeg.h']]],
   ['tjmcuheight',['tjMCUHeight',['../group___turbo_j_p_e_g.html#gabd247bb9fecb393eca57366feb8327bf',1,'turbojpeg.h']]],
diff --git a/doc/html/structtjregion.html b/doc/html/structtjregion.html
index af2a473..36c0afa 100644
--- a/doc/html/structtjregion.html
+++ b/doc/html/structtjregion.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjscalingfactor.html b/doc/html/structtjscalingfactor.html
index 3bb50f5..2f54900 100644
--- a/doc/html/structtjscalingfactor.html
+++ b/doc/html/structtjscalingfactor.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doc/html/structtjtransform.html b/doc/html/structtjtransform.html
index 9fd97f7..d9536a3 100644
--- a/doc/html/structtjtransform.html
+++ b/doc/html/structtjtransform.html
@@ -24,7 +24,7 @@
  <tr style="height: 56px;">
   <td style="padding-left: 0.5em;">
    <div id="projectname">TurboJPEG
-   &#160;<span id="projectnumber">1.5</span>
+   &#160;<span id="projectnumber">1.6</span>
    </div>
   </td>
  </tr>
diff --git a/doxygen.config b/doxygen.config
index 1723123..2f2ab66 100644
--- a/doxygen.config
+++ b/doxygen.config
@@ -1,5 +1,5 @@
 PROJECT_NAME = TurboJPEG
-PROJECT_NUMBER = 1.5
+PROJECT_NUMBER = 1.6
 OUTPUT_DIRECTORY = doc/
 USE_WINDOWS_ENCODING = NO
 OPTIMIZE_OUTPUT_FOR_C = YES
diff --git a/example.c b/example.txt
similarity index 91%
rename from example.c
rename to example.txt
index ac27f49..04c11fe 100644
--- a/example.c
+++ b/example.txt
@@ -1,5 +1,5 @@
 /*
- * example.c
+ * example.txt
  *
  * This file illustrates how to use the IJG code as a subroutine library
  * to read or write JPEG image files.  You should look at this code in
@@ -13,6 +13,20 @@
  * routines in a different style if you prefer.
  */
 
+/* This example was part of the original libjpeg documentation and has been
+ * unchanged since 1994.  It is, as described in libjpeg.txt, "heavily
+ * commented skeleton code for calling the JPEG library."  It is not meant to
+ * be compiled as a standalone program, since it has no main() function and
+ * does not compress from/decompress to a real image buffer (corollary:
+ * put_scanline_someplace() is not a real function.)  First-time users of
+ * libjpeg-turbo would be better served by looking at tjexample.c, which uses
+ * the more straightforward TurboJPEG API, or at cjpeg.c and djpeg.c, which are
+ * examples of libjpeg API usage that can be (and are) compiled into standalone
+ * programs.  Note that this example, as well as the examples in cjpeg.c and
+ * djpeg.c, interleaves disk I/O with JPEG compression/decompression, so none of
+ * these examples is suitable for benchmarking purposes.
+ */
+
 #include <stdio.h>
 
 /*
@@ -69,7 +83,7 @@
  */
 
 GLOBAL(void)
-write_JPEG_file (char *filename, int quality)
+write_JPEG_file(char *filename, int quality)
 {
   /* This struct contains the JPEG compression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -158,8 +172,8 @@
      * Here the array is only one element long, but you could pass
      * more than one scanline at a time if that's more convenient.
      */
-    row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
-    (void) jpeg_write_scanlines(&cinfo, row_pointer, 1);
+    row_pointer[0] = &image_buffer[cinfo.next_scanline * row_stride];
+    (void)jpeg_write_scanlines(&cinfo, row_pointer, 1);
   }
 
   /* Step 6: Finish compression */
@@ -260,10 +274,10 @@
  */
 
 METHODDEF(void)
-my_error_exit (j_common_ptr cinfo)
+my_error_exit(j_common_ptr cinfo)
 {
   /* cinfo->err really points to a my_error_mgr struct, so coerce pointer */
-  my_error_ptr myerr = (my_error_ptr) cinfo->err;
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
 
   /* Always display the message. */
   /* We could postpone this until after returning, if we chose. */
@@ -281,7 +295,7 @@
 
 
 GLOBAL(int)
-read_JPEG_file (char *filename)
+read_JPEG_file(char *filename)
 {
   /* This struct contains the JPEG decompression parameters and pointers to
    * working space (which is allocated as needed by the JPEG library).
@@ -331,7 +345,7 @@
 
   /* Step 3: read file parameters with jpeg_read_header() */
 
-  (void) jpeg_read_header(&cinfo, TRUE);
+  (void)jpeg_read_header(&cinfo, TRUE);
   /* We can ignore the return value from jpeg_read_header since
    *   (a) suspension is not possible with the stdio data source, and
    *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
@@ -346,7 +360,7 @@
 
   /* Step 5: Start decompressor */
 
-  (void) jpeg_start_decompress(&cinfo);
+  (void)jpeg_start_decompress(&cinfo);
   /* We can ignore the return value since suspension is not possible
    * with the stdio data source.
    */
@@ -361,7 +375,7 @@
   row_stride = cinfo.output_width * cinfo.output_components;
   /* Make a one-row-high sample array that will go away when done with image */
   buffer = (*cinfo.mem->alloc_sarray)
-                ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+                ((j_common_ptr)&cinfo, JPOOL_IMAGE, row_stride, 1);
 
   /* Step 6: while (scan lines remain to be read) */
   /*           jpeg_read_scanlines(...); */
@@ -374,14 +388,14 @@
      * Here the array is only one element long, but you could ask for
      * more than one scanline at a time if that's more convenient.
      */
-    (void) jpeg_read_scanlines(&cinfo, buffer, 1);
+    (void)jpeg_read_scanlines(&cinfo, buffer, 1);
     /* Assume put_scanline_someplace wants a pointer and sample count. */
     put_scanline_someplace(buffer[0], row_stride);
   }
 
   /* Step 7: Finish decompression */
 
-  (void) jpeg_finish_decompress(&cinfo);
+  (void)jpeg_finish_decompress(&cinfo);
   /* We can ignore the return value since suspension is not possible
    * with the stdio data source.
    */
diff --git a/jaricom.c b/jaricom.c
index 3bb557f..774be9e 100644
--- a/jaricom.c
+++ b/jaricom.c
@@ -29,9 +29,10 @@
  * implementation (jbig_tab.c).
  */
 
-#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
+#define V(i, a, b, c, d) \
+  (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
 
-const JLONG jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113 + 1] = {
 /*
  * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
  */
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index 0af8ae1..d51ada7 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -1,57 +1,78 @@
-set(JAR_FILE turbojpeg.jar)
-set(MANIFEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/MANIFEST.MF)
+find_package(Java REQUIRED)
+find_package(JNI REQUIRED)
 
-set(JAVA_CLASSNAMES org/libjpegturbo/turbojpeg/TJ
-  org/libjpegturbo/turbojpeg/TJCompressor
-  org/libjpegturbo/turbojpeg/TJCustomFilter
-  org/libjpegturbo/turbojpeg/TJDecompressor
-  org/libjpegturbo/turbojpeg/TJException
-  org/libjpegturbo/turbojpeg/TJScalingFactor
-  org/libjpegturbo/turbojpeg/TJTransform
-  org/libjpegturbo/turbojpeg/TJTransformer
-  org/libjpegturbo/turbojpeg/YUVImage
-  TJUnitTest
-  TJExample
-  TJBench)
-
-if(MSVC_IDE)
-  set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
-else()
-  set(OBJDIR ${CMAKE_CURRENT_BINARY_DIR})
+# Allow the Java compiler flags to be set using an environment variable
+if(NOT DEFINED CMAKE_JAVA_COMPILE_FLAGS AND DEFINED ENV{JAVAFLAGS})
+  set(CMAKE_JAVA_COMPILE_FLAGS $ENV{JAVAFLAGS})
 endif()
 
+include(UseJava)
+
+set(CMAKE_JAVA_COMPILE_FLAGS "${CMAKE_JAVA_COMPILE_FLAGS} -J-Dfile.encoding=UTF8")
+message(STATUS "CMAKE_JAVA_COMPILE_FLAGS = ${CMAKE_JAVA_COMPILE_FLAGS}")
+string(REGEX REPLACE " " ";" CMAKE_JAVA_COMPILE_FLAGS "${CMAKE_JAVA_COMPILE_FLAGS}")
+
+set(JAVAARGS "" CACHE STRING "Additional arguments to pass to java when running unit tests (example: -d32)")
+message(STATUS "JAVAARGS = ${JAVAARGS}")
+
+set(JAVA_SOURCES org/libjpegturbo/turbojpeg/TJ.java
+  org/libjpegturbo/turbojpeg/TJCompressor.java
+  org/libjpegturbo/turbojpeg/TJCustomFilter.java
+  org/libjpegturbo/turbojpeg/TJDecompressor.java
+  org/libjpegturbo/turbojpeg/TJException.java
+  org/libjpegturbo/turbojpeg/TJScalingFactor.java
+  org/libjpegturbo/turbojpeg/TJTransform.java
+  org/libjpegturbo/turbojpeg/TJTransformer.java
+  org/libjpegturbo/turbojpeg/YUVImage.java
+  TJUnitTest.java
+  TJExample.java
+  TJBench.java)
+
 set(TURBOJPEG_DLL_NAME "turbojpeg")
 if(MINGW)
   set(TURBOJPEG_DLL_NAME "libturbojpeg")
 endif()
-configure_file(org/libjpegturbo/turbojpeg/TJLoader.java.in
-  ${CMAKE_CURRENT_BINARY_DIR}/org/libjpegturbo/turbojpeg/TJLoader.java)
-
-set(JAVA_SOURCES "")
-set(JAVA_CLASSES "")
-set(JAVA_CLASSES_FULL "")
-foreach(class ${JAVA_CLASSNAMES})
-  set(JAVA_SOURCES ${JAVA_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/${class}.java)
-  set(JAVA_CLASSES ${JAVA_CLASSES} ${class}.class)
-  set(JAVA_CLASSES_FULL ${JAVA_CLASSES_FULL} ${OBJDIR}/${class}.class)
-endforeach()
-
+if(WIN32)
+  configure_file(org/libjpegturbo/turbojpeg/TJLoader-win.java.in
+    ${CMAKE_CURRENT_BINARY_DIR}/org/libjpegturbo/turbojpeg/TJLoader.java)
+else()
+  configure_file(org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
+    ${CMAKE_CURRENT_BINARY_DIR}/org/libjpegturbo/turbojpeg/TJLoader.java)
+endif()
 set(JAVA_SOURCES ${JAVA_SOURCES}
   ${CMAKE_CURRENT_BINARY_DIR}/org/libjpegturbo/turbojpeg/TJLoader.java)
-set(JAVA_CLASSES ${JAVA_CLASSES}
-  org/libjpegturbo/turbojpeg/TJLoader.class)
-set(JAVA_CLASSES_FULL ${JAVA_CLASSES_FULL}
-  ${OBJDIR}/org/libjpegturbo/turbojpeg/TJLoader.class)
 
-string(REGEX REPLACE " " ";" JAVACFLAGS "${JAVACFLAGS}")
-add_custom_command(OUTPUT ${JAVA_CLASSES_FULL} DEPENDS ${JAVA_SOURCES}
-  COMMAND ${JAVA_COMPILE} ARGS ${JAVACFLAGS} -d ${OBJDIR} ${JAVA_SOURCES})
+if(MSYS)
+  # UGLY HACK ALERT: If we don't do this, then UseJava.cmake will separate
+  # class path members with a semicolon, which is interpreted as a command
+  # separator by the MSYS shell.
+  set(CMAKE_HOST_SYSTEM_NAME_BAK ${CMAKE_HOST_SYSTEM_NAME})
+  set(CMAKE_HOST_SYSTEM_NAME "MSYS")
+endif()
+add_jar(turbojpeg-java ${JAVA_SOURCES} OUTPUT_NAME turbojpeg
+  ENTRY_POINT TJExample)
+if(MSYS)
+  set(CMAKE_HOST_SYSTEM_NAME ${CMAKE_HOST_SYSTEM_NAME_BAK})
+endif()
 
-add_custom_command(OUTPUT ${JAR_FILE} DEPENDS ${JAVA_CLASSES_FULL}
-  ${MANIFEST_FILE}
-  COMMAND ${JAVA_ARCHIVE} cfm ${JAR_FILE} ${MANIFEST_FILE} ${JAVA_CLASSES}
-  WORKING_DIRECTORY ${OBJDIR})
+add_custom_target(javadoc COMMAND
+  javadoc -notimestamp -d ${CMAKE_CURRENT_SOURCE_DIR}/doc -sourcepath ${CMAKE_CURRENT_SOURCE_DIR} org.libjpegturbo.turbojpeg)
+set(JAVACLASSPATH ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/turbojpeg-java.dir)
+add_custom_target(javah
+  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJ
+  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJCompressor
+  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJDecompressor
+  COMMAND javah -d ${CMAKE_CURRENT_SOURCE_DIR} -classpath ${JAVACLASSPATH} org.libjpegturbo.turbojpeg.TJTransformer)
 
-add_custom_target(java ALL DEPENDS ${JAR_FILE})
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${JAR_FILE} DESTINATION classes)
+if(NOT DEFINED CMAKE_INSTALL_DEFAULT_JAVADIR)
+  set(CMAKE_INSTALL_DEFAULT_JAVADIR "<CMAKE_INSTALL_DATAROOTDIR>/java")
+endif()
+GNUInstallDirs_set_install_dir(JAVADIR
+  "The directory into which Java classes should be installed")
+GNUInstallDirs_get_absolute_install_dir(CMAKE_INSTALL_FULL_JAVADIR
+  CMAKE_INSTALL_JAVADIR)
+set(CMAKE_INSTALL_JAVADIR ${CMAKE_INSTALL_JAVADIR} PARENT_SCOPE)
+set(CMAKE_INSTALL_FULL_JAVADIR ${CMAKE_INSTALL_FULL_JAVADIR} PARENT_SCOPE)
+report_directory(JAVADIR)
+install_jar(turbojpeg-java ${CMAKE_INSTALL_JAVADIR})
+mark_as_advanced(CLEAR CMAKE_INSTALL_JAVADIR)
diff --git a/java/Makefile.am b/java/Makefile.am
deleted file mode 100644
index d3fc59c..0000000
--- a/java/Makefile.am
+++ /dev/null
@@ -1,75 +0,0 @@
-JAVAROOT = .
-
-org/libjpegturbo/turbojpeg/TJLoader.java: $(srcdir)/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
-	mkdir -p org/libjpegturbo/turbojpeg; \
-	cat $(srcdir)/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl | \
-		sed s@%{__libdir}@$(libdir)@g > org/libjpegturbo/turbojpeg/TJLoader.java
-
-
-JAVASOURCES = org/libjpegturbo/turbojpeg/TJ.java \
-	org/libjpegturbo/turbojpeg/TJCompressor.java \
-	org/libjpegturbo/turbojpeg/TJCustomFilter.java \
-	org/libjpegturbo/turbojpeg/TJDecompressor.java \
-	org/libjpegturbo/turbojpeg/TJException.java \
-	org/libjpegturbo/turbojpeg/TJScalingFactor.java \
-	org/libjpegturbo/turbojpeg/TJTransform.java \
-	org/libjpegturbo/turbojpeg/TJTransformer.java \
-	org/libjpegturbo/turbojpeg/YUVImage.java \
-	TJExample.java \
-	TJUnitTest.java \
-	TJBench.java
-
-JNIHEADERS = org_libjpegturbo_turbojpeg_TJ.h \
-	org_libjpegturbo_turbojpeg_TJCompressor.h \
-	org_libjpegturbo_turbojpeg_TJDecompressor.h \
-	org_libjpegturbo_turbojpeg_TJTransformer.h
-
-if WITH_JAVA
-
-nodist_noinst_JAVA = ${JAVASOURCES} org/libjpegturbo/turbojpeg/TJLoader.java
-
-JAVA_CLASSES = org/libjpegturbo/turbojpeg/TJ.class \
-	org/libjpegturbo/turbojpeg/TJCompressor.class \
-	org/libjpegturbo/turbojpeg/TJCustomFilter.class \
-	org/libjpegturbo/turbojpeg/TJDecompressor.class \
-	org/libjpegturbo/turbojpeg/TJException.class \
-	org/libjpegturbo/turbojpeg/TJLoader.class \
-	org/libjpegturbo/turbojpeg/TJScalingFactor.class \
-	org/libjpegturbo/turbojpeg/TJTransform.class \
-	org/libjpegturbo/turbojpeg/TJTransformer.class \
-	org/libjpegturbo/turbojpeg/YUVImage.class \
-	TJExample.class \
-	TJUnitTest.class \
-	TJBench.class
-
-all: all-am turbojpeg.jar
-
-turbojpeg.jar: classnoinst.stamp ${srcdir}/MANIFEST.MF
-	$(JAR) cfm turbojpeg.jar ${srcdir}/MANIFEST.MF $(JAVA_CLASSES)
-
-clean-local:
-	rm -f turbojpeg.jar
-
-install-exec-local: turbojpeg.jar
-	mkdir -p $(DESTDIR)/$(datadir)/classes
-	$(INSTALL) -m 644 turbojpeg.jar $(DESTDIR)/$(datadir)/classes/
-
-uninstall-local:
-	rm -f $(DESTDIR)/$(datadir)/classes/turbojpeg.jar
-	if [ -d $(DESTDIR)/$(datadir)/classes ]; then rmdir $(DESTDIR)/$(datadir)/classes; fi
-
-headers: all
-	javah -d ${srcdir} org.libjpegturbo.turbojpeg.TJ; \
-	javah -d ${srcdir} org.libjpegturbo.turbojpeg.TJCompressor; \
-	javah -d ${srcdir} org.libjpegturbo.turbojpeg.TJDecompressor; \
-	javah -d ${srcdir} org.libjpegturbo.turbojpeg.TJTransformer
-
-docs: all
-	mkdir -p ${srcdir}/doc; \
-	javadoc -notimestamp -d ${srcdir}/doc -sourcepath ${srcdir} org.libjpegturbo.turbojpeg
-
-endif
-
-EXTRA_DIST = MANIFEST.MF ${JAVASOURCES} ${JNIHEADERS} doc CMakeLists.txt \
-	org/libjpegturbo/turbojpeg/TJLoader.java.tmpl \
-	org/libjpegturbo/turbojpeg/TJLoader.java.in
diff --git a/java/TJBench.java b/java/TJBench.java
index ddc414c..f962766 100644
--- a/java/TJBench.java
+++ b/java/TJBench.java
@@ -34,7 +34,7 @@
 
 class TJBench {
 
-  static int flags = 0, quiet = 0, pf = TJ.PF_BGR, yuvpad = 1;
+  static int flags = 0, quiet = 0, pf = TJ.PF_BGR, yuvPad = 1;
   static boolean compOnly, decompOnly, doTile, doYUV, write = true;
 
   static final String[] pixFormatStr = {
@@ -63,6 +63,26 @@
   }
 
 
+  static String tjErrorMsg;
+  static int tjErrorCode = -1;
+
+  static void handleTJException(TJException e) throws TJException {
+    String _tjErrorMsg = e.getMessage();
+    int _tjErrorCode = e.getErrorCode();
+
+    if ((flags & TJ.FLAG_STOPONWARNING) == 0 &&
+        _tjErrorCode == TJ.ERR_WARNING) {
+      if (tjErrorMsg == null || !tjErrorMsg.equals(_tjErrorMsg) ||
+          tjErrorCode != _tjErrorCode) {
+        tjErrorMsg = _tjErrorMsg;
+        tjErrorCode = _tjErrorCode;
+        System.out.println("WARNING: " + _tjErrorMsg);
+      }
+    } else
+      throw e;
+  }
+
+
   static String formatName(int subsamp, int cs) {
     if (cs == TJ.CS_YCbCr)
       return subNameLong[subsamp];
@@ -76,6 +96,7 @@
   static String sigFig(double val, int figs) {
     String format;
     int digitsAfterDecimal = figs - (int)Math.ceil(Math.log10(Math.abs(val)));
+
     if (digitsAfterDecimal < 1)
       format = new String("%.0f");
     else
@@ -87,10 +108,12 @@
   static byte[] loadImage(String fileName, int[] w, int[] h, int pixelFormat)
                           throws Exception {
     BufferedImage img = ImageIO.read(new File(fileName));
+
     if (img == null)
       throw new Exception("Could not read " + fileName);
     w[0] = img.getWidth();
     h[0] = img.getHeight();
+
     int[] rgb = img.getRGB(0, 0, w[0], h[0], null, 0, w[0]);
     int ps = TJ.getPixelSize(pixelFormat);
     int rindex = TJ.getRedOffset(pixelFormat);
@@ -98,6 +121,7 @@
     int bindex = TJ.getBlueOffset(pixelFormat);
     byte[] dstBuf = new byte[w[0] * h[0] * ps];
     int pixels = w[0] * h[0], dstPtr = 0, rgbPtr = 0;
+
     while (pixels-- > 0) {
       dstBuf[dstPtr + rindex] = (byte)((rgb[rgbPtr] >> 16) & 0xff);
       dstBuf[dstPtr + gindex] = (byte)((rgb[rgbPtr] >> 8) & 0xff);
@@ -117,11 +141,13 @@
     int rindex = TJ.getRedOffset(pixelFormat);
     int gindex = TJ.getGreenOffset(pixelFormat);
     int bindex = TJ.getBlueOffset(pixelFormat);
+
     for (int y = 0; y < h; y++) {
       for (int x = 0; x < w; x++, srcPtr += ps) {
         int pixel = (srcBuf[srcPtr + rindex] & 0xff) << 16 |
                     (srcBuf[srcPtr + gindex] & 0xff) << 8 |
                     (srcBuf[srcPtr + bindex] & 0xff);
+
         img.setRGB(x, y, pixel);
       }
     }
@@ -157,7 +183,8 @@
     if (doYUV) {
       int width = doTile ? tilew : scaledw;
       int height = doTile ? tileh : scaledh;
-      yuvImage = new YUVImage(width, yuvpad, height, subsamp);
+
+      yuvImage = new YUVImage(width, yuvPad, height, subsamp);
       Arrays.fill(yuvImage.getBuf(), (byte)127);
     }
 
@@ -167,21 +194,30 @@
     while (true) {
       int tile = 0;
       double start = getTime();
+
       for (int y = 0; y < h; y += tileh) {
         for (int x = 0; x < w; x += tilew, tile++) {
           int width = doTile ? Math.min(tilew, w - x) : scaledw;
           int height = doTile ? Math.min(tileh, h - y) : scaledh;
+
           tjd.setSourceImage(jpegBuf[tile], jpegSize[tile]);
           if (doYUV) {
-            yuvImage.setBuf(yuvImage.getBuf(), width, yuvpad, height, subsamp);
-            tjd.decompressToYUV(yuvImage, flags);
+            yuvImage.setBuf(yuvImage.getBuf(), width, yuvPad, height, subsamp);
+            try {
+              tjd.decompressToYUV(yuvImage, flags);
+            } catch (TJException e) { handleTJException(e); }
             double startDecode = getTime();
             tjd.setSourceImage(yuvImage);
-            tjd.decompress(dstBuf, x, y, width, pitch, height, pf, flags);
+            try {
+              tjd.decompress(dstBuf, x, y, width, pitch, height, pf, flags);
+            } catch (TJException e) { handleTJException(e); }
             if (iter >= 0)
               elapsedDecode += getTime() - startDecode;
-          } else
-            tjd.decompress(dstBuf, x, y, width, pitch, height, pf, flags);
+          } else {
+            try {
+              tjd.decompress(dstBuf, x, y, width, pitch, height, pf, flags);
+            } catch (TJException e) { handleTJException(e); }
+          }
         }
       }
       elapsed += getTime() - start;
@@ -194,7 +230,7 @@
         elapsed = elapsedDecode = 0.0;
       }
     }
-    if(doYUV)
+    if (doYUV)
       elapsed -= elapsedDecode;
 
     tjd = null;
@@ -205,16 +241,18 @@
 
     if (quiet != 0) {
       System.out.format("%-6s%s",
-        sigFig((double)(w * h) / 1000000. * (double)iter / elapsed, 4),
-        quiet == 2 ? "\n" : "  ");
+                        sigFig((double)(w * h) / 1000000. *
+                               (double)iter / elapsed, 4),
+                        quiet == 2 ? "\n" : "  ");
       if (doYUV)
         System.out.format("%s\n",
-          sigFig((double)(w * h) / 1000000. * (double)iter / elapsedDecode, 4));
+                          sigFig((double)(w * h) / 1000000. *
+                                 (double)iter / elapsedDecode, 4));
       else if (quiet != 2)
         System.out.print("\n");
     } else {
       System.out.format("%s --> Frame rate:         %f fps\n",
-                        (doYUV ? "Decomp to YUV":"Decompress   "),
+                        (doYUV ? "Decomp to YUV" : "Decompress   "),
                         (double)iter / elapsed);
       System.out.format("                  Throughput:         %f Megapixels/sec\n",
                         (double)(w * h) / 1000000. * (double)iter / elapsed);
@@ -222,7 +260,8 @@
         System.out.format("YUV Decode    --> Frame rate:         %f fps\n",
                           (double)iter / elapsedDecode);
         System.out.format("                  Throughput:         %f Megapixels/sec\n",
-                          (double)(w * h) / 1000000. * (double)iter / elapsedDecode);
+                          (double)(w * h) / 1000000. *
+                          (double)iter / elapsedDecode);
       }
     }
 
@@ -255,6 +294,7 @@
             int lum = (int)((double)(srcBuf[rindex] & 0xff) * 0.299 +
                             (double)(srcBuf[gindex] & 0xff) * 0.587 +
                             (double)(srcBuf[bindex] & 0xff) * 0.114 + 0.5);
+
             if (lum > 255) lum = 255;
             if (lum < 0) lum = 0;
             dstBuf[rindex] = (byte)Math.abs((dstBuf[rindex] & 0xff) - lum);
@@ -291,8 +331,9 @@
 
     if (quiet == 0)
       System.out.format(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
-        (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down",
-        subNameLong[subsamp], jpegQual);
+                        (flags & TJ.FLAG_BOTTOMUP) != 0 ?
+                        "Bottom-up" : "Top-down",
+                        subNameLong[subsamp], jpegQual);
 
     tjc = new TJCompressor();
 
@@ -319,7 +360,7 @@
       tjc.setSubsamp(subsamp);
 
       if (doYUV) {
-        yuvImage = new YUVImage(tilew, yuvpad, tileh, subsamp);
+        yuvImage = new YUVImage(tilew, yuvPad, tileh, subsamp);
         Arrays.fill(yuvImage.getBuf(), (byte)127);
       }
 
@@ -328,16 +369,19 @@
       elapsed = elapsedEncode = 0.0;
       while (true) {
         int tile = 0;
+
         totalJpegSize = 0;
         start = getTime();
         for (int y = 0; y < h; y += tileh) {
           for (int x = 0; x < w; x += tilew, tile++) {
             int width = Math.min(tilew, w - x);
             int height = Math.min(tileh, h - y);
+
             tjc.setSourceImage(srcBuf, x, y, width, pitch, height, pf);
             if (doYUV) {
               double startEncode = getTime();
-              yuvImage.setBuf(yuvImage.getBuf(), width, yuvpad, height,
+
+              yuvImage.setBuf(yuvImage.getBuf(), width, yuvPad, height,
                               subsamp);
               tjc.encodeYUV(yuvImage, flags);
               if (iter >= 0)
@@ -367,14 +411,17 @@
       if (quiet != 0) {
         if (doYUV)
           System.out.format("%-6s%s",
-            sigFig((double)(w * h) / 1000000. * (double)iter / elapsedEncode, 4),
-            quiet == 2 ? "\n" : "  ");
+                            sigFig((double)(w * h) / 1000000. *
+                                   (double)iter / elapsedEncode, 4),
+                            quiet == 2 ? "\n" : "  ");
         System.out.format("%-6s%s",
-          sigFig((double)(w * h) / 1000000. * (double)iter / elapsed, 4),
-          quiet == 2 ? "\n" : "  ");
+                          sigFig((double)(w * h) / 1000000. *
+                                 (double)iter / elapsed, 4),
+                          quiet == 2 ? "\n" : "  ");
         System.out.format("%-6s%s",
-          sigFig((double)(w * h * ps) / (double)totalJpegSize, 4),
-          quiet == 2 ? "\n" : "  ");
+                          sigFig((double)(w * h * ps) / (double)totalJpegSize,
+                                 4),
+                          quiet == 2 ? "\n" : "  ");
       } else {
         System.out.format("\n%s size: %d x %d\n", doTile ? "Tile" : "Image",
                           tilew, tileh);
@@ -386,9 +433,11 @@
           System.out.format("                  Compression ratio:  %f:1\n",
                             (double)(w * h * ps) / (double)yuvImage.getSize());
           System.out.format("                  Throughput:         %f Megapixels/sec\n",
-                            (double)(w * h) / 1000000. * (double)iter / elapsedEncode);
+                            (double)(w * h) / 1000000. *
+                            (double)iter / elapsedEncode);
           System.out.format("                  Output bit stream:  %f Megabits/sec\n",
-            (double)yuvImage.getSize() * 8. / 1000000. * (double)iter / elapsedEncode);
+                            (double)yuvImage.getSize() * 8. / 1000000. *
+                            (double)iter / elapsedEncode);
         }
         System.out.format("%s --> Frame rate:         %f fps\n",
                           doYUV ? "Comp from YUV" : "Compress     ",
@@ -400,12 +449,14 @@
         System.out.format("                  Throughput:         %f Megapixels/sec\n",
                           (double)(w * h) / 1000000. * (double)iter / elapsed);
         System.out.format("                  Output bit stream:  %f Megabits/sec\n",
-          (double)totalJpegSize * 8. / 1000000. * (double)iter / elapsed);
+                          (double)totalJpegSize * 8. / 1000000. *
+                          (double)iter / elapsed);
       }
       if (tilew == w && tileh == h && write) {
         String tempStr = fileName + "_" + subName[subsamp] + "_" + "Q" +
                          jpegQual + ".jpg";
         FileOutputStream fos = new FileOutputStream(tempStr);
+
         fos.write(jpegBuf[0], 0, jpegSize[0]);
         fos.close();
         if (quiet == 0)
@@ -466,8 +517,9 @@
       System.out.println("\n");
     } else if (quiet == 0)
       System.out.format(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
-        formatName(subsamp, cs), pixFormatStr[pf],
-        (flags & TJ.FLAG_BOTTOMUP) != 0 ? "Bottom-up" : "Top-down");
+                        formatName(subsamp, cs), pixFormatStr[pf],
+                        (flags & TJ.FLAG_BOTTOMUP) != 0 ?
+                        "Bottom-up" : "Top-down");
 
     for (int tilew = doTile ? 16 : w, tileh = doTile ? 16 : h; ;
          tilew *= 2, tileh *= 2) {
@@ -523,14 +575,15 @@
             xformOp == TJTransform.OP_TRANSVERSE ||
             xformOp == TJTransform.OP_ROT90 ||
             xformOp == TJTransform.OP_ROT270) {
-            if (_subsamp == TJ.SAMP_422)
-              _subsamp = TJ.SAMP_440;
-            else if (_subsamp == TJ.SAMP_440)
-              _subsamp = TJ.SAMP_422;
+          if (_subsamp == TJ.SAMP_422)
+            _subsamp = TJ.SAMP_440;
+          else if (_subsamp == TJ.SAMP_440)
+            _subsamp = TJ.SAMP_422;
         }
 
         TJTransform[] t = new TJTransform[_ntilesw * _ntilesh];
-        jpegBuf = new byte[_ntilesw * _ntilesh][TJ.bufSize(_tilew, _tileh, subsamp)];
+        jpegBuf =
+          new byte[_ntilesw * _ntilesh][TJ.bufSize(_tilew, _tileh, subsamp)];
 
         for (y = 0, tile = 0; y < _h; y += _tileh) {
           for (x = 0; x < _w; x += _tilew, tile++) {
@@ -570,10 +623,11 @@
 
         if (quiet != 0) {
           System.out.format("%-6s%s%-6s%s",
-            sigFig((double)(w * h) / 1000000. / elapsed, 4),
-            quiet == 2 ? "\n" : "  ",
-            sigFig((double)(w * h * ps) / (double)totalJpegSize, 4),
-            quiet == 2 ? "\n" : "  ");
+                            sigFig((double)(w * h) / 1000000. / elapsed, 4),
+                            quiet == 2 ? "\n" : "  ",
+                            sigFig((double)(w * h * ps) /
+                                   (double)totalJpegSize, 4),
+                            quiet == 2 ? "\n" : "  ");
         } else if (quiet == 0) {
           System.out.format("Transform     --> Frame rate:         %f fps\n",
                             1.0 / elapsed);
@@ -636,6 +690,8 @@
     System.out.println("     codec");
     System.out.println("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the");
     System.out.println("     underlying codec");
+    System.out.println("-progressive = Use progressive entropy coding in JPEG images generated by");
+    System.out.println("     compression and transform operations.");
     System.out.println("-subsamp <s> = When testing JPEG compression, this option specifies the level");
     System.out.println("     of chrominance subsampling to use (<s> = 444, 422, 440, 420, 411, or");
     System.out.println("     GRAY).  The default is to test Grayscale, 4:2:0, 4:2:2, and 4:4:4 in");
@@ -667,13 +723,18 @@
     System.out.println("     decompression (these options are mutually exclusive)");
     System.out.println("-grayscale = Perform lossless grayscale conversion prior to decompression");
     System.out.println("     test (can be combined with the other transforms above)");
+    System.out.println("-copynone = Do not copy any extra markers (including EXIF and ICC profile data)");
+    System.out.println("     when transforming the image.");
     System.out.println("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)");
     System.out.println("-warmup <t> = Run each benchmark for <t> seconds (default = 1.0) prior to");
     System.out.println("     starting the timer, in order to prime the caches and thus improve the");
     System.out.println("     consistency of the results.");
     System.out.println("-componly = Stop after running compression tests.  Do not test decompression.");
     System.out.println("-nowrite = Do not write reference or output images (improves consistency");
-    System.out.println("     of performance measurements.)\n");
+    System.out.println("     of performance measurements.)");
+    System.out.println("-stoponwarning = Immediately discontinue the current");
+    System.out.println("     compression/decompression/transform operation if the underlying codec");
+    System.out.println("     throws a warning (non-fatal error)\n");
     System.out.println("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate");
     System.out.println("test will be performed for all quality values in the range.\n");
     System.exit(1);
@@ -681,9 +742,9 @@
 
 
   public static void main(String[] argv) {
-    byte[] srcBuf = null;  int w = 0, h = 0;
-    int minQual = -1, maxQual = -1;
-    int minArg = 1;  int retval = 0;
+    byte[] srcBuf = null;
+    int w = 0, h = 0, minQual = -1, maxQual = -1;
+    int minArg = 1, retval = 0;
     int subsamp = -1;
 
     try {
@@ -720,20 +781,19 @@
         for (int i = minArg; i < argv.length; i++) {
           if (argv[i].equalsIgnoreCase("-tile")) {
             doTile = true;  xformOpt |= TJTransform.OPT_CROP;
-          }
-          else if (argv[i].equalsIgnoreCase("-fastupsample")) {
+          } else if (argv[i].equalsIgnoreCase("-fastupsample")) {
             System.out.println("Using fast upsampling code\n");
             flags |= TJ.FLAG_FASTUPSAMPLE;
-          }
-          else if (argv[i].equalsIgnoreCase("-fastdct")) {
+          } else if (argv[i].equalsIgnoreCase("-fastdct")) {
             System.out.println("Using fastest DCT/IDCT algorithm\n");
             flags |= TJ.FLAG_FASTDCT;
-          }
-          else if (argv[i].equalsIgnoreCase("-accuratedct")) {
+          } else if (argv[i].equalsIgnoreCase("-accuratedct")) {
             System.out.println("Using most accurate DCT/IDCT algorithm\n");
             flags |= TJ.FLAG_ACCURATEDCT;
-          }
-          else if (argv[i].equalsIgnoreCase("-rgb"))
+          } else if (argv[i].equalsIgnoreCase("-progressive")) {
+            System.out.println("Using progressive entropy coding\n");
+            flags |= TJ.FLAG_PROGRESSIVE;
+          } else if (argv[i].equalsIgnoreCase("-rgb"))
             pf = TJ.PF_RGB;
           else if (argv[i].equalsIgnoreCase("-rgbx"))
             pf = TJ.PF_RGBX;
@@ -755,26 +815,27 @@
             int temp1 = 0, temp2 = 0;
             boolean match = false, scanned = true;
             Scanner scanner = new Scanner(argv[++i]).useDelimiter("/");
+
             try {
               temp1 = scanner.nextInt();
               temp2 = scanner.nextInt();
-            } catch(Exception e) {}
+            } catch (Exception e) {}
             if (temp2 <= 0) temp2 = 1;
             if (temp1 > 0) {
               TJScalingFactor[] scalingFactors = TJ.getScalingFactors();
+
               for (int j = 0; j < scalingFactors.length; j++) {
                 if ((double)temp1 / (double)temp2 ==
                     (double)scalingFactors[j].getNum() /
                     (double)scalingFactors[j].getDenom()) {
                   sf = scalingFactors[j];
-                  match = true;   break;
+                  match = true;  break;
                 }
               }
               if (!match) usage();
             } else
               usage();
-          }
-          else if (argv[i].equalsIgnoreCase("-hflip"))
+          } else if (argv[i].equalsIgnoreCase("-hflip"))
             xformOp = TJTransform.OP_HFLIP;
           else if (argv[i].equalsIgnoreCase("-vflip"))
             xformOp = TJTransform.OP_VFLIP;
@@ -792,8 +853,12 @@
             xformOpt |= TJTransform.OPT_GRAY;
           else if (argv[i].equalsIgnoreCase("-nooutput"))
             xformOpt |= TJTransform.OPT_NOOUTPUT;
-          else if (argv[i].equalsIgnoreCase("-benchtime") && i < argv.length - 1) {
+          else if (argv[i].equalsIgnoreCase("-copynone"))
+            xformOpt |= TJTransform.OPT_COPYNONE;
+          else if (argv[i].equalsIgnoreCase("-benchtime") &&
+                   i < argv.length - 1) {
             double temp = -1;
+
             try {
               temp = Double.parseDouble(argv[++i]);
             } catch (NumberFormatException e) {}
@@ -801,20 +866,32 @@
               benchTime = temp;
             else
               usage();
-          }
-          else if (argv[i].equalsIgnoreCase("-yuv")) {
+          } else if (argv[i].equalsIgnoreCase("-warmup") &&
+                     i < argv.length - 1) {
+            double temp = -1;
+
+            try {
+              temp = Double.parseDouble(argv[++i]);
+            } catch (NumberFormatException e) {}
+            if (temp >= 0.0) {
+              warmup = temp;
+              System.out.format("Warmup time = %.1f seconds\n\n", warmup);
+            } else
+              usage();
+          } else if (argv[i].equalsIgnoreCase("-yuv")) {
             System.out.println("Testing YUV planar encoding/decoding\n");
             doYUV = true;
-          }
-          else if (argv[i].equalsIgnoreCase("-yuvpad") && i < argv.length - 1) {
+          } else if (argv[i].equalsIgnoreCase("-yuvpad") &&
+                     i < argv.length - 1) {
             int temp = 0;
+
             try {
-             temp = Integer.parseInt(argv[++i]);
+              temp = Integer.parseInt(argv[++i]);
             } catch (NumberFormatException e) {}
             if (temp >= 1)
-              yuvpad = temp;
-          }
-          else if (argv[i].equalsIgnoreCase("-subsamp") && i < argv.length - 1) {
+              yuvPad = temp;
+          } else if (argv[i].equalsIgnoreCase("-subsamp") &&
+                     i < argv.length - 1) {
             i++;
             if (argv[i].toUpperCase().startsWith("G"))
               subsamp = TJ.SAMP_GRAY;
@@ -828,22 +905,12 @@
               subsamp = TJ.SAMP_420;
             else if (argv[i].equals("411"))
               subsamp = TJ.SAMP_411;
-          }
-          else if (argv[i].equalsIgnoreCase("-componly"))
+          } else if (argv[i].equalsIgnoreCase("-componly"))
             compOnly = true;
           else if (argv[i].equalsIgnoreCase("-nowrite"))
             write = false;
-          else if (argv[i].equalsIgnoreCase("-warmup") && i < argv.length - 1) {
-            double temp = -1;
-            try {
-             temp = Double.parseDouble(argv[++i]);
-            } catch (NumberFormatException e) {}
-            if (temp >= 0.0) {
-              warmup = temp;
-              System.out.format("Warmup time = %.1f seconds\n\n", warmup);
-            } else
-              usage();
-          }
+          else if (argv[i].equalsIgnoreCase("-stoponwarning"))
+            flags |= TJ.FLAG_STOPONWARNING;
           else usage();
         }
       }
@@ -859,6 +926,7 @@
 
       if (!decompOnly) {
         int[] width = new int[1], height = new int[1];
+
         srcBuf = loadImage(argv[0], width, height, pf);
         w = width[0];  h = height[0];
         int index = -1;
@@ -869,7 +937,8 @@
       if (quiet == 1 && !decompOnly) {
         System.out.println("All performance values in Mpixels/sec\n");
         System.out.format("Bitmap     JPEG     JPEG  %s  %s   ",
-          (doTile ? "Tile " : "Image"), (doTile ? "Tile " : "Image"));
+                          (doTile ? "Tile " : "Image"),
+                          (doTile ? "Tile " : "Image"));
         if (doYUV)
           System.out.print("Encode  ");
         System.out.print("Comp    Comp    Decomp  ");
@@ -915,7 +984,13 @@
       }
 
     } catch (Exception e) {
-      System.out.println("ERROR: " + e.getMessage());
+      if (e instanceof TJException) {
+        TJException tje = (TJException)e;
+
+        System.out.println((tje.getErrorCode() == TJ.ERR_WARNING ?
+                            "WARNING: " : "ERROR: ") + tje.getMessage());
+      } else
+        System.out.println("ERROR: " + e.getMessage());
       e.printStackTrace();
       retval = -1;
     }
diff --git a/java/TJExample.java b/java/TJExample.java
index 835a5b9..16df085 100644
--- a/java/TJExample.java
+++ b/java/TJExample.java
@@ -28,8 +28,8 @@
  */
 
 /*
- * This program demonstrates how to compress and decompress JPEG files using
- * the TurboJPEG JNI wrapper
+ * This program demonstrates how to compress, decompress, and transform JPEG
+ * images using the TurboJPEG Java API
  */
 
 import java.io.*;
@@ -40,138 +40,173 @@
 import javax.swing.*;
 import org.libjpegturbo.turbojpeg.*;
 
+
 public class TJExample implements TJCustomFilter {
 
-  public static final String classname = new TJExample().getClass().getName();
+  private static final String classname = new TJExample().getClass().getName();
+
+  private static final int DEFAULT_SUBSAMP = TJ.SAMP_444;
+  private static final int DEFAULT_QUALITY = 95;
+
+
+  private static final String[] subsampName = {
+    "4:4:4", "4:2:2", "4:2:0", "Grayscale", "4:4:0", "4:1:1"
+  };
+
+  private static final String[] colorspaceName = {
+    "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+  };
+
+
+  /* Sample DCT filter: negating each coefficient yields a negative image. */
+
+  public void customFilter(ShortBuffer coeffBuffer, Rectangle bufferRegion,
+                           Rectangle planeRegion, int componentIndex,
+                           int transformIndex, TJTransform transform)
+                           throws TJException {
+    int numCoeffs = bufferRegion.width * bufferRegion.height;
+    for (int pos = 0; pos < numCoeffs; pos++)
+      coeffBuffer.put(pos, (short)(-coeffBuffer.get(pos)));
+  }
+
 
   private static void usage() throws Exception {
-    System.out.println("\nUSAGE: java " + classname + " <Input file> <Output file> [options]\n");
-    System.out.println("Input and output files can be any image format that the Java Image I/O");
+    System.out.println("\nUSAGE: java [Java options] " + classname +
+                       " <Input image> <Output image> [options]\n");
+
+    System.out.println("Input and output images can be in any image format that the Java Image I/O");
     System.out.println("extensions understand.  If either filename ends in a .jpg extension, then");
-    System.out.println("TurboJPEG will be used to compress or decompress the file.\n");
-    System.out.println("Options:\n");
-    System.out.println("-scale M/N = if the input image is a JPEG file, scale the width/height of the");
-    System.out.print("             output image by a factor of M/N (M/N = ");
-    for (int i = 0; i < sf.length; i++) {
-      System.out.print(sf[i].getNum() + "/" + sf[i].getDenom());
-      if (sf.length == 2 && i != sf.length - 1)
+    System.out.println("the TurboJPEG API will be used to compress or decompress the image.\n");
+
+    System.out.println("Compression Options (used if the output image is a JPEG image)");
+    System.out.println("--------------------------------------------------------------\n");
+
+    System.out.println("-subsamp <444|422|420|gray> = Apply this level of chrominance subsampling when");
+    System.out.println("     compressing the output image.  The default is to use the same level of");
+    System.out.println("     subsampling as in the input image, if the input image is also a JPEG");
+    System.out.println("     image, or to use grayscale if the input image is a grayscale non-JPEG");
+    System.out.println("     image, or to use " + subsampName[DEFAULT_SUBSAMP] + " subsampling otherwise.\n");
+
+    System.out.println("-q <1-100> = Compress the output image with this JPEG quality level");
+    System.out.println("     (default = " + DEFAULT_QUALITY + ").\n");
+
+    System.out.println("Decompression Options (used if the input image is a JPEG image)");
+    System.out.println("---------------------------------------------------------------\n");
+
+    System.out.println("-scale M/N = Scale the input image by a factor of M/N when decompressing it.");
+    System.out.print("(M/N = ");
+    for (int i = 0; i < scalingFactors.length; i++) {
+      System.out.print(scalingFactors[i].getNum() + "/" +
+                       scalingFactors[i].getDenom());
+      if (scalingFactors.length == 2 && i != scalingFactors.length - 1)
         System.out.print(" or ");
-      else if (sf.length > 2) {
-        if (i != sf.length - 1)
+      else if (scalingFactors.length > 2) {
+        if (i != scalingFactors.length - 1)
           System.out.print(", ");
-        if (i == sf.length - 2)
+        if (i == scalingFactors.length - 2)
           System.out.print("or ");
       }
     }
     System.out.println(")\n");
-    System.out.println("-samp <444|422|420|gray> = If the output image is a JPEG file, this specifies");
-    System.out.println("                           the level of chrominance subsampling to use when");
-    System.out.println("                           recompressing it.  Default is to use the same level");
-    System.out.println("                           of subsampling as the input, if the input is a JPEG");
-    System.out.println("                           file, or 4:4:4 otherwise.\n");
-    System.out.println("-q <1-100> = If the output image is a JPEG file, this specifies the JPEG");
-    System.out.println("             quality to use when recompressing it (default = 95).\n");
+
     System.out.println("-hflip, -vflip, -transpose, -transverse, -rot90, -rot180, -rot270 =");
-    System.out.println("     If the input image is a JPEG file, perform the corresponding lossless");
-    System.out.println("     transform prior to decompression (these options are mutually exclusive)\n");
-    System.out.println("-grayscale = If the input image is a JPEG file, perform lossless grayscale");
-    System.out.println("     conversion prior to decompression (can be combined with the other");
-    System.out.println("     transforms above)\n");
-    System.out.println("-crop X,Y,WxH = If the input image is a JPEG file, perform lossless cropping");
-    System.out.println("     prior to decompression.  X,Y specifies the upper left corner of the");
-    System.out.println("     cropping region, and WxH specifies its width and height.  X,Y must be");
-    System.out.println("     evenly divible by the MCU block size (8x8 if the source image was");
-    System.out.println("     compressed using no subsampling or grayscale, or 16x8 for 4:2:2 or 16x16");
-    System.out.println("     for 4:2:0.)\n");
-    System.out.println("-display = Display output image (Output file need not be specified in this");
+    System.out.println("     Perform one of these lossless transform operations on the input image");
+    System.out.println("     prior to decompressing it (these options are mutually exclusive.)\n");
+
+    System.out.println("-grayscale = Perform lossless grayscale conversion on the input image prior");
+    System.out.println("     to decompressing it (can be combined with the other transform operations");
+    System.out.println("     above.)\n");
+
+    System.out.println("-crop WxH+X+Y = Perform lossless cropping on the input image prior to");
+    System.out.println("     decompressing it.  X and Y specify the upper left corner of the cropping");
+    System.out.println("     region, and W and H specify the width and height of the cropping region.");
+    System.out.println("     X and Y must be evenly divisible by the MCU block size (8x8 if the input");
+    System.out.println("     image was compressed using no subsampling or grayscale, 16x8 if it was");
+    System.out.println("     compressed using 4:2:2 subsampling, or 16x16 if it was compressed using");
+    System.out.println("     4:2:0 subsampling.)\n");
+
+    System.out.println("General Options");
+    System.out.println("---------------\n");
+
+    System.out.println("-display = Display output image (Output filename need not be specified in this");
     System.out.println("     case.)\n");
+
     System.out.println("-fastupsample = Use the fastest chrominance upsampling algorithm available in");
-    System.out.println("     the underlying codec\n");
+    System.out.println("     the underlying codec.\n");
+
     System.out.println("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying");
-    System.out.println("     codec\n");
+    System.out.println("     codec.\n");
+
     System.out.println("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the");
-    System.out.println("     underlying codec\n");
+    System.out.println("     underlying codec.\n");
+
     System.exit(1);
   }
 
-  private static final String[] sampName = {
-    "4:4:4", "4:2:2", "4:2:0", "Grayscale", "4:4:0", "4:1:1"
-  };
 
   public static void main(String[] argv) {
 
-    BufferedImage img = null;
-    byte[] bmpBuf = null;
-    TJTransform xform = new TJTransform();
-    int flags = 0;
-
     try {
 
-      sf = TJ.getScalingFactors();
-
-      if (argv.length < 2) {
-        usage();
-      }
-
-      TJScalingFactor scaleFactor = new TJScalingFactor(1, 1);
-      String inFormat = "jpg", outFormat = "jpg";
-      int outSubsamp = -1, outQual = 95;
+      TJScalingFactor scalingFactor = new TJScalingFactor(1, 1);
+      int outSubsamp = -1, outQual = -1;
+      TJTransform xform = new TJTransform();
       boolean display = false;
+      int flags = 0;
+      int width, height;
+      String inFormat = "jpg", outFormat = "jpg";
+      BufferedImage img = null;
+      byte[] imgBuf = null;
+
+      if (argv.length < 2)
+        usage();
 
       if (argv[1].substring(0, 2).equalsIgnoreCase("-d"))
         display = true;
 
+      /* Parse arguments. */
       for (int i = 2; i < argv.length; i++) {
         if (argv[i].length() < 2)
           continue;
         else if (argv[i].length() > 2 &&
-            argv[i].substring(0, 3).equalsIgnoreCase("-sc")) {
+                 argv[i].substring(0, 3).equalsIgnoreCase("-sc") &&
+                 i < argv.length - 1) {
           int match = 0;
-          if (i < argv.length - 1) {
-            String[] scaleArg = argv[++i].split("/");
-            if (scaleArg.length == 2) {
-              TJScalingFactor tempsf =
-                new TJScalingFactor(Integer.parseInt(scaleArg[0]),
-                                    Integer.parseInt(scaleArg[1]));
-              for (int j = 0; j < sf.length; j++) {
-                if (tempsf.equals(sf[j])) {
-                  scaleFactor = sf[j];
-                  match = 1;
-                  break;
-                }
+          String[] scaleArg = argv[++i].split("/");
+          if (scaleArg.length == 2) {
+            TJScalingFactor tempsf =
+              new TJScalingFactor(Integer.parseInt(scaleArg[0]),
+                                  Integer.parseInt(scaleArg[1]));
+            for (int j = 0; j < scalingFactors.length; j++) {
+              if (tempsf.equals(scalingFactors[j])) {
+                scalingFactor = scalingFactors[j];
+                match = 1;
+                break;
               }
             }
           }
-          if (match != 1) usage();
-        }
-        else if (argv[i].length() > 2 &&
-            argv[i].substring(0, 3).equalsIgnoreCase("-sa")) {
-          if (i < argv.length - 1) {
-            i++;
-            if (argv[i].substring(0, 1).equalsIgnoreCase("g"))
-              outSubsamp = TJ.SAMP_GRAY;
-            else if (argv[i].equals("444"))
-              outSubsamp = TJ.SAMP_444;
-            else if (argv[i].equals("422"))
-              outSubsamp = TJ.SAMP_422;
-            else if (argv[i].equals("420"))
-              outSubsamp = TJ.SAMP_420;
-            else
-              usage();
-          } else
+          if (match != 1)
             usage();
-        }
-        else if (argv[i].substring(0, 2).equalsIgnoreCase("-q")) {
-          if (i < argv.length - 1) {
-            int qual = Integer.parseInt(argv[++i]);
-            if (qual >= 1 && qual <= 100)
-              outQual = qual;
-            else
-              usage();
-          } else
+        } else if (argv[i].length() > 2 &&
+                   argv[i].substring(0, 3).equalsIgnoreCase("-su") &&
+                   i < argv.length - 1) {
+          i++;
+          if (argv[i].substring(0, 1).equalsIgnoreCase("g"))
+            outSubsamp = TJ.SAMP_GRAY;
+          else if (argv[i].equals("444"))
+            outSubsamp = TJ.SAMP_444;
+          else if (argv[i].equals("422"))
+            outSubsamp = TJ.SAMP_422;
+          else if (argv[i].equals("420"))
+            outSubsamp = TJ.SAMP_420;
+          else
             usage();
-        }
-        else if (argv[i].substring(0, 2).equalsIgnoreCase("-g"))
+        } else if (argv[i].substring(0, 2).equalsIgnoreCase("-q") &&
+                   i < argv.length - 1) {
+          outQual = Integer.parseInt(argv[++i]);
+          if (outQual < 1 || outQual > 100)
+            usage();
+        } else if (argv[i].substring(0, 2).equalsIgnoreCase("-g"))
           xform.options |= TJTransform.OPT_GRAY;
         else if (argv[i].equalsIgnoreCase("-hflip"))
           xform.op = TJTransform.OP_HFLIP;
@@ -190,43 +225,34 @@
         else if (argv[i].equalsIgnoreCase("-custom"))
           xform.cf = new TJExample();
         else if (argv[i].length() > 2 &&
-                 argv[i].substring(0, 2).equalsIgnoreCase("-c")) {
-          if (i >= argv.length - 1)
+                 argv[i].substring(0, 2).equalsIgnoreCase("-c") &&
+                 i < argv.length - 1) {
+          String[] cropArg = argv[++i].split("[x\\+]");
+          if (cropArg.length != 4)
             usage();
-          String[] cropArg = argv[++i].split(",");
-          if (cropArg.length != 3)
+          xform.width = Integer.parseInt(cropArg[0]);
+          xform.height = Integer.parseInt(cropArg[1]);
+          xform.x = Integer.parseInt(cropArg[2]);
+          xform.y = Integer.parseInt(cropArg[3]);
+          if (xform.x < 0 || xform.y < 0 || xform.width < 1 ||
+              xform.height < 1)
             usage();
-          String[] dimArg = cropArg[2].split("[xX]");
-          if (dimArg.length != 2)
-            usage();
-          int tempx = Integer.parseInt(cropArg[0]);
-          int tempy = Integer.parseInt(cropArg[1]);
-          int tempw = Integer.parseInt(dimArg[0]);
-          int temph = Integer.parseInt(dimArg[1]);
-          if (tempx < 0 || tempy < 0 || tempw < 0 || temph < 0)
-            usage();
-          xform.x = tempx;
-          xform.y = tempy;
-          xform.width = tempw;
-          xform.height = temph;
           xform.options |= TJTransform.OPT_CROP;
-        }
-        else if (argv[i].substring(0, 2).equalsIgnoreCase("-d"))
+        } else if (argv[i].substring(0, 2).equalsIgnoreCase("-d"))
           display = true;
         else if (argv[i].equalsIgnoreCase("-fastupsample")) {
           System.out.println("Using fast upsampling code");
           flags |= TJ.FLAG_FASTUPSAMPLE;
-        }
-        else if (argv[i].equalsIgnoreCase("-fastdct")) {
+        } else if (argv[i].equalsIgnoreCase("-fastdct")) {
           System.out.println("Using fastest DCT/IDCT algorithm");
           flags |= TJ.FLAG_FASTDCT;
-        }
-        else if (argv[i].equalsIgnoreCase("-accuratedct")) {
+        } else if (argv[i].equalsIgnoreCase("-accuratedct")) {
           System.out.println("Using most accurate DCT/IDCT algorithm");
           flags |= TJ.FLAG_ACCURATEDCT;
-        }
-        else usage();
+        } else usage();
       }
+
+      /* Determine input and output image formats based on file extensions. */
       String[] inFileTokens = argv[0].split("\\.");
       if (inFileTokens.length > 1)
         inFormat = inFileTokens[inFileTokens.length - 1];
@@ -239,61 +265,75 @@
           outFormat = outFileTokens[outFileTokens.length - 1];
       }
 
-      File file = new File(argv[0]);
-      int width, height;
-
       if (inFormat.equalsIgnoreCase("jpg")) {
-        FileInputStream fis = new FileInputStream(file);
-        int inputSize = fis.available();
-        if (inputSize < 1) {
+        /* Input image is a JPEG image.  Decompress and/or transform it. */
+        boolean doTransform = (xform.op != TJTransform.OP_NONE ||
+                               xform.options != 0 || xform.cf != null);
+
+        /* Read the JPEG file into memory. */
+        File jpegFile = new File(argv[0]);
+        FileInputStream fis = new FileInputStream(jpegFile);
+        int jpegSize = fis.available();
+        if (jpegSize < 1) {
           System.out.println("Input file contains no data");
           System.exit(1);
         }
-        byte[] inputBuf = new byte[inputSize];
-        fis.read(inputBuf);
+        byte[] jpegBuf = new byte[jpegSize];
+        fis.read(jpegBuf);
         fis.close();
 
         TJDecompressor tjd;
-        if (xform.op != TJTransform.OP_NONE || xform.options != 0 ||
-            xform.cf != null) {
-          TJTransformer tjt = new TJTransformer(inputBuf);
-          TJTransform[] t = new TJTransform[1];
-          t[0] = xform;
-          t[0].options |= TJTransform.OPT_TRIM;
-          TJDecompressor[] tjdx = tjt.transform(t, 0);
-          tjd = tjdx[0];
+        if (doTransform) {
+          /* Transform it. */
+          TJTransformer tjt = new TJTransformer(jpegBuf);
+          TJTransform[] xforms = new TJTransform[1];
+          xforms[0] = xform;
+          xforms[0].options |= TJTransform.OPT_TRIM;
+          TJDecompressor[] tjds = tjt.transform(xforms, 0);
+          tjd = tjds[0];
+          tjt.close();
         } else
-          tjd = new TJDecompressor(inputBuf);
+          tjd = new TJDecompressor(jpegBuf);
 
         width = tjd.getWidth();
         height = tjd.getHeight();
         int inSubsamp = tjd.getSubsamp();
-        System.out.println("Source Image: " + width + " x " + height +
-                           " pixels, " + sampName[inSubsamp] + " subsampling");
-        if (outSubsamp < 0)
-          outSubsamp = inSubsamp;
+        int inColorspace = tjd.getColorspace();
 
-        if (outFormat.equalsIgnoreCase("jpg") &&
-            (xform.op != TJTransform.OP_NONE || xform.options != 0) &&
-            scaleFactor.isOne()) {
-          file = new File(argv[1]);
-          FileOutputStream fos = new FileOutputStream(file);
+        System.out.println((doTransform ? "Transformed" : "Input") +
+                           " Image (jpg):  " + width + " x " + height +
+                           " pixels, " + subsampName[inSubsamp] +
+                           " subsampling, " + colorspaceName[inColorspace]);
+
+        if (outFormat.equalsIgnoreCase("jpg") && doTransform &&
+            scalingFactor.isOne() && outSubsamp < 0 && outQual < 0) {
+          /* Input image has been transformed, and no re-compression options
+             have been selected.  Write the transformed image to disk and
+             exit. */
+          File outFile = new File(argv[1]);
+          FileOutputStream fos = new FileOutputStream(outFile);
           fos.write(tjd.getJPEGBuf(), 0, tjd.getJPEGSize());
           fos.close();
           System.exit(0);
         }
 
-        width = scaleFactor.getScaled(width);
-        height = scaleFactor.getScaled(height);
+        /* Scaling and/or a non-JPEG output image format and/or compression
+           options have been selected, so we need to decompress the
+           input/transformed image. */
+        width = scalingFactor.getScaled(width);
+        height = scalingFactor.getScaled(height);
+        if (outSubsamp < 0)
+          outSubsamp = inSubsamp;
 
         if (!outFormat.equalsIgnoreCase("jpg"))
           img = tjd.decompress(width, height, BufferedImage.TYPE_INT_RGB,
                                flags);
         else
-          bmpBuf = tjd.decompress(width, 0, height, TJ.PF_BGRX, flags);
+          imgBuf = tjd.decompress(width, 0, height, TJ.PF_BGRX, flags);
         tjd.close();
       } else {
-        img = ImageIO.read(file);
+        /* Input image is not a JPEG image.  Load it into memory. */
+        img = ImageIO.read(new File(argv[0]));
         if (img == null)
           throw new Exception("Input image type not supported.");
         width = img.getWidth();
@@ -302,61 +342,58 @@
           if (img.getType() == BufferedImage.TYPE_BYTE_GRAY)
             outSubsamp = TJ.SAMP_GRAY;
           else
-            outSubsamp = TJ.SAMP_444;
+            outSubsamp = DEFAULT_SUBSAMP;
         }
+        System.out.println("Input Image:  " + width + " x " + height +
+                           " pixels");
       }
       System.gc();
       if (!display)
-        System.out.print("Dest. Image (" + outFormat + "):  " + width + " x " +
-                         height + " pixels");
+        System.out.print("Output Image (" + outFormat + "):  " + width +
+                         " x " + height + " pixels");
 
       if (display) {
+        /* Display the uncompressed image */
         ImageIcon icon = new ImageIcon(img);
         JLabel label = new JLabel(icon, JLabel.CENTER);
         JOptionPane.showMessageDialog(null, label, "Output Image",
                                       JOptionPane.PLAIN_MESSAGE);
       } else if (outFormat.equalsIgnoreCase("jpg")) {
-        System.out.println(", " + sampName[outSubsamp] +
+        /* Output image format is JPEG.  Compress the uncompressed image. */
+        if (outQual < 0)
+          outQual = DEFAULT_QUALITY;
+        System.out.println(", " + subsampName[outSubsamp] +
                            " subsampling, quality = " + outQual);
-        TJCompressor tjc = new TJCompressor();
-        int jpegSize;
-        byte[] jpegBuf;
 
+        TJCompressor tjc = new TJCompressor();
         tjc.setSubsamp(outSubsamp);
         tjc.setJPEGQuality(outQual);
         if (img != null)
           tjc.setSourceImage(img, 0, 0, 0, 0);
-        else {
-          tjc.setSourceImage(bmpBuf, 0, 0, width, 0, height, TJ.PF_BGRX);
-        }
-        jpegBuf = tjc.compress(flags);
-        jpegSize = tjc.getCompressedSize();
+        else
+          tjc.setSourceImage(imgBuf, 0, 0, width, 0, height, TJ.PF_BGRX);
+        byte[] jpegBuf = tjc.compress(flags);
+        int jpegSize = tjc.getCompressedSize();
         tjc.close();
 
-        file = new File(argv[1]);
-        FileOutputStream fos = new FileOutputStream(file);
+        /* Write the JPEG image to disk. */
+        File outFile = new File(argv[1]);
+        FileOutputStream fos = new FileOutputStream(outFile);
         fos.write(jpegBuf, 0, jpegSize);
         fos.close();
       } else {
+        /* Output image format is not JPEG.  Save the uncompressed image
+           directly to disk. */
         System.out.print("\n");
-        file = new File(argv[1]);
-        ImageIO.write(img, outFormat, file);
+        File outFile = new File(argv[1]);
+        ImageIO.write(img, outFormat, outFile);
       }
 
-    } catch(Exception e) {
+    } catch (Exception e) {
       e.printStackTrace();
       System.exit(-1);
     }
   }
 
-  public void customFilter(ShortBuffer coeffBuffer, Rectangle bufferRegion,
-                           Rectangle planeRegion, int componentIndex,
-                           int transformIndex, TJTransform transform)
-                           throws TJException {
-    for (int i = 0; i < bufferRegion.width * bufferRegion.height; i++) {
-      coeffBuffer.put(i, (short)(-coeffBuffer.get(i)));
-    }
-  }
-
-  static TJScalingFactor[] sf = null;
+  private static final TJScalingFactor[] scalingFactors = TJ.getScalingFactors();
 };
diff --git a/java/TJUnitTest.java b/java/TJUnitTest.java
index 47ff7bb..4bb57d9 100644
--- a/java/TJUnitTest.java
+++ b/java/TJUnitTest.java
@@ -64,10 +64,6 @@
     "RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
   };
 
-  private static final int[] alphaOffset = {
-    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
-  };
-
   private static final int[] _3byteFormats = {
     TJ.PF_RGB, TJ.PF_BGR
   };
@@ -100,43 +96,43 @@
 
   private static int biTypePF(int biType) {
     ByteOrder byteOrder = ByteOrder.nativeOrder();
-    switch(biType) {
-      case BufferedImage.TYPE_3BYTE_BGR:
-        return TJ.PF_BGR;
-      case BufferedImage.TYPE_4BYTE_ABGR:
-      case BufferedImage.TYPE_4BYTE_ABGR_PRE:
-        return TJ.PF_ABGR;
-      case BufferedImage.TYPE_BYTE_GRAY:
-        return TJ.PF_GRAY;
-      case BufferedImage.TYPE_INT_BGR:
-        return TJ.PF_RGBX;
-      case BufferedImage.TYPE_INT_RGB:
-        return TJ.PF_BGRX;
-      case BufferedImage.TYPE_INT_ARGB:
-      case BufferedImage.TYPE_INT_ARGB_PRE:
-        return TJ.PF_BGRA;
+    switch (biType) {
+    case BufferedImage.TYPE_3BYTE_BGR:
+      return TJ.PF_BGR;
+    case BufferedImage.TYPE_4BYTE_ABGR:
+    case BufferedImage.TYPE_4BYTE_ABGR_PRE:
+      return TJ.PF_ABGR;
+    case BufferedImage.TYPE_BYTE_GRAY:
+      return TJ.PF_GRAY;
+    case BufferedImage.TYPE_INT_BGR:
+      return TJ.PF_RGBX;
+    case BufferedImage.TYPE_INT_RGB:
+      return TJ.PF_BGRX;
+    case BufferedImage.TYPE_INT_ARGB:
+    case BufferedImage.TYPE_INT_ARGB_PRE:
+      return TJ.PF_BGRA;
     }
     return 0;
   }
 
   private static String biTypeStr(int biType) {
-    switch(biType) {
-      case BufferedImage.TYPE_3BYTE_BGR:
-        return "3BYTE_BGR";
-      case BufferedImage.TYPE_4BYTE_ABGR:
-        return "4BYTE_ABGR";
-      case BufferedImage.TYPE_4BYTE_ABGR_PRE:
-        return "4BYTE_ABGR_PRE";
-      case BufferedImage.TYPE_BYTE_GRAY:
-        return "BYTE_GRAY";
-      case BufferedImage.TYPE_INT_BGR:
-        return "INT_BGR";
-      case BufferedImage.TYPE_INT_RGB:
-        return "INT_RGB";
-      case BufferedImage.TYPE_INT_ARGB:
-        return "INT_ARGB";
-      case BufferedImage.TYPE_INT_ARGB_PRE:
-        return "INT_ARGB_PRE";
+    switch (biType) {
+    case BufferedImage.TYPE_3BYTE_BGR:
+      return "3BYTE_BGR";
+    case BufferedImage.TYPE_4BYTE_ABGR:
+      return "4BYTE_ABGR";
+    case BufferedImage.TYPE_4BYTE_ABGR_PRE:
+      return "4BYTE_ABGR_PRE";
+    case BufferedImage.TYPE_BYTE_GRAY:
+      return "BYTE_GRAY";
+    case BufferedImage.TYPE_INT_BGR:
+      return "INT_BGR";
+    case BufferedImage.TYPE_INT_RGB:
+      return "INT_RGB";
+    case BufferedImage.TYPE_INT_ARGB:
+      return "INT_ARGB";
+    case BufferedImage.TYPE_INT_ARGB_PRE:
+      return "INT_ARGB_PRE";
     }
     return "Unknown";
   }
@@ -146,7 +142,7 @@
     int roffset = TJ.getRedOffset(pf);
     int goffset = TJ.getGreenOffset(pf);
     int boffset = TJ.getBlueOffset(pf);
-    int aoffset = alphaOffset[pf];
+    int aoffset = TJ.getAlphaOffset(pf);
     int ps = TJ.getPixelSize(pf);
     int index, row, col, halfway = 16;
 
@@ -215,7 +211,7 @@
     int rshift = TJ.getRedOffset(pf) * 8;
     int gshift = TJ.getGreenOffset(pf) * 8;
     int bshift = TJ.getBlueOffset(pf) * 8;
-    int ashift = alphaOffset[pf] * 8;
+    int ashift = TJ.getAlphaOffset(pf) * 8;
     int index, row, col, halfway = 16;
 
     Arrays.fill(buf, 0);
@@ -246,6 +242,7 @@
                               throws Exception {
     WritableRaster wr = img.getRaster();
     int imgType = img.getType();
+
     if (imgType == BufferedImage.TYPE_INT_RGB ||
         imgType == BufferedImage.TYPE_INT_BGR ||
         imgType == BufferedImage.TYPE_INT_ARGB ||
@@ -298,7 +295,7 @@
     int roffset = TJ.getRedOffset(pf);
     int goffset = TJ.getGreenOffset(pf);
     int boffset = TJ.getBlueOffset(pf);
-    int aoffset = alphaOffset[pf];
+    int aoffset = TJ.getAlphaOffset(pf);
     int ps = TJ.getPixelSize(pf);
     int index, row, col, retval = 1;
     int halfway = 16 * sf.getNum() / sf.getDenom();
@@ -306,6 +303,9 @@
 
     try {
 
+      if (pf == TJ.PF_GRAY)
+        roffset = goffset = boffset = 0;
+
       if (pf == TJ.PF_CMYK) {
         for (row = 0; row < h; row++) {
           for (col = 0; col < w; col++) {
@@ -382,7 +382,7 @@
           checkVal255(row, col, a, "A");
         }
       }
-    } catch(Exception e) {
+    } catch (Exception e) {
       System.out.println("\n" + e.getMessage());
       retval = 0;
     }
@@ -422,7 +422,7 @@
     int rshift = TJ.getRedOffset(pf) * 8;
     int gshift = TJ.getGreenOffset(pf) * 8;
     int bshift = TJ.getBlueOffset(pf) * 8;
-    int ashift = alphaOffset[pf] * 8;
+    int ashift = TJ.getAlphaOffset(pf) * 8;
     int index, row, col, retval = 1;
     int halfway = 16 * sf.getNum() / sf.getDenom();
     int blockSize = 8 * sf.getNum() / sf.getDenom();
@@ -472,7 +472,7 @@
           checkVal255(row, col, a, "A");
         }
       }
-    } catch(Exception e) {
+    } catch (Exception e) {
       System.out.println("\n" + e.getMessage());
       retval = 0;
     }
@@ -579,7 +579,7 @@
           }
         }
       }
-    } catch(Exception e) {
+    } catch (Exception e) {
       System.out.println("\n" + e.getMessage());
       retval = 0;
     }
@@ -668,7 +668,7 @@
                         subNameLong[subsamp]);
       YUVImage yuvImage = tjc.encodeYUV(pad, flags);
       if (checkBufYUV(yuvImage.getBuf(), yuvImage.getSize(), w, h, subsamp,
-          new TJScalingFactor(1, 1)) == 1)
+                      new TJScalingFactor(1, 1)) == 1)
         System.out.print("Passed.\n");
       else {
         System.out.print("FAILED!\n");
@@ -729,7 +729,7 @@
 
     if (doYUV) {
       System.out.format("JPEG -> YUV %s ", subNameLong[subsamp]);
-      if(!sf.isOne())
+      if (!sf.isOne())
         System.out.format("%d/%d ... ", sf.getNum(), sf.getDenom());
       else System.out.print("... ");
       YUVImage yuvImage = tjd.decompressToYUV(scaledWidth, pad, scaledHeight,
@@ -746,7 +746,7 @@
       tjd.setSourceImage(yuvImage);
     } else {
       System.out.format("JPEG -> %s %s ", pfStrLong, buStrLong);
-      if(!sf.isOne())
+      if (!sf.isOne())
         System.out.format("%d/%d ... ", sf.getNum(), sf.getDenom());
       else System.out.print("... ");
     }
@@ -828,7 +828,7 @@
         }
       }
       System.out.print("--------------------\n\n");
-    } catch(Exception e) {
+    } catch (Exception e) {
       if (tjc != null) tjc.close();
       if (tjd != null) tjd.close();
       throw e;
@@ -889,7 +889,7 @@
         }
       }
       System.out.println("Done.      ");
-    } catch(Exception e) {
+    } catch (Exception e) {
       if (tjc != null) tjc.close();
       throw e;
     }
@@ -950,7 +950,7 @@
         doTest(48, 48, onlyRGB, TJ.SAMP_GRAY, "javatest_yuv0");
         doTest(48, 48, onlyGray, TJ.SAMP_GRAY, "javatest_yuv0");
       }
-    } catch(Exception e) {
+    } catch (Exception e) {
       e.printStackTrace();
       exitStatus = -1;
     }
diff --git a/java/doc/constant-values.html b/java/doc/constant-values.html
index ec1b21d..fb33327 100644
--- a/java/doc/constant-values.html
+++ b/java/doc/constant-values.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="Constant Field Values";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="Constant Field Values";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -115,6 +119,20 @@
 <td class="colLast"><code>4</code></td>
 </tr>
 <tr class="rowColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.ERR_FATAL">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#ERR_FATAL">ERR_FATAL</a></code></td>
+<td class="colLast"><code>1</code></td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.ERR_WARNING">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#ERR_WARNING">ERR_WARNING</a></code></td>
+<td class="colLast"><code>0</code></td>
+</tr>
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.FLAG_ACCURATEDCT">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
@@ -171,6 +189,20 @@
 <td class="colLast"><code>128</code></td>
 </tr>
 <tr class="rowColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.FLAG_PROGRESSIVE">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#FLAG_PROGRESSIVE">FLAG_PROGRESSIVE</a></code></td>
+<td class="colLast"><code>16384</code></td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.FLAG_STOPONWARNING">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING">FLAG_STOPONWARNING</a></code></td>
+<td class="colLast"><code>8192</code></td>
+</tr>
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.NUMCS">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
@@ -178,139 +210,146 @@
 <td class="colLast"><code>5</code></td>
 </tr>
 <tr class="altColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.NUMERR">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#NUMERR">NUMERR</a></code></td>
+<td class="colLast"><code>2</code></td>
+</tr>
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.NUMPF">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#NUMPF">NUMPF</a></code></td>
 <td class="colLast"><code>12</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.NUMSAMP">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#NUMSAMP">NUMSAMP</a></code></td>
 <td class="colLast"><code>6</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_ABGR">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_ABGR">PF_ABGR</a></code></td>
 <td class="colLast"><code>9</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_ARGB">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_ARGB">PF_ARGB</a></code></td>
 <td class="colLast"><code>10</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_BGR">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_BGR">PF_BGR</a></code></td>
 <td class="colLast"><code>1</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_BGRA">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_BGRA">PF_BGRA</a></code></td>
 <td class="colLast"><code>8</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_BGRX">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_BGRX">PF_BGRX</a></code></td>
 <td class="colLast"><code>3</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_CMYK">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</a></code></td>
 <td class="colLast"><code>11</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_GRAY">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</a></code></td>
 <td class="colLast"><code>6</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_RGB">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_RGB">PF_RGB</a></code></td>
 <td class="colLast"><code>0</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_RGBA">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_RGBA">PF_RGBA</a></code></td>
 <td class="colLast"><code>7</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_RGBX">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_RGBX">PF_RGBX</a></code></td>
 <td class="colLast"><code>2</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_XBGR">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_XBGR">PF_XBGR</a></code></td>
 <td class="colLast"><code>4</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.PF_XRGB">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#PF_XRGB">PF_XRGB</a></code></td>
 <td class="colLast"><code>5</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_411">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</a></code></td>
 <td class="colLast"><code>5</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_420">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</a></code></td>
 <td class="colLast"><code>2</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_422">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#SAMP_422">SAMP_422</a></code></td>
 <td class="colLast"><code>1</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_440">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#SAMP_440">SAMP_440</a></code></td>
 <td class="colLast"><code>4</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_444">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJ.html#SAMP_444">SAMP_444</a></code></td>
 <td class="colLast"><code>0</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJ.SAMP_GRAY">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
@@ -393,33 +432,47 @@
 <td class="colLast"><code>2</code></td>
 </tr>
 <tr class="rowColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_COPYNONE">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_COPYNONE">OPT_COPYNONE</a></code></td>
+<td class="colLast"><code>64</code></td>
+</tr>
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_CROP">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_CROP">OPT_CROP</a></code></td>
 <td class="colLast"><code>4</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_GRAY">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_GRAY">OPT_GRAY</a></code></td>
 <td class="colLast"><code>8</code></td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_NOOUTPUT">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_NOOUTPUT">OPT_NOOUTPUT</a></code></td>
 <td class="colLast"><code>16</code></td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_PERFECT">
 <!--   -->
 </a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
 <td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_PERFECT">OPT_PERFECT</a></code></td>
 <td class="colLast"><code>1</code></td>
 </tr>
+<tr class="altColor">
+<td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_PROGRESSIVE">
+<!--   -->
+</a><code>public&nbsp;static&nbsp;final&nbsp;int</code></td>
+<td><code><a href="org/libjpegturbo/turbojpeg/TJTransform.html#OPT_PROGRESSIVE">OPT_PROGRESSIVE</a></code></td>
+<td class="colLast"><code>32</code></td>
+</tr>
 <tr class="rowColor">
 <td class="colFirst"><a name="org.libjpegturbo.turbojpeg.TJTransform.OPT_TRIM">
 <!--   -->
diff --git a/java/doc/deprecated-list.html b/java/doc/deprecated-list.html
index e47ffb1..31d4e64 100644
--- a/java/doc/deprecated-list.html
+++ b/java/doc/deprecated-list.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="Deprecated List";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="Deprecated List";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/help-doc.html b/java/doc/help-doc.html
index ce749a9..6645d95 100644
--- a/java/doc/help-doc.html
+++ b/java/doc/help-doc.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="API Help";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="API Help";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/index-all.html b/java/doc/index-all.html
index a02d9c4..c2c59d8 100644
--- a/java/doc/index-all.html
+++ b/java/doc/index-all.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="Index";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="Index";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -278,6 +282,15 @@
 <div class="block">Returns true or false, depending on whether this instance and
  <code>other</code> have the same numerator and denominator.</div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#ERR_FATAL">ERR_FATAL</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">The error was fatal and non-recoverable.</div>
+</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#ERR_WARNING">ERR_WARNING</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">The error was non-fatal and recoverable, but the image may still be
+ corrupt.</div>
+</dd>
 </dl>
 <a name="_F_">
 <!--   -->
@@ -324,12 +337,27 @@
 <dd>
 <div class="block"><span class="strong">Deprecated.</span></div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_PROGRESSIVE">FLAG_PROGRESSIVE</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">Use progressive entropy coding in JPEG images generated by compression and
+ transform operations.</div>
+</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING">FLAG_STOPONWARNING</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">Immediately discontinue the current compression/decompression/transform
+ operation if the underlying codec throws a warning (non-fatal error).</div>
+</dd>
 </dl>
 <a name="_G_">
 <!--   -->
 </a>
 <h2 class="title">G</h2>
 <dl>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#getAlphaOffset(int)">getAlphaOffset(int)</a></span> - Static method in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">For the given pixel format, returns the number of bytes that the alpha
+ component is offset from the start of the pixel.</div>
+</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#getBlueOffset(int)">getBlueOffset(int)</a></span> - Static method in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
 <dd>
 <div class="block">For the given pixel format, returns the number of bytes that the blue
@@ -354,6 +382,11 @@
 <dd>
 <div class="block">Returns denominator</div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#getErrorCode()">getErrorCode()</a></span> - Method in exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>
+<div class="block">Returns a code (one of <a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.ERR_*</code></a>) indicating the severity of the
+ last error.</div>
+</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#getGreenOffset(int)">getGreenOffset(int)</a></span> - Static method in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
 <dd>
 <div class="block">For the given pixel format, returns the number of bytes that the green
@@ -516,6 +549,10 @@
 <dd>
 <div class="block">The number of JPEG colorspaces</div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJ.html#NUMERR">NUMERR</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg">TJ</a></dt>
+<dd>
+<div class="block">The number of error codes</div>
+</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJTransform.html#NUMOP">NUMOP</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></dt>
 <dd>
 <div class="block">The number of lossless transform operations</div>
@@ -571,6 +608,11 @@
 <dd>
 <div class="block">Flip (mirror) image vertically.</div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJTransform.html#OPT_COPYNONE">OPT_COPYNONE</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></dt>
+<dd>
+<div class="block">This option will prevent <a href="./org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> from copying any extra markers (including EXIF
+ and ICC profile data) from the source image to the output image.</div>
+</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJTransform.html#OPT_CROP">OPT_CROP</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></dt>
 <dd>
 <div class="block">This option will enable lossless cropping.</div>
@@ -590,6 +632,11 @@
 <div class="block">This option will cause <a href="./org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> to throw an exception if the transform is not
  perfect.</div>
 </dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJTransform.html#OPT_PROGRESSIVE">OPT_PROGRESSIVE</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></dt>
+<dd>
+<div class="block">This option will enable progressive entropy coding in the output image
+ generated by this particular transform.</div>
+</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJTransform.html#OPT_TRIM">OPT_TRIM</a></span> - Static variable in class org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJTransform.html" title="class in org.libjpegturbo.turbojpeg">TJTransform</a></dt>
 <dd>
 <div class="block">This option will discard any partial MCU blocks that cannot be
@@ -829,6 +876,8 @@
 <dd>&nbsp;</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String)">TJException(String)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
 <dd>&nbsp;</dd>
+<dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String,%20int)">TJException(String, int)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
+<dd>&nbsp;</dd>
 <dt><span class="strong"><a href="./org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.Throwable)">TJException(Throwable)</a></span> - Constructor for exception org.libjpegturbo.turbojpeg.<a href="./org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></dt>
 <dd>&nbsp;</dd>
 <dt><a href="./org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg"><span class="strong">TJScalingFactor</span></a> - Class in <a href="./org/libjpegturbo/turbojpeg/package-summary.html">org.libjpegturbo.turbojpeg</a></dt>
diff --git a/java/doc/index.html b/java/doc/index.html
index b983957..4e21075 100644
--- a/java/doc/index.html
+++ b/java/doc/index.html
@@ -4,11 +4,12 @@
 <head>
 <title>Generated Documentation (Untitled)</title>
 <script type="text/javascript">
-    targetPage = "" + window.location.search;
-    if (targetPage != "" && targetPage != "undefined")
-        targetPage = targetPage.substring(1);
-    if (targetPage.indexOf(":") != -1 || (targetPage != "" && !validURL(targetPage)))
-        targetPage = "undefined";
+    tmpTargetPage = "" + window.location.search;
+    if (tmpTargetPage != "" && tmpTargetPage != "undefined")
+        tmpTargetPage = tmpTargetPage.substring(1);
+    if (tmpTargetPage.indexOf(":") != -1 || (tmpTargetPage != "" && !validURL(tmpTargetPage)))
+        tmpTargetPage = "undefined";
+    targetPage = tmpTargetPage;
     function validURL(url) {
         try {
             url = decodeURIComponent(url);
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJ.html b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
index ffef657..abcc272 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJ.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJ.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJ";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJ";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -145,6 +149,19 @@
 </tr>
 <tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#ERR_FATAL">ERR_FATAL</a></strong></code>
+<div class="block">The error was fatal and non-recoverable.</div>
+</td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#ERR_WARNING">ERR_WARNING</a></strong></code>
+<div class="block">The error was non-fatal and recoverable, but the image may still be
+ corrupt.</div>
+</td>
+</tr>
+<tr class="rowColor">
+<td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_ACCURATEDCT">FLAG_ACCURATEDCT</a></strong></code>
 <div class="block">Use the most accurate DCT/IDCT algorithm available in the underlying
  codec.</div>
@@ -197,125 +214,145 @@
 </tr>
 <tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_PROGRESSIVE">FLAG_PROGRESSIVE</a></strong></code>
+<div class="block">Use progressive entropy coding in JPEG images generated by compression and
+ transform operations.</div>
+</td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING">FLAG_STOPONWARNING</a></strong></code>
+<div class="block">Immediately discontinue the current compression/decompression/transform
+ operation if the underlying codec throws a warning (non-fatal error).</div>
+</td>
+</tr>
+<tr class="rowColor">
+<td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMCS">NUMCS</a></strong></code>
 <div class="block">The number of JPEG colorspaces</div>
 </td>
 </tr>
 <tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMERR">NUMERR</a></strong></code>
+<div class="block">The number of error codes</div>
+</td>
+</tr>
+<tr class="rowColor">
+<td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMPF">NUMPF</a></strong></code>
 <div class="block">The number of pixel formats</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#NUMSAMP">NUMSAMP</a></strong></code>
 <div class="block">The number of chrominance subsampling options</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_ABGR">PF_ABGR</a></strong></code>
 <div class="block">ABGR pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_ARGB">PF_ARGB</a></strong></code>
 <div class="block">ARGB pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_BGR">PF_BGR</a></strong></code>
 <div class="block">BGR pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_BGRA">PF_BGRA</a></strong></code>
 <div class="block">BGRA pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_BGRX">PF_BGRX</a></strong></code>
 <div class="block">BGRX pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_CMYK">PF_CMYK</a></strong></code>
 <div class="block">CMYK pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_GRAY">PF_GRAY</a></strong></code>
 <div class="block">Grayscale pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGB">PF_RGB</a></strong></code>
 <div class="block">RGB pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGBA">PF_RGBA</a></strong></code>
 <div class="block">RGBA pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_RGBX">PF_RGBX</a></strong></code>
 <div class="block">RGBX pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_XBGR">PF_XBGR</a></strong></code>
 <div class="block">XBGR pixel format.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#PF_XRGB">PF_XRGB</a></strong></code>
 <div class="block">XRGB pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_411">SAMP_411</a></strong></code>
 <div class="block">4:1:1 chrominance subsampling.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_420">SAMP_420</a></strong></code>
 <div class="block">4:2:0 chrominance subsampling.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_422">SAMP_422</a></strong></code>
 <div class="block">4:2:2 chrominance subsampling.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_440">SAMP_440</a></strong></code>
 <div class="block">4:4:0 chrominance subsampling.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_444">SAMP_444</a></strong></code>
 <div class="block">4:4:4 chrominance subsampling (no chrominance subsampling).</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#SAMP_GRAY">SAMP_GRAY</a></strong></code>
 <div class="block">Grayscale.</div>
@@ -384,53 +421,60 @@
 </tr>
 <tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getAlphaOffset(int)">getAlphaOffset</a></strong>(int&nbsp;pixelFormat)</code>
+<div class="block">For the given pixel format, returns the number of bytes that the alpha
+ component is offset from the start of the pixel.</div>
+</td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getBlueOffset(int)">getBlueOffset</a></strong>(int&nbsp;pixelFormat)</code>
 <div class="block">For the given pixel format, returns the number of bytes that the blue
  component is offset from the start of the pixel.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getGreenOffset(int)">getGreenOffset</a></strong>(int&nbsp;pixelFormat)</code>
 <div class="block">For the given pixel format, returns the number of bytes that the green
  component is offset from the start of the pixel.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getMCUHeight(int)">getMCUHeight</a></strong>(int&nbsp;subsamp)</code>
 <div class="block">Returns the MCU block height for the given level of chrominance
  subsampling.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getMCUWidth(int)">getMCUWidth</a></strong>(int&nbsp;subsamp)</code>
 <div class="block">Returns the MCU block width for the given level of chrominance
  subsampling.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getPixelSize(int)">getPixelSize</a></strong>(int&nbsp;pixelFormat)</code>
 <div class="block">Returns the pixel size (in bytes) for the given pixel format.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getRedOffset(int)">getRedOffset</a></strong>(int&nbsp;pixelFormat)</code>
 <div class="block">For the given pixel format, returns the number of bytes that the red
  component is offset from the start of the pixel.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static <a href="../../../org/libjpegturbo/turbojpeg/TJScalingFactor.html" title="class in org.libjpegturbo.turbojpeg">TJScalingFactor</a>[]</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#getScalingFactors()">getScalingFactors</a></strong>()</code>
 <div class="block">Returns a list of fractional scaling factors that the JPEG decompressor in
  this implementation of TurboJPEG supports.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#planeHeight(int,%20int,%20int)">planeHeight</a></strong>(int&nbsp;componentID,
            int&nbsp;height,
@@ -438,7 +482,7 @@
 <div class="block">Returns the plane height of a YUV image plane with the given parameters.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#planeSizeYUV(int,%20int,%20int,%20int,%20int)">planeSizeYUV</a></strong>(int&nbsp;componentID,
             int&nbsp;width,
@@ -449,7 +493,7 @@
  plane with the given parameters.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJ.html#planeWidth(int,%20int,%20int)">planeWidth</a></strong>(int&nbsp;componentID,
           int&nbsp;width,
@@ -938,7 +982,7 @@
 <a name="FLAG_ACCURATEDCT">
 <!--   -->
 </a>
-<ul class="blockListLast">
+<ul class="blockList">
 <li class="blockList">
 <h4>FLAG_ACCURATEDCT</h4>
 <pre>public static final&nbsp;int FLAG_ACCURATEDCT</pre>
@@ -952,6 +996,78 @@
 <dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_ACCURATEDCT">Constant Field Values</a></dd></dl>
 </li>
 </ul>
+<a name="FLAG_STOPONWARNING">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>FLAG_STOPONWARNING</h4>
+<pre>public static final&nbsp;int FLAG_STOPONWARNING</pre>
+<div class="block">Immediately discontinue the current compression/decompression/transform
+ operation if the underlying codec throws a warning (non-fatal error).  The
+ default behavior is to allow the operation to complete unless a fatal
+ error is encountered.
+ <p>
+ NOTE: due to the design of the TurboJPEG Java API, only certain methods
+ (specifically, <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><code>TJDecompressor.decompress*()</code></a> methods
+ with a void return type) will complete and leave the output image in a
+ fully recoverable state after a non-fatal error occurs.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_STOPONWARNING">Constant Field Values</a></dd></dl>
+</li>
+</ul>
+<a name="FLAG_PROGRESSIVE">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>FLAG_PROGRESSIVE</h4>
+<pre>public static final&nbsp;int FLAG_PROGRESSIVE</pre>
+<div class="block">Use progressive entropy coding in JPEG images generated by compression and
+ transform operations.  Progressive entropy coding will generally improve
+ compression relative to baseline entropy coding (the default), but it will
+ reduce compression and decompression performance considerably.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.FLAG_PROGRESSIVE">Constant Field Values</a></dd></dl>
+</li>
+</ul>
+<a name="NUMERR">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>NUMERR</h4>
+<pre>public static final&nbsp;int NUMERR</pre>
+<div class="block">The number of error codes</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.NUMERR">Constant Field Values</a></dd></dl>
+</li>
+</ul>
+<a name="ERR_WARNING">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>ERR_WARNING</h4>
+<pre>public static final&nbsp;int ERR_WARNING</pre>
+<div class="block">The error was non-fatal and recoverable, but the image may still be
+ corrupt.
+ <p>
+ NOTE: due to the design of the TurboJPEG Java API, only certain methods
+ (specifically, <a href="../../../org/libjpegturbo/turbojpeg/TJDecompressor.html" title="class in org.libjpegturbo.turbojpeg"><code>TJDecompressor.decompress*()</code></a> methods
+ with a void return type) will complete and leave the output image in a
+ fully recoverable state after a non-fatal error occurs.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.ERR_WARNING">Constant Field Values</a></dd></dl>
+</li>
+</ul>
+<a name="ERR_FATAL">
+<!--   -->
+</a>
+<ul class="blockListLast">
+<li class="blockList">
+<h4>ERR_FATAL</h4>
+<pre>public static final&nbsp;int ERR_FATAL</pre>
+<div class="block">The error was fatal and non-recoverable.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJ.ERR_FATAL">Constant Field Values</a></dd></dl>
+</li>
+</ul>
 </li>
 </ul>
 <!-- ========= CONSTRUCTOR DETAIL ======== -->
@@ -1032,7 +1148,8 @@
  then the red component will be
  <code>pixel[TJ.getRedOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the red offset for the given pixel format.</dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the red offset for the given pixel format, or -1 if the pixel
+ format does not have a red component.</dd></dl>
 </li>
 </ul>
 <a name="getGreenOffset(int)">
@@ -1048,7 +1165,8 @@
  then the green component will be
  <code>pixel[TJ.getGreenOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the green offset for the given pixel format.</dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the green offset for the given pixel format, or -1 if the pixel
+ format does not have a green component.</dd></dl>
 </li>
 </ul>
 <a name="getBlueOffset(int)">
@@ -1064,7 +1182,25 @@
  then the blue component will be
  <code>pixel[TJ.getBlueOffset(TJ.PF_BGRX)]</code>.</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
-<dt><span class="strong">Returns:</span></dt><dd>the blue offset for the given pixel format.</dd></dl>
+<dt><span class="strong">Returns:</span></dt><dd>the blue offset for the given pixel format, or -1 if the pixel
+ format does not have a blue component.</dd></dl>
+</li>
+</ul>
+<a name="getAlphaOffset(int)">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>getAlphaOffset</h4>
+<pre>public static&nbsp;int&nbsp;getAlphaOffset(int&nbsp;pixelFormat)</pre>
+<div class="block">For the given pixel format, returns the number of bytes that the alpha
+ component is offset from the start of the pixel.  For instance, if a pixel
+ of format <code>TJ.PF_BGRA</code> is stored in <code>char pixel[]</code>,
+ then the alpha component will be
+ <code>pixel[TJ.getAlphaOffset(TJ.PF_BGRA)]</code>.</div>
+<dl><dt><span class="strong">Parameters:</span></dt><dd><code>pixelFormat</code> - the pixel format (one of <code>PF_*</code>)</dd>
+<dt><span class="strong">Returns:</span></dt><dd>the alpha offset for the given pixel format, or -1 if the pixel
+ format does not have an alpha component.</dd></dl>
 </li>
 </ul>
 <a name="bufSize(int, int, int)">
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
index 29f12b7..ea8c2be 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJCompressor";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJCompressor";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
index 6bd6fd2..412dcd4 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJCustomFilter";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJCustomFilter";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
index a914de9..b281e32 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJDecompressor";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJDecompressor";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -785,7 +789,11 @@
                 throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a grayscale, RGB, or CMYK image
- to the given destination buffer.</div>
+ to the given destination buffer.
+ <p>
+ NOTE: The output image is fully recoverable if this method throws a
+ non-fatal <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><code>TJException</code></a> (unless
+ <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING"><code>TJ.FLAG_STOPONWARNING</code></a> is specified).</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>dstBuf</code> - buffer that will receive the decompressed/decoded image.
  If the source image is a JPEG image, then this buffer should normally be
  <code>pitch * scaledHeight</code> bytes in size, where
@@ -895,7 +903,11 @@
  <code>YUVImage</code> instance.  This method performs JPEG decompression
  but leaves out the color conversion step, so a planar YUV image is
  generated instead of an RGB or grayscale image.  This method cannot be
- used to decompress JPEG source images with the CMYK or YCCK colorspace.</div>
+ used to decompress JPEG source images with the CMYK or YCCK colorspace.
+ <p>
+ NOTE: The YUV planar output image is fully recoverable if this method
+ throws a non-fatal <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><code>TJException</code></a> (unless
+ <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING"><code>TJ.FLAG_STOPONWARNING</code></a> is specified).</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>dstImage</code> - <a href="../../../org/libjpegturbo/turbojpeg/YUVImage.html" title="class in org.libjpegturbo.turbojpeg"><code>YUVImage</code></a> instance that will receive the YUV planar
  image.  The level of subsampling specified in this <code>YUVImage</code>
  instance must match that of the JPEG image, and the width and height
@@ -1035,7 +1047,11 @@
                 throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a grayscale, RGB, or CMYK image
- to the given destination buffer.</div>
+ to the given destination buffer.
+ <p>
+ NOTE: The output image is fully recoverable if this method throws a
+ non-fatal <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><code>TJException</code></a> (unless
+ <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING"><code>TJ.FLAG_STOPONWARNING</code></a> is specified).</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>dstBuf</code> - buffer that will receive the decompressed/decoded image.
  If the source image is a JPEG image, then this buffer should normally be
  <code>stride * scaledHeight</code> pixels in size, where
@@ -1092,7 +1108,11 @@
                 throws <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg">TJException</a></pre>
 <div class="block">Decompress the JPEG source image or decode the YUV source image associated
  with this decompressor instance and output a decompressed/decoded image to
- the given <code>BufferedImage</code> instance.</div>
+ the given <code>BufferedImage</code> instance.
+ <p>
+ NOTE: The output image is fully recoverable if this method throws a
+ non-fatal <a href="../../../org/libjpegturbo/turbojpeg/TJException.html" title="class in org.libjpegturbo.turbojpeg"><code>TJException</code></a> (unless
+ <a href="../../../org/libjpegturbo/turbojpeg/TJ.html#FLAG_STOPONWARNING"><code>TJ.FLAG_STOPONWARNING</code></a> is specified).</div>
 <dl><dt><span class="strong">Parameters:</span></dt><dd><code>dstImage</code> - a <code>BufferedImage</code> instance that will receive
  the decompressed/decoded image.  If the source image is a JPEG image, then
  the width and height of the <code>BufferedImage</code> instance must match
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJException.html b/java/doc/org/libjpegturbo/turbojpeg/TJException.html
index 6088066..66d73e7 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJException.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJException.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJException";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJException";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -60,13 +64,13 @@
 <li>Nested&nbsp;|&nbsp;</li>
 <li>Field&nbsp;|&nbsp;</li>
 <li><a href="#constructor_summary">Constr</a>&nbsp;|&nbsp;</li>
-<li><a href="#methods_inherited_from_class_java.lang.Throwable">Method</a></li>
+<li><a href="#method_summary">Method</a></li>
 </ul>
 <ul class="subNavList">
 <li>Detail:&nbsp;</li>
 <li>Field&nbsp;|&nbsp;</li>
 <li><a href="#constructor_detail">Constr</a>&nbsp;|&nbsp;</li>
-<li>Method</li>
+<li><a href="#method_detail">Method</a></li>
 </ul>
 </div>
 <a name="skip-navbar_top">
@@ -138,10 +142,14 @@
 <td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String)">TJException</a></strong>(java.lang.String&nbsp;message)</code>&nbsp;</td>
 </tr>
 <tr class="altColor">
+<td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String,%20int)">TJException</a></strong>(java.lang.String&nbsp;message,
+           int&nbsp;code)</code>&nbsp;</td>
+</tr>
+<tr class="rowColor">
 <td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.String,%20java.lang.Throwable)">TJException</a></strong>(java.lang.String&nbsp;message,
            java.lang.Throwable&nbsp;cause)</code>&nbsp;</td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colOne"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#TJException(java.lang.Throwable)">TJException</a></strong>(java.lang.Throwable&nbsp;cause)</code>&nbsp;</td>
 </tr>
 </table>
@@ -153,6 +161,20 @@
 <!--   -->
 </a>
 <h3>Method Summary</h3>
+<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Method Summary table, listing methods, and an explanation">
+<caption><span>Methods</span><span class="tabEnd">&nbsp;</span></caption>
+<tr>
+<th class="colFirst" scope="col">Modifier and Type</th>
+<th class="colLast" scope="col">Method and Description</th>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><code>int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJException.html#getErrorCode()">getErrorCode</a></strong>()</code>
+<div class="block">Returns a code (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.ERR_*</code></a>) indicating the severity of the
+ last error.</div>
+</td>
+</tr>
+</table>
 <ul class="blockList">
 <li class="blockList"><a name="methods_inherited_from_class_java.lang.Throwable">
 <!--   -->
@@ -209,6 +231,16 @@
 <pre>public&nbsp;TJException(java.lang.String&nbsp;message)</pre>
 </li>
 </ul>
+<a name="TJException(java.lang.String, int)">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>TJException</h4>
+<pre>public&nbsp;TJException(java.lang.String&nbsp;message,
+           int&nbsp;code)</pre>
+</li>
+</ul>
 <a name="TJException(java.lang.Throwable)">
 <!--   -->
 </a>
@@ -220,6 +252,27 @@
 </ul>
 </li>
 </ul>
+<!-- ============ METHOD DETAIL ========== -->
+<ul class="blockList">
+<li class="blockList"><a name="method_detail">
+<!--   -->
+</a>
+<h3>Method Detail</h3>
+<a name="getErrorCode()">
+<!--   -->
+</a>
+<ul class="blockListLast">
+<li class="blockList">
+<h4>getErrorCode</h4>
+<pre>public&nbsp;int&nbsp;getErrorCode()</pre>
+<div class="block">Returns a code (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.ERR_*</code></a>) indicating the severity of the
+ last error.</div>
+<dl><dt><span class="strong">Returns:</span></dt><dd>a code (one of <a href="../../../org/libjpegturbo/turbojpeg/TJ.html" title="class in org.libjpegturbo.turbojpeg"><code>TJ.ERR_*</code></a>) indicating the severity of the
+ last error.</dd></dl>
+</li>
+</ul>
+</li>
+</ul>
 </li>
 </ul>
 </div>
@@ -270,13 +323,13 @@
 <li>Nested&nbsp;|&nbsp;</li>
 <li>Field&nbsp;|&nbsp;</li>
 <li><a href="#constructor_summary">Constr</a>&nbsp;|&nbsp;</li>
-<li><a href="#methods_inherited_from_class_java.lang.Throwable">Method</a></li>
+<li><a href="#method_summary">Method</a></li>
 </ul>
 <ul class="subNavList">
 <li>Detail:&nbsp;</li>
 <li>Field&nbsp;|&nbsp;</li>
 <li><a href="#constructor_detail">Constr</a>&nbsp;|&nbsp;</li>
-<li>Method</li>
+<li><a href="#method_detail">Method</a></li>
 </ul>
 </div>
 <a name="skip-navbar_bottom">
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html b/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
index 35d6882..7722416 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJScalingFactor";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJScalingFactor";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
index cf65bd2..5f22691 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJTransform";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJTransform";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -217,31 +221,45 @@
 </tr>
 <tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_COPYNONE">OPT_COPYNONE</a></strong></code>
+<div class="block">This option will prevent <a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> from copying any extra markers (including EXIF
+ and ICC profile data) from the source image to the output image.</div>
+</td>
+</tr>
+<tr class="altColor">
+<td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_CROP">OPT_CROP</a></strong></code>
 <div class="block">This option will enable lossless cropping.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_GRAY">OPT_GRAY</a></strong></code>
 <div class="block">This option will discard the color data in the input image and produce
  a grayscale output image.</div>
 </td>
 </tr>
-<tr class="rowColor">
+<tr class="altColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_NOOUTPUT">OPT_NOOUTPUT</a></strong></code>
 <div class="block">This option will prevent <a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> from outputting a JPEG image for this
  particular transform.</div>
 </td>
 </tr>
-<tr class="altColor">
+<tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_PERFECT">OPT_PERFECT</a></strong></code>
 <div class="block">This option will cause <a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> to throw an exception if the transform is not
  perfect.</div>
 </td>
 </tr>
+<tr class="altColor">
+<td class="colFirst"><code>static int</code></td>
+<td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_PROGRESSIVE">OPT_PROGRESSIVE</a></strong></code>
+<div class="block">This option will enable progressive entropy coding in the output image
+ generated by this particular transform.</div>
+</td>
+</tr>
 <tr class="rowColor">
 <td class="colFirst"><code>static int</code></td>
 <td class="colLast"><code><strong><a href="../../../org/libjpegturbo/turbojpeg/TJTransform.html#OPT_TRIM">OPT_TRIM</a></strong></code>
@@ -548,6 +566,33 @@
 <dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJTransform.OPT_NOOUTPUT">Constant Field Values</a></dd></dl>
 </li>
 </ul>
+<a name="OPT_PROGRESSIVE">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>OPT_PROGRESSIVE</h4>
+<pre>public static final&nbsp;int OPT_PROGRESSIVE</pre>
+<div class="block">This option will enable progressive entropy coding in the output image
+ generated by this particular transform.  Progressive entropy coding will
+ generally improve compression relative to baseline entropy coding (the
+ default), but it will reduce compression and decompression performance
+ considerably.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJTransform.OPT_PROGRESSIVE">Constant Field Values</a></dd></dl>
+</li>
+</ul>
+<a name="OPT_COPYNONE">
+<!--   -->
+</a>
+<ul class="blockList">
+<li class="blockList">
+<h4>OPT_COPYNONE</h4>
+<pre>public static final&nbsp;int OPT_COPYNONE</pre>
+<div class="block">This option will prevent <a href="../../../org/libjpegturbo/turbojpeg/TJTransformer.html#transform(byte[][],%20org.libjpegturbo.turbojpeg.TJTransform[],%20int)"><code>TJTransformer.transform()</code></a> from copying any extra markers (including EXIF
+ and ICC profile data) from the source image to the output image.</div>
+<dl><dt><span class="strong">See Also:</span></dt><dd><a href="../../../constant-values.html#org.libjpegturbo.turbojpeg.TJTransform.OPT_COPYNONE">Constant Field Values</a></dd></dl>
+</li>
+</ul>
 <a name="op">
 <!--   -->
 </a>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
index 36cbdb1..a30fe30 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="TJTransformer";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="TJTransformer";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html b/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
index b2be0a0..d4485ed 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="YUVImage";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="YUVImage";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/package-summary.html b/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
index f94656e..dedcce5 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/package-summary.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="org.libjpegturbo.turbojpeg";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="org.libjpegturbo.turbojpeg";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/org/libjpegturbo/turbojpeg/package-tree.html b/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
index 02a5cde..5f0f8c3 100644
--- a/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
+++ b/java/doc/org/libjpegturbo/turbojpeg/package-tree.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="org.libjpegturbo.turbojpeg Class Hierarchy";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="org.libjpegturbo.turbojpeg Class Hierarchy";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/overview-tree.html b/java/doc/overview-tree.html
index 2ae76c6..b659995 100644
--- a/java/doc/overview-tree.html
+++ b/java/doc/overview-tree.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="Class Hierarchy";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="Class Hierarchy";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
diff --git a/java/doc/script.js b/java/doc/script.js
new file mode 100644
index 0000000..b346356
--- /dev/null
+++ b/java/doc/script.js
@@ -0,0 +1,30 @@
+function show(type)
+{
+    count = 0;
+    for (var key in methods) {
+        var row = document.getElementById(key);
+        if ((methods[key] &  type) != 0) {
+            row.style.display = '';
+            row.className = (count++ % 2) ? rowColor : altColor;
+        }
+        else
+            row.style.display = 'none';
+    }
+    updateTabs(type);
+}
+
+function updateTabs(type)
+{
+    for (var value in tabs) {
+        var sNode = document.getElementById(tabs[value][0]);
+        var spanNode = sNode.firstChild;
+        if (value == type) {
+            sNode.className = activeTableTab;
+            spanNode.innerHTML = tabs[value][1];
+        }
+        else {
+            sNode.className = tableTab;
+            spanNode.innerHTML = "<a href=\"javascript:show("+ value + ");\">" + tabs[value][1] + "</a>";
+        }
+    }
+}
diff --git a/java/doc/serialized-form.html b/java/doc/serialized-form.html
index 846cabc..45bbc86 100644
--- a/java/doc/serialized-form.html
+++ b/java/doc/serialized-form.html
@@ -7,8 +7,12 @@
 </head>
 <body>
 <script type="text/javascript"><!--
-    if (location.href.indexOf('is-external=true') == -1) {
-        parent.document.title="Serialized Form";
+    try {
+        if (location.href.indexOf('is-external=true') == -1) {
+            parent.document.title="Serialized Form";
+        }
+    }
+    catch(err) {
     }
 //-->
 </script>
@@ -74,6 +78,19 @@
 <dt>serialVersionUID:</dt>
 <dd>1L</dd>
 </dl>
+<ul class="blockList">
+<li class="blockList"><a name="serializedForm">
+<!--   -->
+</a>
+<h3>Serialized Fields</h3>
+<ul class="blockList">
+<li class="blockListLast">
+<h4>errorCode</h4>
+<pre>int errorCode</pre>
+</li>
+</ul>
+</li>
+</ul>
 </li>
 <li class="blockList"><a name="org.libjpegturbo.turbojpeg.TJTransform">
 <!--   -->
diff --git a/java/org/libjpegturbo/turbojpeg/TJ.java b/java/org/libjpegturbo/turbojpeg/TJ.java
index 02d14c0..19ec05e 100644
--- a/java/org/libjpegturbo/turbojpeg/TJ.java
+++ b/java/org/libjpegturbo/turbojpeg/TJ.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2013, 2017 D. R. Commander.  All Rights Reserved.
  * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -34,7 +34,6 @@
  */
 public final class TJ {
 
-
   /**
    * The number of chrominance subsampling options
    */
@@ -235,7 +234,8 @@
    *
    * @param pixelFormat the pixel format (one of <code>PF_*</code>)
    *
-   * @return the red offset for the given pixel format.
+   * @return the red offset for the given pixel format, or -1 if the pixel
+   * format does not have a red component.
    */
   public static int getRedOffset(int pixelFormat) {
     checkPixelFormat(pixelFormat);
@@ -243,7 +243,7 @@
   }
 
   private static final int[] redOffset = {
-    0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1
+    0, 2, 0, 2, 3, 1, -1, 0, 2, 3, 1, -1
   };
 
 
@@ -256,7 +256,8 @@
    *
    * @param pixelFormat the pixel format (one of <code>PF_*</code>)
    *
-   * @return the green offset for the given pixel format.
+   * @return the green offset for the given pixel format, or -1 if the pixel
+   * format does not have a green component.
    */
   public static int getGreenOffset(int pixelFormat) {
     checkPixelFormat(pixelFormat);
@@ -264,7 +265,7 @@
   }
 
   private static final int[] greenOffset = {
-    1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1
+    1, 1, 1, 1, 2, 2, -1, 1, 1, 2, 2, -1
   };
 
 
@@ -277,7 +278,8 @@
    *
    * @param pixelFormat the pixel format (one of <code>PF_*</code>)
    *
-   * @return the blue offset for the given pixel format.
+   * @return the blue offset for the given pixel format, or -1 if the pixel
+   * format does not have a blue component.
    */
   public static int getBlueOffset(int pixelFormat) {
     checkPixelFormat(pixelFormat);
@@ -285,7 +287,29 @@
   }
 
   private static final int[] blueOffset = {
-    2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1
+    2, 0, 2, 0, 1, 3, -1, 2, 0, 1, 3, -1
+  };
+
+
+  /**
+   * For the given pixel format, returns the number of bytes that the alpha
+   * component is offset from the start of the pixel.  For instance, if a pixel
+   * of format <code>TJ.PF_BGRA</code> is stored in <code>char pixel[]</code>,
+   * then the alpha component will be
+   * <code>pixel[TJ.getAlphaOffset(TJ.PF_BGRA)]</code>.
+   *
+   * @param pixelFormat the pixel format (one of <code>PF_*</code>)
+   *
+   * @return the alpha offset for the given pixel format, or -1 if the pixel
+   * format does not have an alpha component.
+   */
+  public static int getAlphaOffset(int pixelFormat) {
+    checkPixelFormat(pixelFormat);
+    return alphaOffset[pixelFormat];
+  }
+
+  private static final int[] alphaOffset = {
+    -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
   };
 
 
@@ -348,16 +372,16 @@
    * The uncompressed source/destination image is stored in bottom-up (Windows,
    * OpenGL) order, not top-down (X11) order.
    */
-  public static final int FLAG_BOTTOMUP     = 2;
+  public static final int FLAG_BOTTOMUP      = 2;
 
   @Deprecated
-  public static final int FLAG_FORCEMMX     = 8;
+  public static final int FLAG_FORCEMMX      = 8;
   @Deprecated
-  public static final int FLAG_FORCESSE     = 16;
+  public static final int FLAG_FORCESSE      = 16;
   @Deprecated
-  public static final int FLAG_FORCESSE2    = 32;
+  public static final int FLAG_FORCESSE2     = 32;
   @Deprecated
-  public static final int FLAG_FORCESSE3    = 128;
+  public static final int FLAG_FORCESSE3     = 128;
 
   /**
    * When decompressing an image that was compressed using chrominance
@@ -366,7 +390,7 @@
    * creates a smooth transition between neighboring chrominance components in
    * order to reduce upsampling artifacts in the decompressed image.
    */
-  public static final int FLAG_FASTUPSAMPLE = 256;
+  public static final int FLAG_FASTUPSAMPLE  = 256;
   /**
    * Use the fastest DCT/IDCT algorithm available in the underlying codec.  The
    * default if this flag is not specified is implementation-specific.  For
@@ -375,7 +399,7 @@
    * only a very slight effect on accuracy, but it uses the accurate algorithm
    * when decompressing, because this has been shown to have a larger effect.
    */
-  public static final int FLAG_FASTDCT      =  2048;
+  public static final int FLAG_FASTDCT       = 2048;
   /**
    * Use the most accurate DCT/IDCT algorithm available in the underlying
    * codec.  The default if this flag is not specified is
@@ -385,7 +409,46 @@
    * but it uses the accurate algorithm when decompressing, because this has
    * been shown to have a larger effect.
    */
-  public static final int FLAG_ACCURATEDCT  =  4096;
+  public static final int FLAG_ACCURATEDCT   = 4096;
+  /**
+   * Immediately discontinue the current compression/decompression/transform
+   * operation if the underlying codec throws a warning (non-fatal error).  The
+   * default behavior is to allow the operation to complete unless a fatal
+   * error is encountered.
+   * <p>
+   * NOTE: due to the design of the TurboJPEG Java API, only certain methods
+   * (specifically, {@link TJDecompressor TJDecompressor.decompress*()} methods
+   * with a void return type) will complete and leave the output image in a
+   * fully recoverable state after a non-fatal error occurs.
+   */
+  public static final int FLAG_STOPONWARNING = 8192;
+  /**
+   * Use progressive entropy coding in JPEG images generated by compression and
+   * transform operations.  Progressive entropy coding will generally improve
+   * compression relative to baseline entropy coding (the default), but it will
+   * reduce compression and decompression performance considerably.
+   */
+  public static final int FLAG_PROGRESSIVE   = 16384;
+
+
+  /**
+   * The number of error codes
+   */
+  public static final int NUMERR = 2;
+  /**
+   * The error was non-fatal and recoverable, but the image may still be
+   * corrupt.
+   * <p>
+   * NOTE: due to the design of the TurboJPEG Java API, only certain methods
+   * (specifically, {@link TJDecompressor TJDecompressor.decompress*()} methods
+   * with a void return type) will complete and leave the output image in a
+   * fully recoverable state after a non-fatal error occurs.
+   */
+  public static final int ERR_WARNING = 0;
+  /**
+   * The error was fatal and non-recoverable.
+   */
+  public static final int ERR_FATAL = 1;
 
 
   /**
diff --git a/java/org/libjpegturbo/turbojpeg/TJCompressor.java b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
index 2ff8e4d..ed746b5 100644
--- a/java/org/libjpegturbo/turbojpeg/TJCompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJCompressor.java
@@ -199,7 +199,7 @@
       throw new IllegalArgumentException("Invalid argument in setSourceImage()");
     srcX = x;
     srcY = y;
-    srcWidth = (width == 0) ? srcImage.getWidth(): width;
+    srcWidth = (width == 0) ? srcImage.getWidth() : width;
     srcHeight = (height == 0) ? srcImage.getHeight() : height;
     if (x + width > srcImage.getWidth() || y + height > srcImage.getHeight())
       throw new IllegalArgumentException("Compression region exceeds the bounds of the source image");
@@ -208,30 +208,30 @@
     boolean intPixels = false;
     if (byteOrder == null)
       byteOrder = ByteOrder.nativeOrder();
-    switch(srcImage.getType()) {
-      case BufferedImage.TYPE_3BYTE_BGR:
-        pixelFormat = TJ.PF_BGR;  break;
-      case BufferedImage.TYPE_4BYTE_ABGR:
-      case BufferedImage.TYPE_4BYTE_ABGR_PRE:
-        pixelFormat = TJ.PF_XBGR;  break;
-      case BufferedImage.TYPE_BYTE_GRAY:
-        pixelFormat = TJ.PF_GRAY;  break;
-      case BufferedImage.TYPE_INT_BGR:
-        if (byteOrder == ByteOrder.BIG_ENDIAN)
-          pixelFormat = TJ.PF_XBGR;
-        else
-          pixelFormat = TJ.PF_RGBX;
-        intPixels = true;  break;
-      case BufferedImage.TYPE_INT_RGB:
-      case BufferedImage.TYPE_INT_ARGB:
-      case BufferedImage.TYPE_INT_ARGB_PRE:
-        if (byteOrder == ByteOrder.BIG_ENDIAN)
-          pixelFormat = TJ.PF_XRGB;
-        else
-          pixelFormat = TJ.PF_BGRX;
-        intPixels = true;  break;
-      default:
-        throw new IllegalArgumentException("Unsupported BufferedImage format");
+    switch (srcImage.getType()) {
+    case BufferedImage.TYPE_3BYTE_BGR:
+      pixelFormat = TJ.PF_BGR;  break;
+    case BufferedImage.TYPE_4BYTE_ABGR:
+    case BufferedImage.TYPE_4BYTE_ABGR_PRE:
+      pixelFormat = TJ.PF_XBGR;  break;
+    case BufferedImage.TYPE_BYTE_GRAY:
+      pixelFormat = TJ.PF_GRAY;  break;
+    case BufferedImage.TYPE_INT_BGR:
+      if (byteOrder == ByteOrder.BIG_ENDIAN)
+        pixelFormat = TJ.PF_XBGR;
+      else
+        pixelFormat = TJ.PF_RGBX;
+      intPixels = true;  break;
+    case BufferedImage.TYPE_INT_RGB:
+    case BufferedImage.TYPE_INT_ARGB:
+    case BufferedImage.TYPE_INT_ARGB_PRE:
+      if (byteOrder == ByteOrder.BIG_ENDIAN)
+        pixelFormat = TJ.PF_XRGB;
+      else
+        pixelFormat = TJ.PF_BGRX;
+      intPixels = true;  break;
+    default:
+      throw new IllegalArgumentException("Unsupported BufferedImage format");
     }
     srcPixelFormat = pixelFormat;
 
@@ -447,7 +447,7 @@
    */
   @Deprecated
   public void encodeYUV(byte[] dstBuf, int flags) throws TJException {
-    if(dstBuf == null)
+    if (dstBuf == null)
       throw new IllegalArgumentException("Invalid argument in encodeYUV()");
     checkSourceImage();
     checkSubsampling();
@@ -475,7 +475,7 @@
   public YUVImage encodeYUV(int pad, int flags) throws TJException {
     checkSourceImage();
     checkSubsampling();
-    if(pad < 1 || ((pad & (pad - 1)) != 0))
+    if (pad < 1 || ((pad & (pad - 1)) != 0))
       throw new IllegalStateException("Invalid argument in encodeYUV()");
     YUVImage yuvImage = new YUVImage(srcWidth, pad, srcHeight, subsamp);
     encodeYUV(yuvImage, flags);
@@ -571,7 +571,7 @@
   protected void finalize() throws Throwable {
     try {
       close();
-    } catch(TJException e) {
+    } catch (TJException e) {
     } finally {
       super.finalize();
     }
diff --git a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
index bd0e694..8af868b 100644
--- a/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
+++ b/java/org/libjpegturbo/turbojpeg/TJDecompressor.java
@@ -308,6 +308,10 @@
    * Decompress the JPEG source image or decode the YUV source image associated
    * with this decompressor instance and output a grayscale, RGB, or CMYK image
    * to the given destination buffer.
+   * <p>
+   * NOTE: The output image is fully recoverable if this method throws a
+   * non-fatal {@link TJException} (unless
+   * {@link TJ#FLAG_STOPONWARNING TJ.FLAG_STOPONWARNING} is specified).
    *
    * @param dstBuf buffer that will receive the decompressed/decoded image.
    * If the source image is a JPEG image, then this buffer should normally be
@@ -451,6 +455,10 @@
    * but leaves out the color conversion step, so a planar YUV image is
    * generated instead of an RGB or grayscale image.  This method cannot be
    * used to decompress JPEG source images with the CMYK or YCCK colorspace.
+   * <p>
+   * NOTE: The YUV planar output image is fully recoverable if this method
+   * throws a non-fatal {@link TJException} (unless
+   * {@link TJ#FLAG_STOPONWARNING TJ.FLAG_STOPONWARNING} is specified).
    *
    * @param dstImage {@link YUVImage} instance that will receive the YUV planar
    * image.  The level of subsampling specified in this <code>YUVImage</code>
@@ -618,6 +626,10 @@
    * Decompress the JPEG source image or decode the YUV source image associated
    * with this decompressor instance and output a grayscale, RGB, or CMYK image
    * to the given destination buffer.
+   * <p>
+   * NOTE: The output image is fully recoverable if this method throws a
+   * non-fatal {@link TJException} (unless
+   * {@link TJ#FLAG_STOPONWARNING TJ.FLAG_STOPONWARNING} is specified).
    *
    * @param dstBuf buffer that will receive the decompressed/decoded image.
    * If the source image is a JPEG image, then this buffer should normally be
@@ -699,6 +711,10 @@
    * Decompress the JPEG source image or decode the YUV source image associated
    * with this decompressor instance and output a decompressed/decoded image to
    * the given <code>BufferedImage</code> instance.
+   * <p>
+   * NOTE: The output image is fully recoverable if this method throws a
+   * non-fatal {@link TJException} (unless
+   * {@link TJ#FLAG_STOPONWARNING TJ.FLAG_STOPONWARNING} is specified).
    *
    * @param dstImage a <code>BufferedImage</code> instance that will receive
    * the decompressed/decoded image.  If the source image is a JPEG image, then
@@ -734,35 +750,35 @@
     int pixelFormat;  boolean intPixels = false;
     if (byteOrder == null)
       byteOrder = ByteOrder.nativeOrder();
-    switch(dstImage.getType()) {
-      case BufferedImage.TYPE_3BYTE_BGR:
-        pixelFormat = TJ.PF_BGR;  break;
-      case BufferedImage.TYPE_4BYTE_ABGR:
-      case BufferedImage.TYPE_4BYTE_ABGR_PRE:
-        pixelFormat = TJ.PF_XBGR;  break;
-      case BufferedImage.TYPE_BYTE_GRAY:
-        pixelFormat = TJ.PF_GRAY;  break;
-      case BufferedImage.TYPE_INT_BGR:
-        if (byteOrder == ByteOrder.BIG_ENDIAN)
-          pixelFormat = TJ.PF_XBGR;
-        else
-          pixelFormat = TJ.PF_RGBX;
-        intPixels = true;  break;
-      case BufferedImage.TYPE_INT_RGB:
-        if (byteOrder == ByteOrder.BIG_ENDIAN)
-          pixelFormat = TJ.PF_XRGB;
-        else
-          pixelFormat = TJ.PF_BGRX;
-        intPixels = true;  break;
-      case BufferedImage.TYPE_INT_ARGB:
-      case BufferedImage.TYPE_INT_ARGB_PRE:
-        if (byteOrder == ByteOrder.BIG_ENDIAN)
-          pixelFormat = TJ.PF_ARGB;
-        else
-          pixelFormat = TJ.PF_BGRA;
-        intPixels = true;  break;
-      default:
-        throw new IllegalArgumentException("Unsupported BufferedImage format");
+    switch (dstImage.getType()) {
+    case BufferedImage.TYPE_3BYTE_BGR:
+      pixelFormat = TJ.PF_BGR;  break;
+    case BufferedImage.TYPE_4BYTE_ABGR:
+    case BufferedImage.TYPE_4BYTE_ABGR_PRE:
+      pixelFormat = TJ.PF_XBGR;  break;
+    case BufferedImage.TYPE_BYTE_GRAY:
+      pixelFormat = TJ.PF_GRAY;  break;
+    case BufferedImage.TYPE_INT_BGR:
+      if (byteOrder == ByteOrder.BIG_ENDIAN)
+        pixelFormat = TJ.PF_XBGR;
+      else
+        pixelFormat = TJ.PF_RGBX;
+      intPixels = true;  break;
+    case BufferedImage.TYPE_INT_RGB:
+      if (byteOrder == ByteOrder.BIG_ENDIAN)
+        pixelFormat = TJ.PF_XRGB;
+      else
+        pixelFormat = TJ.PF_BGRX;
+      intPixels = true;  break;
+    case BufferedImage.TYPE_INT_ARGB:
+    case BufferedImage.TYPE_INT_ARGB_PRE:
+      if (byteOrder == ByteOrder.BIG_ENDIAN)
+        pixelFormat = TJ.PF_ARGB;
+      else
+        pixelFormat = TJ.PF_BGRA;
+      intPixels = true;  break;
+    default:
+      throw new IllegalArgumentException("Unsupported BufferedImage format");
     }
     WritableRaster wr = dstImage.getRaster();
     if (intPixels) {
@@ -846,7 +862,7 @@
   protected void finalize() throws Throwable {
     try {
       close();
-    } catch(TJException e) {
+    } catch (TJException e) {
     } finally {
       super.finalize();
     }
diff --git a/java/org/libjpegturbo/turbojpeg/TJException.java b/java/org/libjpegturbo/turbojpeg/TJException.java
index 59c2041..97659d4 100644
--- a/java/org/libjpegturbo/turbojpeg/TJException.java
+++ b/java/org/libjpegturbo/turbojpeg/TJException.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
+ * Copyright (C)2017 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,8 +47,26 @@
     super(message);
   }
 
+  public TJException(String message, int code) {
+    super(message);
+    if (code >= 0 && code < TJ.NUMERR)  // out-of-range codes keep ERR_FATAL
+      errorCode = code;
+  }
+
   public TJException(Throwable cause) {
     super(cause);
   }
 
+  /**
+   * Returns a code (one of {@link TJ TJ.ERR_*}) indicating the severity of the
+   * last error.
+   *
+   * @return a code (one of {@link TJ TJ.ERR_*}) indicating the severity of the
+   * last error.
+   */
+  public int getErrorCode() {
+    return errorCode;
+  }
+
+  private int errorCode = TJ.ERR_FATAL;
 }
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
similarity index 77%
rename from java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
rename to java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
index 5ef3118..65884e8 100644
--- a/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl
+++ b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2013 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2013, 2016 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,19 +36,19 @@
       String os = System.getProperty("os.name").toLowerCase();
       if (os.indexOf("mac") >= 0) {
         try {
-          System.load("%{__libdir}/libturbojpeg.jnilib");
+          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.jnilib");
         } catch (java.lang.UnsatisfiedLinkError e2) {
           System.load("/usr/lib/libturbojpeg.jnilib");
         }
       } else {
         try {
-          System.load("%{__libdir}/libturbojpeg.so");
+          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.so");
         } catch (java.lang.UnsatisfiedLinkError e3) {
-          String libdir = "%{__libdir}";
-          if (libdir.equals("/opt/libjpeg-turbo/lib64")) {
-            System.load("/opt/libjpeg-turbo/lib32/libturbojpeg.so");
-          } else if (libdir.equals("/opt/libjpeg-turbo/lib32")) {
-            System.load("/opt/libjpeg-turbo/lib64/libturbojpeg.so");
+          String libdir = "@CMAKE_INSTALL_FULL_LIBDIR@";
+          if (libdir.equals("@CMAKE_INSTALL_DEFAULT_PREFIX@/lib64")) {
+            System.load("@CMAKE_INSTALL_DEFAULT_PREFIX@/lib32/libturbojpeg.so");
+          } else if (libdir.equals("@CMAKE_INSTALL_DEFAULT_PREFIX@/lib32")) {
+            System.load("@CMAKE_INSTALL_DEFAULT_PREFIX@/lib64/libturbojpeg.so");
           } else {
             throw e3;
           }
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader.java.in b/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in
similarity index 100%
rename from java/org/libjpegturbo/turbojpeg/TJLoader.java.in
rename to java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in
diff --git a/java/org/libjpegturbo/turbojpeg/TJTransform.java b/java/org/libjpegturbo/turbojpeg/TJTransform.java
index 7381f36..1b10d3d 100644
--- a/java/org/libjpegturbo/turbojpeg/TJTransform.java
+++ b/java/org/libjpegturbo/turbojpeg/TJTransform.java
@@ -103,21 +103,21 @@
    * partial MCU blocks that cannot be transformed will be left in place, which
    * will create odd-looking strips on the right or bottom edge of the image.
    */
-  public static final int OPT_PERFECT  = 1;
+  public static final int OPT_PERFECT     = 1;
   /**
    * This option will discard any partial MCU blocks that cannot be
    * transformed.
    */
-  public static final int OPT_TRIM     = 2;
+  public static final int OPT_TRIM        = 2;
   /**
    * This option will enable lossless cropping.
    */
-  public static final int OPT_CROP     = 4;
+  public static final int OPT_CROP        = 4;
   /**
    * This option will discard the color data in the input image and produce
    * a grayscale output image.
    */
-  public static final int OPT_GRAY     = 8;
+  public static final int OPT_GRAY        = 8;
   /**
    * This option will prevent {@link TJTransformer#transform
    * TJTransformer.transform()} from outputting a JPEG image for this
@@ -125,7 +125,21 @@
    * filter to capture the transformed DCT coefficients without transcoding
    * them.
    */
-  public static final int OPT_NOOUTPUT = 16;
+  public static final int OPT_NOOUTPUT    = 16;
+  /**
+   * This option will enable progressive entropy coding in the output image
+   * generated by this particular transform.  Progressive entropy coding will
+   * generally improve compression relative to baseline entropy coding (the
+   * default), but it will reduce compression and decompression performance
+   * considerably.
+   */
+  public static final int OPT_PROGRESSIVE = 32;
+  /**
+   * This option will prevent {@link TJTransformer#transform
+   * TJTransformer.transform()} from copying any extra markers (including EXIF
+   * and ICC profile data) from the source image to the output image.
+   */
+  public static final int OPT_COPYNONE    = 64;
 
 
   /**
diff --git a/java/org/libjpegturbo/turbojpeg/YUVImage.java b/java/org/libjpegturbo/turbojpeg/YUVImage.java
index d123e37..0777c13 100644
--- a/java/org/libjpegturbo/turbojpeg/YUVImage.java
+++ b/java/org/libjpegturbo/turbojpeg/YUVImage.java
@@ -208,12 +208,12 @@
    * @param subsamp the level of chrominance subsampling used in the YUV
    * image (one of {@link TJ#SAMP_444 TJ.SAMP_*})
    */
-  public void setBuf(byte[][] planes, int[] offsets, int width, int strides[],
+  public void setBuf(byte[][] planes, int[] offsets, int width, int[] strides,
                      int height, int subsamp) {
     setBuf(planes, offsets, width, strides, height, subsamp, false);
   }
 
-  private void setBuf(byte[][] planes, int[] offsets, int width, int strides[],
+  private void setBuf(byte[][] planes, int[] offsets, int width, int[] strides,
                      int height, int subsamp, boolean alloc) {
     if ((planes == null && !alloc) || width < 1 || height < 1 || subsamp < 0 ||
         subsamp >= TJ.NUMSAMP)
@@ -428,7 +428,7 @@
     return TJ.bufSizeYUV(yuvWidth, yuvPad, yuvHeight, yuvSubsamp);
   }
 
-  private static final int PAD(int v, int p) {
+  private static int PAD(int v, int p) {
     return (v + p - 1) & (~(p - 1));
   }
 
diff --git a/jcapimin.c b/jcapimin.c
index 15674be..178c55b 100644
--- a/jcapimin.c
+++ b/jcapimin.c
@@ -31,7 +31,7 @@
  */
 
 GLOBAL(void)
-jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
 {
   int i;
 
@@ -41,7 +41,7 @@
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
   if (structsize != sizeof(struct jpeg_compress_struct))
     ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
-             (int) sizeof(struct jpeg_compress_struct), (int) structsize);
+             (int)sizeof(struct jpeg_compress_struct), (int)structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -59,7 +59,7 @@
   cinfo->is_decompressor = FALSE;
 
   /* Initialize a memory manager instance for this object */
-  jinit_memory_mgr((j_common_ptr) cinfo);
+  jinit_memory_mgr((j_common_ptr)cinfo);
 
   /* Zero out pointers to permanent structures. */
   cinfo->progress = NULL;
@@ -83,7 +83,7 @@
   /* Must do it here for emit_dqt in case jpeg_write_tables is used */
   cinfo->block_size = DCTSIZE;
   cinfo->natural_order = jpeg_natural_order;
-  cinfo->lim_Se = DCTSIZE2-1;
+  cinfo->lim_Se = DCTSIZE2 - 1;
 #endif
 
   cinfo->script_space = NULL;
@@ -100,9 +100,9 @@
  */
 
 GLOBAL(void)
-jpeg_destroy_compress (j_compress_ptr cinfo)
+jpeg_destroy_compress(j_compress_ptr cinfo)
 {
-  jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+  jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -112,9 +112,9 @@
  */
 
 GLOBAL(void)
-jpeg_abort_compress (j_compress_ptr cinfo)
+jpeg_abort_compress(j_compress_ptr cinfo)
 {
-  jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+  jpeg_abort((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -131,7 +131,7 @@
  */
 
 GLOBAL(void)
-jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
+jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress)
 {
   int i;
   JQUANT_TBL *qtbl;
@@ -159,7 +159,7 @@
  */
 
 GLOBAL(void)
-jpeg_finish_compress (j_compress_ptr cinfo)
+jpeg_finish_compress(j_compress_ptr cinfo)
 {
   JDIMENSION iMCU_row;
 
@@ -172,18 +172,18 @@
   } else if (cinfo->global_state != CSTATE_WRCOEFS)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   /* Perform any remaining passes */
-  while (! cinfo->master->is_last_pass) {
+  while (!cinfo->master->is_last_pass) {
     (*cinfo->master->prepare_for_pass) (cinfo);
     for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) {
       if (cinfo->progress != NULL) {
-        cinfo->progress->pass_counter = (long) iMCU_row;
-        cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long)iMCU_row;
+        cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       }
       /* We bypass the main controller and invoke coef controller directly;
        * all work is being done from the coefficient buffer.
        */
-      if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL))
+      if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
         ERREXIT(cinfo, JERR_CANT_SUSPEND);
     }
     (*cinfo->master->finish_pass) (cinfo);
@@ -192,7 +192,7 @@
   (*cinfo->marker->write_file_trailer) (cinfo);
   (*cinfo->dest->term_destination) (cinfo);
   /* We can use jpeg_abort to release memory and reset global_state */
-  jpeg_abort((j_common_ptr) cinfo);
+  jpeg_abort((j_common_ptr)cinfo);
 }
 
 
@@ -204,8 +204,8 @@
  */
 
 GLOBAL(void)
-jpeg_write_marker (j_compress_ptr cinfo, int marker,
-                   const JOCTET *dataptr, unsigned int datalen)
+jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                  unsigned int datalen)
 {
   void (*write_marker_byte) (j_compress_ptr info, int val);
 
@@ -226,7 +226,7 @@
 /* Same, but piecemeal. */
 
 GLOBAL(void)
-jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+jpeg_write_m_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
 {
   if (cinfo->next_scanline != 0 ||
       (cinfo->global_state != CSTATE_SCANNING &&
@@ -238,7 +238,7 @@
 }
 
 GLOBAL(void)
-jpeg_write_m_byte (j_compress_ptr cinfo, int val)
+jpeg_write_m_byte(j_compress_ptr cinfo, int val)
 {
   (*cinfo->marker->write_marker_byte) (cinfo, val);
 }
@@ -266,13 +266,13 @@
  */
 
 GLOBAL(void)
-jpeg_write_tables (j_compress_ptr cinfo)
+jpeg_write_tables(j_compress_ptr cinfo)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Initialize the marker writer ... bit of a crock to do it here. */
   jinit_marker_writer(cinfo);
diff --git a/jcapistd.c b/jcapistd.c
index 5c6d0be..aa2aad9 100644
--- a/jcapistd.c
+++ b/jcapistd.c
@@ -36,7 +36,7 @@
  */
 
 GLOBAL(void)
-jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
+jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -45,7 +45,7 @@
     jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
 
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Perform master selection of active modules */
   jinit_compress_master(cinfo);
@@ -75,8 +75,8 @@
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
-                      JDIMENSION num_lines)
+jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                     JDIMENSION num_lines)
 {
   JDIMENSION row_ctr, rows_left;
 
@@ -87,9 +87,9 @@
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->next_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->image_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Give master control module another chance if this is first call to
@@ -118,8 +118,8 @@
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
-                     JDIMENSION num_lines)
+jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                    JDIMENSION num_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
@@ -132,9 +132,9 @@
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->next_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->image_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Give master control module another chance if this is first call to
@@ -151,7 +151,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Directly compress the row. */
-  if (! (*cinfo->coef->compress_data) (cinfo, data)) {
+  if (!(*cinfo->coef->compress_data) (cinfo, data)) {
     /* If compressor did not consume the whole row, suspend processing. */
     return 0;
   }
diff --git a/jcarith.c b/jcarith.c
index 6d3b8af..291771a 100644
--- a/jcarith.c
+++ b/jcarith.c
@@ -105,25 +105,25 @@
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define ISHIFT_TEMPS    int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-        ((ishift_temp = (x)) < 0 ? \
-         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-         (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
 
 LOCAL(void)
-emit_byte (int val, j_compress_ptr cinfo)
+emit_byte(int val, j_compress_ptr cinfo)
 /* Write next output byte; we do not support suspension in this module. */
 {
   struct jpeg_destination_mgr *dest = cinfo->dest;
 
-  *dest->next_output_byte++ = (JOCTET) val;
+  *dest->next_output_byte++ = (JOCTET)val;
   if (--dest->free_in_buffer == 0)
-    if (! (*dest->empty_output_buffer) (cinfo))
+    if (!(*dest->empty_output_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
 }
 
@@ -133,9 +133,9 @@
  */
 
 METHODDEF(void)
-finish_pass (j_compress_ptr cinfo)
+finish_pass(j_compress_ptr cinfo)
 {
-  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   JLONG temp;
 
   /* Section D.1.8: Termination of encoding */
@@ -219,9 +219,9 @@
  */
 
 LOCAL(void)
-arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
+arith_encode(j_compress_ptr cinfo, unsigned char *st, int val)
 {
-  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   register unsigned char nl, nm;
   register JLONG qe, temp;
   register int sv;
@@ -231,8 +231,8 @@
    */
   sv = *st;
   qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
-  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
-  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
+  nl = qe & 0xFF;  qe >>= 8;    /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF;  qe >>= 8;    /* Next_Index_MPS */
 
   /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
   e->a -= qe;
@@ -319,9 +319,9 @@
  */
 
 LOCAL(void)
-emit_restart (j_compress_ptr cinfo, int restart_num)
+emit_restart(j_compress_ptr cinfo, int restart_num)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci;
   jpeg_component_info *compptr;
 
@@ -362,9 +362,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl;
@@ -391,7 +391,7 @@
     /* Compute the DC value after the required point transform by Al.
      * This is simply an arithmetic right shift.
      */
-    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+    m = IRIGHT_SHIFT((int)((*block)[0]), cinfo->Al);
 
     /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
 
@@ -432,9 +432,9 @@
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;    /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
@@ -453,9 +453,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, k, ke;
@@ -510,7 +510,7 @@
           break;
         }
       }
-      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
     }
     st += 2;
     /* Figure F.8: Encoding the magnitude category of v */
@@ -552,9 +552,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   unsigned char *st;
   int Al, blkn;
 
@@ -587,9 +587,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, k, ke, kex;
@@ -662,7 +662,7 @@
           break;
         }
       }
-      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
     }
   }
   /* Encode EOB decision only if k <= cinfo->Se */
@@ -680,9 +680,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
@@ -747,9 +747,9 @@
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;    /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
@@ -770,7 +770,7 @@
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
       arith_encode(cinfo, st, 0);       /* EOB decision */
       while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
-        arith_encode(cinfo, st + 1, 0); st += 3; k++;
+        arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
       }
       arith_encode(cinfo, st + 1, 1);
       /* Figure F.6: Encoding nonzero value v */
@@ -822,9 +822,9 @@
  */
 
 METHODDEF(void)
-start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci, tbl;
   jpeg_component_info *compptr;
 
@@ -862,8 +862,8 @@
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
       MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
@@ -875,13 +875,14 @@
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
       MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
 #ifdef CALCULATE_SPECTRAL_CONDITIONING
       if (cinfo->progressive_mode)
         /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
-        cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+        cinfo->arith_ac_K[tbl] = cinfo->Ss +
+                                 ((8 + cinfo->Se - cinfo->Ss) >> 4);
 #endif
     }
   }
@@ -905,15 +906,15 @@
  */
 
 GLOBAL(void)
-jinit_arith_encoder (j_compress_ptr cinfo)
+jinit_arith_encoder(j_compress_ptr cinfo)
 {
   arith_entropy_ptr entropy;
   int i;
 
   entropy = (arith_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(arith_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass;
   entropy->pub.finish_pass = finish_pass;
 
diff --git a/jccoefct.c b/jccoefct.c
index a08d6e3..068232a 100644
--- a/jccoefct.c
+++ b/jccoefct.c
@@ -58,21 +58,19 @@
 
 
 /* Forward declarations */
-METHODDEF(boolean) compress_data
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #ifdef FULL_COEF_BUFFER_SUPPORTED
-METHODDEF(boolean) compress_first_pass
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
-METHODDEF(boolean) compress_output
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+                                       JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #endif
 
 
 LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -81,7 +79,7 @@
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -97,9 +95,9 @@
  */
 
 METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   coef->iMCU_row_num = 0;
   start_iMCU_row(cinfo);
@@ -140,9 +138,9 @@
  */
 
 METHODDEF(boolean)
-compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -167,31 +165,33 @@
       blkn = 0;
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
         compptr = cinfo->cur_comp_info[ci];
-        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                : compptr->last_col_width;
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+                                                  compptr->last_col_width;
         xpos = MCU_col_num * compptr->MCU_sample_width;
         ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
           if (coef->iMCU_row_num < last_iMCU_row ||
-              yoffset+yindex < compptr->last_row_height) {
+              yoffset + yindex < compptr->last_row_height) {
             (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                          input_buf[compptr->component_index],
                                          coef->MCU_buffer[blkn],
-                                         ypos, xpos, (JDIMENSION) blockcnt);
+                                         ypos, xpos, (JDIMENSION)blockcnt);
             if (blockcnt < compptr->MCU_width) {
               /* Create some dummy blocks at the right edge of the image. */
-              jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
+              jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
                         (compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
               for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
-                coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
+                coef->MCU_buffer[blkn + bi][0][0] =
+                  coef->MCU_buffer[blkn + bi - 1][0][0];
               }
             }
           } else {
             /* Create a row of dummy blocks at the bottom of the image. */
-            jzero_far((void *) coef->MCU_buffer[blkn],
+            jzero_far((void *)coef->MCU_buffer[blkn],
                       compptr->MCU_width * sizeof(JBLOCK));
             for (bi = 0; bi < compptr->MCU_width; bi++) {
-              coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
+              coef->MCU_buffer[blkn + bi][0][0] =
+                coef->MCU_buffer[blkn - 1][0][0];
             }
           }
           blkn += compptr->MCU_width;
@@ -201,7 +201,7 @@
       /* Try to write the MCU.  In event of a suspension failure, we will
        * re-DCT the MCU on restart (a bit inefficient, could be fixed...)
        */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -242,9 +242,9 @@
  */
 
 METHODDEF(boolean)
-compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION blocks_across, MCUs_across, MCUindex;
   int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
@@ -257,21 +257,21 @@
        ci++, compptr++) {
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[ci],
+      ((j_common_ptr)cinfo, coef->whole_image[ci],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, TRUE);
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
     /* Count non-dummy DCT block rows in this iMCU row. */
     if (coef->iMCU_row_num < last_iMCU_row)
       block_rows = compptr->v_samp_factor;
     else {
       /* NB: can't use last_row_height here, since may not be set! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
     }
     blocks_across = compptr->width_in_blocks;
     h_samp_factor = compptr->h_samp_factor;
     /* Count number of dummy blocks to be added at the right margin. */
-    ndummy = (int) (blocks_across % h_samp_factor);
+    ndummy = (int)(blocks_across % h_samp_factor);
     if (ndummy > 0)
       ndummy = h_samp_factor - ndummy;
     /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
@@ -281,12 +281,12 @@
       thisblockrow = buffer[block_row];
       (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                    input_buf[ci], thisblockrow,
-                                   (JDIMENSION) (block_row * DCTSIZE),
-                                   (JDIMENSION) 0, blocks_across);
+                                   (JDIMENSION)(block_row * DCTSIZE),
+                                   (JDIMENSION)0, blocks_across);
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
         thisblockrow += blocks_across; /* => first dummy block */
-        jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+        jzero_far((void *)thisblockrow, ndummy * sizeof(JBLOCK));
         lastDC = thisblockrow[-1][0];
         for (bi = 0; bi < ndummy; bi++) {
           thisblockrow[bi][0] = lastDC;
@@ -304,11 +304,11 @@
       for (block_row = block_rows; block_row < compptr->v_samp_factor;
            block_row++) {
         thisblockrow = buffer[block_row];
-        lastblockrow = buffer[block_row-1];
-        jzero_far((void *) thisblockrow,
-                  (size_t) (blocks_across * sizeof(JBLOCK)));
+        lastblockrow = buffer[block_row - 1];
+        jzero_far((void *)thisblockrow,
+                  (size_t)(blocks_across * sizeof(JBLOCK)));
         for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
-          lastDC = lastblockrow[h_samp_factor-1][0];
+          lastDC = lastblockrow[h_samp_factor - 1][0];
           for (bi = 0; bi < h_samp_factor; bi++) {
             thisblockrow[bi][0] = lastDC;
           }
@@ -338,9 +338,9 @@
  */
 
 METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
@@ -355,9 +355,9 @@
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
   }
 
   /* Loop to process one whole iMCU row */
@@ -371,14 +371,14 @@
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
           for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
             coef->MCU_buffer[blkn++] = buffer_ptr++;
           }
         }
       }
       /* Try to write the MCU. */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -402,14 +402,14 @@
  */
 
 GLOBAL(void)
-jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_c_coef_controller *)coef;
   coef->pub.start_pass = start_pass_coef;
 
   /* Create the coefficient buffer. */
@@ -423,12 +423,12 @@
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-                                (long) compptr->h_samp_factor),
-         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                                (long) compptr->v_samp_factor),
-         (JDIMENSION) compptr->v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)compptr->v_samp_factor);
     }
 #else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -439,7 +439,7 @@
     int i;
 
     buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
diff --git a/jccolext.c b/jccolext.c
index 479b320..19c955c 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -29,13 +29,13 @@
 
 INLINE
 LOCAL(void)
-rgb_ycc_convert_internal (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                          JDIMENSION output_row, int num_rows)
+rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
-  register JLONG * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
@@ -58,17 +58,14 @@
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)
-                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-                 >> SCALEBITS);
+      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)
-                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-                 >> SCALEBITS);
+      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -86,13 +83,13 @@
 
 INLINE
 LOCAL(void)
-rgb_gray_convert_internal (j_compress_ptr cinfo,
-                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                           JDIMENSION output_row, int num_rows)
+rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                          JSAMPIMAGE output_buf, JDIMENSION output_row,
+                          int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
-  register JLONG * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr;
   register JDIMENSION col;
@@ -108,9 +105,8 @@
       b = GETJSAMPLE(inptr[RGB_BLUE]);
       inptr += RGB_PIXELSIZE;
       /* Y */
-      outptr[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                               ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -123,9 +119,9 @@
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                          JDIMENSION output_row, int num_rows)
+rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
diff --git a/jccolor.c b/jccolor.c
index b973d10..5b95e3f 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -63,9 +63,9 @@
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define CBCR_OFFSET     ((JLONG) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define CBCR_OFFSET     ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 /* We allocate one big table and divide it up into eight parts, instead of
  * doing eight alloc_small requests.  This lets us use a single table base
@@ -74,15 +74,15 @@
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
-#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
-#define R_CB_OFF        (3*(MAXJSAMPLE+1))
-#define G_CB_OFF        (4*(MAXJSAMPLE+1))
-#define B_CB_OFF        (5*(MAXJSAMPLE+1))
+#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
+#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
+#define R_CB_OFF        (3 * (MAXJSAMPLE + 1))
+#define G_CB_OFF        (4 * (MAXJSAMPLE + 1))
+#define B_CB_OFF        (5 * (MAXJSAMPLE + 1))
 #define R_CR_OFF        B_CB_OFF                /* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF        (6*(MAXJSAMPLE+1))
-#define B_CR_OFF        (7*(MAXJSAMPLE+1))
-#define TABLE_SIZE      (8*(MAXJSAMPLE+1))
+#define G_CR_OFF        (6 * (MAXJSAMPLE + 1))
+#define B_CR_OFF        (7 * (MAXJSAMPLE + 1))
+#define TABLE_SIZE      (8 * (MAXJSAMPLE + 1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -195,33 +195,33 @@
  */
 
 METHODDEF(void)
-rgb_ycc_start (j_compress_ptr cinfo)
+rgb_ycc_start(j_compress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_ycc_tab;
   JLONG i;
 
   /* Allocate and fill in the conversion tables. */
   cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
-    rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
-    rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i;
-    rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i     + ONE_HALF;
-    rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i;
-    rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i;
+    rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+    rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+    rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i   + ONE_HALF;
+    rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
+    rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
     /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
      * This ensures that the maximum output will round to MAXJSAMPLE
      * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
      */
-    rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i  + CBCR_OFFSET + ONE_HALF - 1;
 /*  B=>Cb and R=>Cr tables are the same
-    rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i + R_CR_OFF] = FIX(0.50000) * i  + CBCR_OFFSET + ONE_HALF - 1;
 */
-    rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i;
-    rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i;
+    rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
+    rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
   }
 }
 
@@ -231,43 +231,42 @@
  */
 
 METHODDEF(void)
-rgb_ycc_convert (j_compress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                 JDIMENSION output_row, int num_rows)
+rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    default:
-      rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  default:
+    rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                             num_rows);
+    break;
   }
 }
 
@@ -280,43 +279,42 @@
  */
 
 METHODDEF(void)
-rgb_gray_convert (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                  JDIMENSION output_row, int num_rows)
+rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                 JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    default:
-      rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  default:
+    rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                              num_rows);
+    break;
   }
 }
 
@@ -326,43 +324,42 @@
  */
 
 METHODDEF(void)
-rgb_rgb_convert (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                  JDIMENSION output_row, int num_rows)
+rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    default:
-      rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  default:
+    rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                             num_rows);
+    break;
   }
 }
 
@@ -376,11 +373,10 @@
  */
 
 METHODDEF(void)
-cmyk_ycck_convert (j_compress_ptr cinfo,
-                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                   JDIMENSION output_row, int num_rows)
+cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
@@ -408,17 +404,14 @@
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)
-                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-                 >> SCALEBITS);
+      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)
-                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-                 >> SCALEBITS);
+      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -431,9 +424,8 @@
  */
 
 METHODDEF(void)
-grayscale_convert (j_compress_ptr cinfo,
-                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                   JDIMENSION output_row, int num_rows)
+grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr;
@@ -460,9 +452,8 @@
  */
 
 METHODDEF(void)
-null_convert (j_compress_ptr cinfo,
-              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-              JDIMENSION output_row, int num_rows)
+null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
@@ -522,7 +513,7 @@
  */
 
 METHODDEF(void)
-null_method (j_compress_ptr cinfo)
+null_method(j_compress_ptr cinfo)
 {
   /* no work needed */
 }
@@ -533,14 +524,14 @@
  */
 
 GLOBAL(void)
-jinit_color_converter (j_compress_ptr cinfo)
+jinit_color_converter(j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
 
   cconvert = (my_cconvert_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_converter));
-  cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
+  cinfo->cconvert = (struct jpeg_color_converter *)cconvert;
   /* set start_pass to null method until we find out differently */
   cconvert->pub.start_pass = null_method;
 
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 6e3b19b..825e244 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -41,7 +41,7 @@
                                            FAST_FLOAT *divisors,
                                            FAST_FLOAT *workspace);
 
-METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
 
 typedef struct {
   struct jpeg_forward_dct pub;  /* public fields */
@@ -80,7 +80,7 @@
  */
 
 LOCAL(int)
-flss (UINT16 val)
+flss(UINT16 val)
 {
   int bit;
 
@@ -170,7 +170,7 @@
  */
 
 LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
+compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
 {
   UDCTELEM2 fq, fr;
   UDCTELEM c;
@@ -182,10 +182,10 @@
      * identity function.  Since only the C quantization algorithm is used in
      * these cases, the scale value is irrelevant.
      */
-    dtbl[DCTSIZE2 * 0] = (DCTELEM) 1;                       /* reciprocal */
-    dtbl[DCTSIZE2 * 1] = (DCTELEM) 0;                       /* correction */
-    dtbl[DCTSIZE2 * 2] = (DCTELEM) 1;                       /* scale */
-    dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8);  /* shift */
+    dtbl[DCTSIZE2 * 0] = (DCTELEM)1;                        /* reciprocal */
+    dtbl[DCTSIZE2 * 1] = (DCTELEM)0;                        /* correction */
+    dtbl[DCTSIZE2 * 2] = (DCTELEM)1;                        /* scale */
+    dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8);   /* shift */
     return 0;
   }
 
@@ -195,26 +195,26 @@
   fq = ((UDCTELEM2)1 << r) / divisor;
   fr = ((UDCTELEM2)1 << r) % divisor;
 
-  c = divisor / 2; /* for rounding */
+  c = divisor / 2;                      /* for rounding */
 
-  if (fr == 0) { /* divisor is power of two */
+  if (fr == 0) {                        /* divisor is power of two */
     /* fq will be one bit too large to fit in DCTELEM, so adjust */
     fq >>= 1;
     r--;
-  } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
+  } else if (fr <= (divisor / 2U)) {    /* fractional part is < 0.5 */
     c++;
-  } else { /* fractional part is > 0.5 */
+  } else {                              /* fractional part is > 0.5 */
     fq++;
   }
 
-  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
-  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 0] = (DCTELEM)fq;     /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM)c;      /* correction + roundfactor */
 #ifdef WITH_SIMD
-  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
 #else
   dtbl[DCTSIZE2 * 2] = 1;
 #endif
-  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
 
   if (r <= 16) return 0;
   else return 1;
@@ -233,9 +233,9 @@
  */
 
 METHODDEF(void)
-start_pass_fdctmgr (j_compress_ptr cinfo)
+start_pass_fdctmgr(j_compress_ptr cinfo)
 {
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtbl;
@@ -259,7 +259,7 @@
        */
       if (fdct->divisors[qtblno] == NULL) {
         fdct->divisors[qtblno] = (DCTELEM *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       (DCTSIZE2 * 4) * sizeof(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
@@ -269,7 +269,7 @@
             fdct->quantize == jsimd_quantize)
           fdct->quantize = quantize;
 #else
-        dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+        dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
 #endif
       }
       break;
@@ -299,23 +299,23 @@
 
         if (fdct->divisors[qtblno] == NULL) {
           fdct->divisors[qtblno] = (DCTELEM *)
-            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                         (DCTSIZE2 * 4) * sizeof(DCTELEM));
         }
         dtbl = fdct->divisors[qtblno];
         for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
           if (!compute_reciprocal(
-                DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                      (JLONG) aanscales[i]),
-                        CONST_BITS-3), &dtbl[i]) &&
+                DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                      (JLONG)aanscales[i]),
+                        CONST_BITS - 3), &dtbl[i]) &&
               fdct->quantize == jsimd_quantize)
             fdct->quantize = quantize;
 #else
-           dtbl[i] = (DCTELEM)
-             DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                   (JLONG) aanscales[i]),
-                     CONST_BITS-3);
+          dtbl[i] = (DCTELEM)
+            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                  (JLONG)aanscales[i]),
+                    CONST_BITS - 3);
 #endif
         }
       }
@@ -341,7 +341,7 @@
 
         if (fdct->float_divisors[qtblno] == NULL) {
           fdct->float_divisors[qtblno] = (FAST_FLOAT *)
-            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                         DCTSIZE2 * sizeof(FAST_FLOAT));
         }
         fdtbl = fdct->float_divisors[qtblno];
@@ -349,7 +349,7 @@
         for (row = 0; row < DCTSIZE; row++) {
           for (col = 0; col < DCTSIZE; col++) {
             fdtbl[i] = (FAST_FLOAT)
-              (1.0 / (((double) qtbl->quantval[i] *
+              (1.0 / (((double)qtbl->quantval[i] *
                        aanscalefactor[row] * aanscalefactor[col] * 8.0)));
             i++;
           }
@@ -370,7 +370,7 @@
  */
 
 METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 {
   register DCTELEM *workspaceptr;
   register JSAMPROW elemptr;
@@ -405,7 +405,7 @@
  */
 
 METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
   int i;
   DCTELEM temp;
@@ -426,15 +426,15 @@
     if (temp < 0) {
       temp = -temp;
       product = (UDCTELEM2)(temp + corr) * recip;
-      product >>= shift + sizeof(DCTELEM)*8;
+      product >>= shift + sizeof(DCTELEM) * 8;
       temp = (DCTELEM)product;
       temp = -temp;
     } else {
       product = (UDCTELEM2)(temp + corr) * recip;
-      product >>= shift + sizeof(DCTELEM)*8;
+      product >>= shift + sizeof(DCTELEM) * 8;
       temp = (DCTELEM)product;
     }
-    output_ptr[i] = (JCOEF) temp;
+    output_ptr[i] = (JCOEF)temp;
   }
 
 #else
@@ -457,20 +457,20 @@
      * If your machine's division is fast enough, define FAST_DIVIDE.
      */
 #ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)  a /= b
+#define DIVIDE_BY(a, b)  a /= b
 #else
-#define DIVIDE_BY(a,b)  if (a >= b) a /= b; else a = 0
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
 #endif
     if (temp < 0) {
       temp = -temp;
-      temp += qval>>1;  /* for rounding */
+      temp += qval >> 1;        /* for rounding */
       DIVIDE_BY(temp, qval);
       temp = -temp;
     } else {
-      temp += qval>>1;  /* for rounding */
+      temp += qval >> 1;        /* for rounding */
       DIVIDE_BY(temp, qval);
     }
-    output_ptr[i] = (JCOEF) temp;
+    output_ptr[i] = (JCOEF)temp;
   }
 
 #endif
@@ -487,14 +487,13 @@
  */
 
 METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
-             JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-             JDIMENSION start_row, JDIMENSION start_col,
-             JDIMENSION num_blocks)
+forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
+            JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+            JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
   DCTELEM *workspace;
   JDIMENSION bi;
@@ -522,9 +521,9 @@
 
 #ifdef DCT_FLOAT_SUPPORTED
 
-
 METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
+convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+               FAST_FLOAT *workspace)
 {
   register FAST_FLOAT *workspaceptr;
   register JSAMPROW elemptr;
@@ -555,7 +554,8 @@
 
 
 METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
+quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+               FAST_FLOAT *workspace)
 {
   register FAST_FLOAT temp;
   register int i;
@@ -571,20 +571,20 @@
      * The maximum coefficient size is +-16K (for 12-bit data), so this
      * code should work for either 16-bit or 32-bit ints.
      */
-    output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+    output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
   }
 }
 
 
 METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-                   JDIMENSION start_row, JDIMENSION start_col,
-                   JDIMENSION num_blocks)
+forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                  JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                  JDIMENSION start_row, JDIMENSION start_col,
+                  JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
   FAST_FLOAT *workspace;
   JDIMENSION bi;
@@ -618,15 +618,15 @@
  */
 
 GLOBAL(void)
-jinit_forward_dct (j_compress_ptr cinfo)
+jinit_forward_dct(j_compress_ptr cinfo)
 {
   my_fdct_ptr fdct;
   int i;
 
   fdct = (my_fdct_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_fdct_controller));
-  cinfo->fdct = (struct jpeg_forward_dct *) fdct;
+  cinfo->fdct = (struct jpeg_forward_dct *)fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
   /* First determine the DCT... */
@@ -703,12 +703,12 @@
 #ifdef DCT_FLOAT_SUPPORTED
   if (cinfo->dct_method == JDCT_FLOAT)
     fdct->float_workspace = (FAST_FLOAT *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(FAST_FLOAT) * DCTSIZE2);
   else
 #endif
     fdct->workspace = (DCTELEM *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(DCTELEM) * DCTSIZE2);
 
   /* Mark divisor tables unallocated */
diff --git a/jchuff.c b/jchuff.c
index fffaace..7bca0e7 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -55,10 +55,6 @@
 #define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
 #endif
 
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
-#endif
-
 
 /* Expanded entropy encoder object for Huffman encoding.
  *
@@ -67,9 +63,9 @@
  */
 
 typedef struct {
-  size_t put_buffer;            /* current bit-accumulation buffer */
-  int put_bits;                 /* # of bits now in it */
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  size_t put_buffer;                    /* current bit-accumulation buffer */
+  int put_bits;                         /* # of bits now in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
 /* This macro is to work around compilers with missing or broken
@@ -78,16 +74,16 @@
  */
 
 #ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
+#define ASSIGN_STATE(dest, src)  ((dest) = (src))
 #else
 #if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).put_buffer = (src).put_buffer, \
-         (dest).put_bits = (src).put_bits, \
-         (dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
+#define ASSIGN_STATE(dest, src) \
+  ((dest).put_buffer = (src).put_buffer, \
+   (dest).put_bits = (src).put_bits, \
+   (dest).last_dc_val[0] = (src).last_dc_val[0], \
+   (dest).last_dc_val[1] = (src).last_dc_val[1], \
+   (dest).last_dc_val[2] = (src).last_dc_val[2], \
+   (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -128,12 +124,12 @@
 
 
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
 #ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo,
-                                      JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,
+                                     JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
 #endif
 
 
@@ -144,9 +140,9 @@
  */
 
 METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, dctbl, actbl;
   jpeg_component_info *compptr;
 
@@ -180,12 +176,12 @@
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->dc_count_ptrs[dctbl] == NULL)
         entropy->dc_count_ptrs[dctbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
       MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
       if (entropy->ac_count_ptrs[actbl] == NULL)
         entropy->ac_count_ptrs[actbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
       MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
 #endif
@@ -193,9 +189,9 @@
       /* Compute derived values for Huffman tables */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
-                              & entropy->dc_derived_tbls[dctbl]);
+                              &entropy->dc_derived_tbls[dctbl]);
       jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
-                              & entropy->ac_derived_tbls[actbl]);
+                              &entropy->ac_derived_tbls[actbl]);
     }
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
@@ -219,8 +215,8 @@
  */
 
 GLOBAL(void)
-jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
-                         c_derived_tbl **pdtbl)
+jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
+                        c_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   c_derived_tbl *dtbl;
@@ -244,7 +240,7 @@
   /* Allocate a workspace if we haven't already done so. */
   if (*pdtbl == NULL)
     *pdtbl = (c_derived_tbl *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(c_derived_tbl));
   dtbl = *pdtbl;
 
@@ -252,11 +248,11 @@
 
   p = 0;
   for (l = 1; l <= 16; l++) {
-    i = (int) htbl->bits[l];
+    i = (int)htbl->bits[l];
     if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
-      huffsize[p++] = (char) l;
+      huffsize[p++] = (char)l;
   }
   huffsize[p] = 0;
   lastp = p;
@@ -268,14 +264,14 @@
   si = huffsize[0];
   p = 0;
   while (huffsize[p]) {
-    while (((int) huffsize[p]) == si) {
+    while (((int)huffsize[p]) == si) {
       huffcode[p++] = code;
       code++;
     }
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((JLONG) code) >= (((JLONG) 1) << si))
+    if (((JLONG)code) >= (((JLONG)1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -310,20 +306,21 @@
 /* Outputting bytes to the file */
 
 /* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action)  \
-        { *(state)->next_output_byte++ = (JOCTET) (val);  \
-          if (--(state)->free_in_buffer == 0)  \
-            if (! dump_buffer(state))  \
-              { action; } }
+#define emit_byte(state, val, action) { \
+  *(state)->next_output_byte++ = (JOCTET)(val); \
+  if (--(state)->free_in_buffer == 0) \
+    if (!dump_buffer(state)) \
+      { action; } \
+}
 
 
 LOCAL(boolean)
-dump_buffer (working_state *state)
+dump_buffer(working_state *state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
   struct jpeg_destination_mgr *dest = state->cinfo->dest;
 
-  if (! (*dest->empty_output_buffer) (state->cinfo))
+  if (!(*dest->empty_output_buffer) (state->cinfo))
     return FALSE;
   /* After a successful buffer dump, must reset buffer pointers */
   state->next_output_byte = dest->next_output_byte;
@@ -349,7 +346,7 @@
   *buffer++ = c; \
   if (c == 0xFF)  /* need to stuff a zero byte? */ \
     *buffer++ = 0; \
- }
+}
 
 #define PUT_BITS(code, size) { \
   put_bits += size; \
@@ -387,7 +384,7 @@
 #error Cannot determine word size
 #endif
 
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
 
 #define EMIT_BITS(code, size) { \
   CHECKBUF47() \
@@ -395,11 +392,11 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG)1) << nbits) - 1; \
   CHECKBUF31() \
   PUT_BITS(code, size) \
   PUT_BITS(temp2, nbits) \
- }
+}
 
 #else
 
@@ -409,12 +406,12 @@
 }
 
 #define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG) 1)<<nbits) - 1; \
+  temp2 &= (((JLONG)1) << nbits) - 1; \
   PUT_BITS(code, size) \
   CHECKBUF15() \
   PUT_BITS(temp2, nbits) \
   CHECKBUF15() \
- }
+}
 
 #endif
 
@@ -434,34 +431,33 @@
   if (state->free_in_buffer < BUFSIZE) { \
     localbuf = 1; \
     buffer = _buffer; \
-  } \
-  else buffer = state->next_output_byte; \
- }
+  } else \
+    buffer = state->next_output_byte; \
+}
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
-      bytestocopy = min(bytes, state->free_in_buffer); \
+      bytestocopy = MIN(bytes, state->free_in_buffer); \
       MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
       state->next_output_byte += bytestocopy; \
       buffer += bytestocopy; \
       state->free_in_buffer -= bytestocopy; \
       if (state->free_in_buffer == 0) \
-        if (! dump_buffer(state)) return FALSE; \
+        if (!dump_buffer(state)) return FALSE; \
       bytes -= bytestocopy; \
     } \
-  } \
-  else { \
+  } else { \
     state->free_in_buffer -= (buffer - state->next_output_byte); \
     state->next_output_byte = buffer; \
   } \
- }
+}
 
 
 LOCAL(boolean)
-flush_bits (working_state *state)
+flush_bits(working_state *state)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
   size_t put_buffer;  int put_bits;
@@ -486,8 +482,8 @@
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
-encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
-                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
+                      c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
   size_t bytes, bytestocopy;  int localbuf = 0;
@@ -503,8 +499,8 @@
 }
 
 LOCAL(boolean)
-encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
-                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
+                 c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   int temp, temp2, temp3;
   int nbits;
@@ -522,11 +518,11 @@
 
   temp = temp2 = block[0] - last_dc_val;
 
- /* This is a well-known technique for obtaining the absolute value without a
-  * branch.  It is derived from an assembly language technique presented in
-  * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-  * Agner Fog.
-  */
+  /* This is a well-known technique for obtaining the absolute value without a
+   * branch.  It is derived from an assembly language technique presented in
+   * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
+   * Agner Fog.
+   */
   temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
   temp ^= temp3;
   temp -= temp3;
@@ -544,7 +540,7 @@
   EMIT_BITS(code, size)
 
   /* Mask off any extra bits in code */
-  temp2 &= (((JLONG) 1)<<nbits) - 1;
+  temp2 &= (((JLONG)1) << nbits) - 1;
 
   /* Emit that number of bits of the value, if positive, */
   /* or the complement of its magnitude, if negative. */
@@ -558,7 +554,7 @@
  * improves performance greatly on systems with a limited number of
  * registers (such as x86.)
  */
-#define kloop(jpeg_natural_order_of_k) {  \
+#define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
     r++; \
   } else { \
@@ -575,11 +571,11 @@
       r -= 16; \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits;  \
+    temp3 = (r << 4) + nbits; \
     code = actbl->ehufco[temp3]; \
     size = actbl->ehufsi[temp3]; \
     EMIT_CODE(code, size) \
-    r = 0;  \
+    r = 0; \
   } \
 }
 
@@ -616,11 +612,11 @@
  */
 
 LOCAL(boolean)
-emit_restart (working_state *state, int restart_num)
+emit_restart(working_state *state, int restart_num)
 {
   int ci;
 
-  if (! flush_bits(state))
+  if (!flush_bits(state))
     return FALSE;
 
   emit_byte(state, 0xFF, return FALSE);
@@ -641,9 +637,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   working_state state;
   int blkn, ci;
   jpeg_component_info *compptr;
@@ -657,7 +653,7 @@
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! emit_restart(&state, entropy->next_restart_num))
+      if (!emit_restart(&state, entropy->next_restart_num))
         return FALSE;
   }
 
@@ -666,10 +662,10 @@
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
       compptr = cinfo->cur_comp_info[ci];
-      if (! encode_one_block_simd(&state,
-                                  MCU_data[blkn][0], state.cur.last_dc_val[ci],
-                                  entropy->dc_derived_tbls[compptr->dc_tbl_no],
-                                  entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+      if (!encode_one_block_simd(&state,
+                                 MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                                 entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                                 entropy->ac_derived_tbls[compptr->ac_tbl_no]))
         return FALSE;
       /* Update last_dc_val */
       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -678,10 +674,10 @@
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
       compptr = cinfo->cur_comp_info[ci];
-      if (! encode_one_block(&state,
-                             MCU_data[blkn][0], state.cur.last_dc_val[ci],
-                             entropy->dc_derived_tbls[compptr->dc_tbl_no],
-                             entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+      if (!encode_one_block(&state,
+                            MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                            entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                            entropy->ac_derived_tbls[compptr->ac_tbl_no]))
         return FALSE;
       /* Update last_dc_val */
       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -712,9 +708,9 @@
  */
 
 METHODDEF(void)
-finish_pass_huff (j_compress_ptr cinfo)
+finish_pass_huff(j_compress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   working_state state;
 
   /* Load up working state ... flush_bits needs it */
@@ -724,7 +720,7 @@
   state.cinfo = cinfo;
 
   /* Flush out the last data */
-  if (! flush_bits(&state))
+  if (!flush_bits(&state))
     ERREXIT(cinfo, JERR_CANT_SUSPEND);
 
   /* Update state */
@@ -751,8 +747,8 @@
 /* Process a single block's worth of coefficients */
 
 LOCAL(void)
-htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
-                 long dc_counts[], long ac_counts[])
+htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
+                long dc_counts[], long ac_counts[])
 {
   register int temp;
   register int nbits;
@@ -773,7 +769,7 @@
   /* Check for out-of-range coefficient values.
    * Since we're encoding a difference, the range limit is twice as much.
    */
-  if (nbits > MAX_COEF_BITS+1)
+  if (nbits > MAX_COEF_BITS + 1)
     ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
   /* Count the Huffman symbol for the number of bits */
@@ -824,9 +820,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int blkn, ci;
   jpeg_component_info *compptr;
 
@@ -884,10 +880,10 @@
  */
 
 GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
+jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 {
 #define MAX_CLEN 32             /* assumed maximum initial code length */
-  UINT8 bits[MAX_CLEN+1];       /* bits[k] = # of symbols with code length k */
+  UINT8 bits[MAX_CLEN + 1];     /* bits[k] = # of symbols with code length k */
   int codesize[257];            /* codesize[k] = code length of symbol k */
   int others[257];              /* next symbol in current branch of tree */
   int c1, c2;
@@ -987,8 +983,8 @@
         j--;
 
       bits[i] -= 2;             /* remove two symbols */
-      bits[i-1]++;              /* one goes in this length */
-      bits[j+1] += 2;           /* two new symbols in this length */
+      bits[i - 1]++;            /* one goes in this length */
+      bits[j + 1] += 2;         /* two new symbols in this length */
       bits[j]--;                /* symbol of this length is now a prefix */
     }
   }
@@ -1009,7 +1005,7 @@
   for (i = 1; i <= MAX_CLEN; i++) {
     for (j = 0; j <= 255; j++) {
       if (codesize[j] == i) {
-        htbl->huffval[p] = (UINT8) j;
+        htbl->huffval[p] = (UINT8)j;
         p++;
       }
     }
@@ -1025,9 +1021,9 @@
  */
 
 METHODDEF(void)
-finish_pass_gather (j_compress_ptr cinfo)
+finish_pass_gather(j_compress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, dctbl, actbl;
   jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
@@ -1044,17 +1040,17 @@
     compptr = cinfo->cur_comp_info[ci];
     dctbl = compptr->dc_tbl_no;
     actbl = compptr->ac_tbl_no;
-    if (! did_dc[dctbl]) {
-      htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
+    if (!did_dc[dctbl]) {
+      htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
       did_dc[dctbl] = TRUE;
     }
-    if (! did_ac[actbl]) {
-      htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
+    if (!did_ac[actbl]) {
+      htblptr = &cinfo->ac_huff_tbl_ptrs[actbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
       did_ac[actbl] = TRUE;
     }
@@ -1070,15 +1066,15 @@
  */
 
 GLOBAL(void)
-jinit_huff_encoder (j_compress_ptr cinfo)
+jinit_huff_encoder(j_compress_ptr cinfo)
 {
   huff_entropy_ptr entropy;
   int i;
 
   entropy = (huff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(huff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass_huff;
 
   /* Mark tables unallocated */
diff --git a/jchuff.h b/jchuff.h
index 4236089..3d9b7c0 100644
--- a/jchuff.h
+++ b/jchuff.h
@@ -34,10 +34,9 @@
 } c_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
-        (j_compress_ptr cinfo, boolean isDC, int tblno,
-         c_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC,
+                                     int tblno, c_derived_tbl **pdtbl);
 
 /* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
-        (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
+EXTERN(void) jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl,
+                                    long freq[]);
diff --git a/jcicc.c b/jcicc.c
new file mode 100644
index 0000000..11037ff
--- /dev/null
+++ b/jcicc.c
@@ -0,0 +1,105 @@
+/*
+ * jcicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to write International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files.  The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers.  The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to embed the profile data in a JPEG file while writing it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/*
+ * Since an ICC profile can be larger than the maximum size of a JPEG marker
+ * (64K), we need provisions to split it into multiple markers.  The format
+ * defined by the ICC specifies one or more APP2 markers containing the
+ * following data:
+ *      Identifying string      ASCII "ICC_PROFILE\0"  (12 bytes)
+ *      Marker sequence number  1 for first APP2, 2 for next, etc (1 byte)
+ *      Number of markers       Total number of APP2's used (1 byte)
+ *      Profile data            (remainder of APP2 data)
+ * Decoders should use the marker sequence numbers to reassemble the profile,
+ * rather than assuming that the APP2 markers appear in the correct sequence.
+ */
+
+#define ICC_MARKER  (JPEG_APP0 + 2)     /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN  14            /* size of non-profile data in APP2 */
+#define MAX_BYTES_IN_MARKER  65533      /* maximum data len of a JPEG marker */
+#define MAX_DATA_BYTES_IN_MARKER  (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN)
+
+
+/*
+ * This routine writes the given ICC profile data into a JPEG file.  It *must*
+ * be called AFTER calling jpeg_start_compress() and BEFORE the first call to
+ * jpeg_write_scanlines().  (This ordering ensures that the APP2 marker(s) will
+ * appear after the SOI and JFIF or Adobe markers, but before all else.)
+ *
+ * The profile is split across as many APP2 markers as needed.  The marker
+ * sequence number and marker count are one byte each, so a profile needing
+ * more than 255 markers is rejected instead of being written with wrapped
+ * sequencing bytes.  Errors are raised via ERREXIT: JERR_BUFFER_SIZE for a
+ * NULL, empty, or oversized profile; JERR_BAD_STATE before CSTATE_SCANNING.
+ */
+
+GLOBAL(void)
+jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+                       unsigned int icc_data_len)
+{
+  /* "ICC_PROFILE\0", spelled as byte values so that the output is correct
+   * even if the local character set is not ASCII.
+   */
+  static const JOCTET icc_id[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00
+  };
+  unsigned int num_markers;     /* total number of markers we'll write */
+  int cur_marker = 1;           /* per spec, counting starts at 1 */
+  unsigned int length;          /* number of bytes to write in this marker */
+  int i;
+
+  if (icc_data_ptr == NULL || icc_data_len == 0)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  if (cinfo->global_state < CSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Calculate the number of markers we'll need, rounding up of course */
+  num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER;
+  if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len)
+    num_markers++;
+  /* The sequence number and marker count are written as one byte each */
+  if (num_markers > 255)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  while (icc_data_len > 0) {
+    /* length of profile to put in this marker */
+    length = icc_data_len;
+    if (length > MAX_DATA_BYTES_IN_MARKER)
+      length = MAX_DATA_BYTES_IN_MARKER;
+    icc_data_len -= length;
+
+    /* Write the JPEG marker header (APP2 code and marker length) */
+    jpeg_write_m_header(cinfo, ICC_MARKER,
+                        (unsigned int)(length + ICC_OVERHEAD_LEN));
+
+    /* Write the marker identifying string */
+    for (i = 0; i < 12; i++)
+      jpeg_write_m_byte(cinfo, icc_id[i]);
+    /* Add the sequencing info */
+    jpeg_write_m_byte(cinfo, cur_marker);
+    jpeg_write_m_byte(cinfo, (int)num_markers);
+
+    /* Add the profile data */
+    while (length--)
+      jpeg_write_m_byte(cinfo, *icc_data_ptr++);
+    cur_marker++;
+  }
+}
diff --git a/jcinit.c b/jcinit.c
index 463bd8c..78aa465 100644
--- a/jcinit.c
+++ b/jcinit.c
@@ -28,13 +28,13 @@
  */
 
 GLOBAL(void)
-jinit_compress_master (j_compress_ptr cinfo)
+jinit_compress_master(j_compress_ptr cinfo)
 {
   /* Initialize master control (includes parameter checking/processing) */
   jinit_c_master_control(cinfo, FALSE /* full compression */);
 
   /* Preprocessing */
-  if (! cinfo->raw_data_in) {
+  if (!cinfo->raw_data_in) {
     jinit_color_converter(cinfo);
     jinit_downsampler(cinfo);
     jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
@@ -60,14 +60,14 @@
   }
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
-  jinit_c_coef_controller(cinfo,
-                (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
+  jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                           cinfo->optimize_coding));
   jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Write the datastream header (SOI) immediately.
    * Frame and scan headers are postponed till later.
diff --git a/jcmainct.c b/jcmainct.c
index d01f463..3f23028 100644
--- a/jcmainct.c
+++ b/jcmainct.c
@@ -39,9 +39,10 @@
 
 
 /* Forward declarations */
-METHODDEF(void) process_data_simple_main
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-         JDIMENSION in_rows_avail);
+METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
+                                         JSAMPARRAY input_buf,
+                                         JDIMENSION *in_row_ctr,
+                                         JDIMENSION in_rows_avail);
 
 
 /*
@@ -49,9 +50,9 @@
  */
 
 METHODDEF(void)
-start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Do nothing in raw-data mode. */
   if (cinfo->raw_data_in)
@@ -75,19 +76,18 @@
  */
 
 METHODDEF(void)
-process_data_simple_main (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                          JDIMENSION in_rows_avail)
+process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Read input data if we haven't filled the main buffer yet */
     if (main_ptr->rowgroup_ctr < DCTSIZE)
-      (*cinfo->prep->pre_process_data) (cinfo,
-                                        input_buf, in_row_ctr, in_rows_avail,
-                                        main_ptr->buffer, &main_ptr->rowgroup_ctr,
-                                        (JDIMENSION) DCTSIZE);
+      (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
+                                        in_rows_avail, main_ptr->buffer,
+                                        &main_ptr->rowgroup_ctr,
+                                        (JDIMENSION)DCTSIZE);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
@@ -97,14 +97,14 @@
       return;
 
     /* Send the completed row to the compressor */
-    if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
       /* If compressor did not consume the whole row, then we must need to
        * suspend processing and return to the application.  In this situation
        * we pretend we didn't yet consume the last input row; otherwise, if
        * it happened to be the last row of the image, the application would
        * think we were done.
        */
-      if (! main_ptr->suspended) {
+      if (!main_ptr->suspended) {
         (*in_row_ctr)--;
         main_ptr->suspended = TRUE;
       }
@@ -128,16 +128,16 @@
  */
 
 GLOBAL(void)
-jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci;
   jpeg_component_info *compptr;
 
   main_ptr = (my_main_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_main_controller));
-  cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
+  cinfo->main = (struct jpeg_c_main_controller *)main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
   /* We don't need to create a buffer in raw-data mode. */
@@ -154,9 +154,9 @@
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
          compptr->width_in_blocks * DCTSIZE,
-         (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+         (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
     }
   }
 }
diff --git a/jcmarker.c b/jcmarker.c
index 463f665..7e1af1f 100644
--- a/jcmarker.c
+++ b/jcmarker.c
@@ -110,30 +110,30 @@
  */
 
 LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int val)
+emit_byte(j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
   struct jpeg_destination_mgr *dest = cinfo->dest;
 
-  *(dest->next_output_byte)++ = (JOCTET) val;
+  *(dest->next_output_byte)++ = (JOCTET)val;
   if (--dest->free_in_buffer == 0) {
-    if (! (*dest->empty_output_buffer) (cinfo))
+    if (!(*dest->empty_output_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
 }
 
 
 LOCAL(void)
-emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
+emit_marker(j_compress_ptr cinfo, JPEG_MARKER mark)
 /* Emit a marker code */
 {
   emit_byte(cinfo, 0xFF);
-  emit_byte(cinfo, (int) mark);
+  emit_byte(cinfo, (int)mark);
 }
 
 
 LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int value)
+emit_2bytes(j_compress_ptr cinfo, int value)
 /* Emit a 2-byte integer; these are always MSB first in JPEG files */
 {
   emit_byte(cinfo, (value >> 8) & 0xFF);
@@ -146,7 +146,7 @@
  */
 
 LOCAL(int)
-emit_dqt (j_compress_ptr cinfo, int index)
+emit_dqt(j_compress_ptr cinfo, int index)
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
@@ -163,19 +163,19 @@
       prec = 1;
   }
 
-  if (! qtbl->sent_table) {
+  if (!qtbl->sent_table) {
     emit_marker(cinfo, M_DQT);
 
-    emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_2bytes(cinfo, prec ? DCTSIZE2 * 2 + 1 + 2 : DCTSIZE2 + 1 + 2);
 
-    emit_byte(cinfo, index + (prec<<4));
+    emit_byte(cinfo, index + (prec << 4));
 
     for (i = 0; i < DCTSIZE2; i++) {
       /* The table entries must be emitted in zigzag order. */
       unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
       if (prec)
-        emit_byte(cinfo, (int) (qval >> 8));
-      emit_byte(cinfo, (int) (qval & 0xFF));
+        emit_byte(cinfo, (int)(qval >> 8));
+      emit_byte(cinfo, (int)(qval & 0xFF));
     }
 
     qtbl->sent_table = TRUE;
@@ -186,7 +186,7 @@
 
 
 LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
+emit_dht(j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
   JHUFF_TBL *htbl;
@@ -202,7 +202,7 @@
   if (htbl == NULL)
     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
 
-  if (! htbl->sent_table) {
+  if (!htbl->sent_table) {
     emit_marker(cinfo, M_DHT);
 
     length = 0;
@@ -224,7 +224,7 @@
 
 
 LOCAL(void)
-emit_dac (j_compress_ptr cinfo)
+emit_dac(j_compress_ptr cinfo)
 /* Emit a DAC marker */
 /* Since the useful info is so small, we want to emit all the tables in */
 /* one DAC marker.  Therefore this routine does its own scan of the table. */
@@ -255,7 +255,7 @@
   if (length) {
     emit_marker(cinfo, M_DAC);
 
-    emit_2bytes(cinfo, length*2 + 2);
+    emit_2bytes(cinfo, length * 2 + 2);
 
     for (i = 0; i < NUM_ARITH_TBLS; i++) {
       if (dc_in_use[i]) {
@@ -273,19 +273,19 @@
 
 
 LOCAL(void)
-emit_dri (j_compress_ptr cinfo)
+emit_dri(j_compress_ptr cinfo)
 /* Emit a DRI marker */
 {
   emit_marker(cinfo, M_DRI);
 
   emit_2bytes(cinfo, 4);        /* fixed length */
 
-  emit_2bytes(cinfo, (int) cinfo->restart_interval);
+  emit_2bytes(cinfo, (int)cinfo->restart_interval);
 }
 
 
 LOCAL(void)
-emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
+emit_sof(j_compress_ptr cinfo, JPEG_MARKER code)
 /* Emit a SOF marker */
 {
   int ci;
@@ -296,13 +296,12 @@
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->_jpeg_height > 65535L ||
-      (long) cinfo->_jpeg_width > 65535L)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
+  if ((long)cinfo->_jpeg_height > 65535L || (long)cinfo->_jpeg_width > 65535L)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)65535);
 
   emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
-  emit_2bytes(cinfo, (int) cinfo->_jpeg_width);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_height);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_width);
 
   emit_byte(cinfo, cinfo->num_components);
 
@@ -316,7 +315,7 @@
 
 
 LOCAL(void)
-emit_sos (j_compress_ptr cinfo)
+emit_sos(j_compress_ptr cinfo)
 /* Emit a SOS marker */
 {
   int i, td, ta;
@@ -351,7 +350,7 @@
 
 
 LOCAL(void)
-emit_jfif_app0 (j_compress_ptr cinfo)
+emit_jfif_app0(j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
 {
   /*
@@ -378,15 +377,15 @@
   emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
   emit_byte(cinfo, cinfo->JFIF_minor_version);
   emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
-  emit_2bytes(cinfo, (int) cinfo->X_density);
-  emit_2bytes(cinfo, (int) cinfo->Y_density);
+  emit_2bytes(cinfo, (int)cinfo->X_density);
+  emit_2bytes(cinfo, (int)cinfo->Y_density);
   emit_byte(cinfo, 0);          /* No thumbnail image */
   emit_byte(cinfo, 0);
 }
 
 
 LOCAL(void)
-emit_adobe_app14 (j_compress_ptr cinfo)
+emit_adobe_app14(j_compress_ptr cinfo)
 /* Emit an Adobe APP14 marker */
 {
   /*
@@ -440,19 +439,19 @@
  */
 
 METHODDEF(void)
-write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+write_marker_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
 /* Emit an arbitrary marker header */
 {
-  if (datalen > (unsigned int) 65533)           /* safety check */
+  if (datalen > (unsigned int)65533)            /* safety check */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
-  emit_marker(cinfo, (JPEG_MARKER) marker);
+  emit_marker(cinfo, (JPEG_MARKER)marker);
 
-  emit_2bytes(cinfo, (int) (datalen + 2));      /* total length */
+  emit_2bytes(cinfo, (int)(datalen + 2));       /* total length */
 }
 
 METHODDEF(void)
-write_marker_byte (j_compress_ptr cinfo, int val)
+write_marker_byte(j_compress_ptr cinfo, int val)
 /* Emit one byte of marker parameters following write_marker_header */
 {
   emit_byte(cinfo, val);
@@ -471,9 +470,9 @@
  */
 
 METHODDEF(void)
-write_file_header (j_compress_ptr cinfo)
+write_file_header(j_compress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
   emit_marker(cinfo, M_SOI);    /* first the SOI */
 
@@ -496,7 +495,7 @@
  */
 
 METHODDEF(void)
-write_frame_header (j_compress_ptr cinfo)
+write_frame_header(j_compress_ptr cinfo)
 {
   int ci, prec;
   boolean is_baseline;
@@ -556,9 +555,9 @@
  */
 
 METHODDEF(void)
-write_scan_header (j_compress_ptr cinfo)
+write_scan_header(j_compress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   int i;
   jpeg_component_info *compptr;
 
@@ -600,7 +599,7 @@
  */
 
 METHODDEF(void)
-write_file_trailer (j_compress_ptr cinfo)
+write_file_trailer(j_compress_ptr cinfo)
 {
   emit_marker(cinfo, M_EOI);
 }
@@ -614,7 +613,7 @@
  */
 
 METHODDEF(void)
-write_tables_only (j_compress_ptr cinfo)
+write_tables_only(j_compress_ptr cinfo)
 {
   int i;
 
@@ -622,10 +621,10 @@
 
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     if (cinfo->quant_tbl_ptrs[i] != NULL)
-      (void) emit_dqt(cinfo, i);
+      (void)emit_dqt(cinfo, i);
   }
 
-  if (! cinfo->arith_code) {
+  if (!cinfo->arith_code) {
     for (i = 0; i < NUM_HUFF_TBLS; i++) {
       if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
         emit_dht(cinfo, i, FALSE);
@@ -643,15 +642,15 @@
  */
 
 GLOBAL(void)
-jinit_marker_writer (j_compress_ptr cinfo)
+jinit_marker_writer(j_compress_ptr cinfo)
 {
   my_marker_ptr marker;
 
   /* Create the subobject */
   marker = (my_marker_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_marker_writer));
-  cinfo->marker = (struct jpeg_marker_writer *) marker;
+  cinfo->marker = (struct jpeg_marker_writer *)marker;
   /* Initialize method pointers */
   marker->pub.write_file_header = write_file_header;
   marker->pub.write_frame_header = write_frame_header;
diff --git a/jcmaster.c b/jcmaster.c
index 03a8b40..7536a6f 100644
--- a/jcmaster.c
+++ b/jcmaster.c
@@ -25,9 +25,9 @@
 /* Private state */
 
 typedef enum {
-        main_pass,              /* input data, also do first output step */
-        huff_opt_pass,          /* Huffman code optimization pass */
-        output_pass             /* data output pass */
+  main_pass,                    /* input data, also do first output step */
+  huff_opt_pass,                /* Huffman code optimization pass */
+  output_pass                   /* data output pass */
 } c_pass_type;
 
 typedef struct {
@@ -66,7 +66,7 @@
  */
 
 GLOBAL(void)
-jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
   /* Hardwire it to "no scaling" */
@@ -79,7 +79,7 @@
 
 
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo, boolean transcode_only)
+initial_setup(j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
   int ci;
@@ -95,19 +95,19 @@
 #endif
 
   /* Sanity check on image dimensions */
-  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
-      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 ||
+      cinfo->num_components <= 0 || cinfo->input_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+  if ((long)cinfo->_jpeg_height > (long)JPEG_MAX_DIMENSION ||
+      (long)cinfo->_jpeg_width > (long)JPEG_MAX_DIMENSION)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
 
   /* Width of an input scanline must be representable as JDIMENSION. */
-  samplesperrow = (long) cinfo->image_width * (long) cinfo->input_components;
-  jd_samplesperrow = (JDIMENSION) samplesperrow;
-  if ((long) jd_samplesperrow != samplesperrow)
+  samplesperrow = (long)cinfo->image_width * (long)cinfo->input_components;
+  jd_samplesperrow = (JDIMENSION)samplesperrow;
+  if ((long)jd_samplesperrow != samplesperrow)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
 
   /* For now, precision must match compiled-in value... */
@@ -124,8 +124,10 @@
   cinfo->max_v_samp_factor = 1;
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+    if (compptr->h_samp_factor <= 0 ||
+        compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+        compptr->v_samp_factor <= 0 ||
+        compptr->v_samp_factor > MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
                                    compptr->h_samp_factor);
@@ -146,18 +148,18 @@
 #endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-                    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+                    (long)cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-                    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+                    (long)cinfo->max_v_samp_factor);
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -166,15 +168,15 @@
    * main controller will call coefficient controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->_jpeg_height,
-                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long)cinfo->_jpeg_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
 }
 
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
 
 LOCAL(void)
-validate_script (j_compress_ptr cinfo)
+validate_script(j_compress_ptr cinfo)
 /* Verify that the scan script in cinfo->scan_info[] is valid; also
  * determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
  */
@@ -196,10 +198,10 @@
    * for progressive JPEG, no scan can have this.
    */
   scanptr = cinfo->scan_info;
-  if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2-1) {
+  if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
 #ifdef C_PROGRESSIVE_SUPPORTED
     cinfo->progressive_mode = TRUE;
-    last_bitpos_ptr = & last_bitpos[0][0];
+    last_bitpos_ptr = &last_bitpos[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
       for (coefi = 0; coefi < DCTSIZE2; coefi++)
         *last_bitpos_ptr++ = -1;
@@ -222,7 +224,7 @@
       if (thisi < 0 || thisi >= cinfo->num_components)
         ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
       /* Components must appear in SOF order within each scan */
-      if (ci > 0 && thisi <= scanptr->component_index[ci-1])
+      if (ci > 0 && thisi <= scanptr->component_index[ci - 1])
         ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
     }
     /* Validate progression parameters */
@@ -255,7 +257,7 @@
           ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       }
       for (ci = 0; ci < ncomps; ci++) {
-        last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+        last_bitpos_ptr = &last_bitpos[scanptr->component_index[ci]][0];
         if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
           ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
         for (coefi = Ss; coefi <= Se; coefi++) {
@@ -265,7 +267,7 @@
               ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
           } else {
             /* not first scan */
-            if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+            if (Ah != last_bitpos_ptr[coefi] || Al != Ah - 1)
               ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
           }
           last_bitpos_ptr[coefi] = Al;
@@ -274,7 +276,7 @@
 #endif
     } else {
       /* For sequential JPEG, all progression parameters must be these: */
-      if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
+      if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
         ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       /* Make sure components are not sent twice */
       for (ci = 0; ci < ncomps; ci++) {
@@ -301,7 +303,7 @@
 #endif
   } else {
     for (ci = 0; ci < cinfo->num_components; ci++) {
-      if (! component_sent[ci])
+      if (!component_sent[ci])
         ERREXIT(cinfo, JERR_MISSING_DATA);
     }
   }
@@ -311,7 +313,7 @@
 
 
 LOCAL(void)
-select_scan_parameters (j_compress_ptr cinfo)
+select_scan_parameters(j_compress_ptr cinfo)
 /* Set up the scan parameters for the current scan */
 {
   int ci;
@@ -319,7 +321,7 @@
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   if (cinfo->scan_info != NULL) {
     /* Prepare for current scan --- the script is already validated */
-    my_master_ptr master = (my_master_ptr) cinfo->master;
+    my_master_ptr master = (my_master_ptr)cinfo->master;
     const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
 
     cinfo->comps_in_scan = scanptr->comps_in_scan;
@@ -331,8 +333,7 @@
     cinfo->Se = scanptr->Se;
     cinfo->Ah = scanptr->Ah;
     cinfo->Al = scanptr->Al;
-  }
-  else
+  } else
 #endif
   {
     /* Prepare for single sequential-JPEG scan containing all components */
@@ -344,7 +345,7 @@
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
     }
     cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2-1;
+    cinfo->Se = DCTSIZE2 - 1;
     cinfo->Ah = 0;
     cinfo->Al = 0;
   }
@@ -352,7 +353,7 @@
 
 
 LOCAL(void)
-per_scan_setup (j_compress_ptr cinfo)
+per_scan_setup(j_compress_ptr cinfo)
 /* Do computations that are needed before processing a JPEG scan */
 /* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */
 {
@@ -377,7 +378,7 @@
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
      */
-    tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+    tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
 
@@ -394,11 +395,11 @@
 
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width,
-                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_width,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height,
-                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_height,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
     cinfo->blocks_in_MCU = 0;
 
@@ -410,10 +411,10 @@
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
       compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
       /* Figure number of non-dummy blocks in last MCU column & row */
-      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+      tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
       compptr->last_col_width = tmp;
-      tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+      tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
       if (tmp == 0) tmp = compptr->MCU_height;
       compptr->last_row_height = tmp;
       /* Prepare array describing MCU composition */
@@ -430,8 +431,8 @@
   /* Convert restart specified in rows to actual MCU count. */
   /* Note that count must fit in 16 bits, so we provide limiting. */
   if (cinfo->restart_in_rows > 0) {
-    long nominal = (long) cinfo->restart_in_rows * (long) cinfo->MCUs_per_row;
-    cinfo->restart_interval = (unsigned int) MIN(nominal, 65535L);
+    long nominal = (long)cinfo->restart_in_rows * (long)cinfo->MCUs_per_row;
+    cinfo->restart_interval = (unsigned int)MIN(nominal, 65535L);
   }
 }
 
@@ -445,9 +446,9 @@
  */
 
 METHODDEF(void)
-prepare_for_pass (j_compress_ptr cinfo)
+prepare_for_pass(j_compress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   switch (master->pass_type) {
   case main_pass:
@@ -456,7 +457,7 @@
      */
     select_scan_parameters(cinfo);
     per_scan_setup(cinfo);
-    if (! cinfo->raw_data_in) {
+    if (!cinfo->raw_data_in) {
       (*cinfo->cconvert->start_pass) (cinfo);
       (*cinfo->downsample->start_pass) (cinfo);
       (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
@@ -496,7 +497,7 @@
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
-    if (! cinfo->optimize_coding) {
+    if (!cinfo->optimize_coding) {
       select_scan_parameters(cinfo);
       per_scan_setup(cinfo);
     }
@@ -512,7 +513,7 @@
     ERREXIT(cinfo, JERR_NOT_COMPILED);
   }
 
-  master->pub.is_last_pass = (master->pass_number == master->total_passes-1);
+  master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
 
   /* Set up progress monitor's pass info if present */
   if (cinfo->progress != NULL) {
@@ -533,7 +534,7 @@
  */
 
 METHODDEF(void)
-pass_startup (j_compress_ptr cinfo)
+pass_startup(j_compress_ptr cinfo)
 {
   cinfo->master->call_pass_startup = FALSE; /* reset flag so call only once */
 
@@ -547,9 +548,9 @@
  */
 
 METHODDEF(void)
-finish_pass_master (j_compress_ptr cinfo)
+finish_pass_master(j_compress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   /* The entropy coder always needs an end-of-pass call,
    * either to analyze statistics or to flush its output buffer.
@@ -563,7 +564,7 @@
      * or output of scan 1 (if no optimization).
      */
     master->pass_type = output_pass;
-    if (! cinfo->optimize_coding)
+    if (!cinfo->optimize_coding)
       master->scan_number++;
     break;
   case huff_opt_pass:
@@ -587,14 +588,14 @@
  */
 
 GLOBAL(void)
-jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
+jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
 {
   my_master_ptr master;
 
   master = (my_master_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(my_comp_master));
-  cinfo->master = (struct jpeg_comp_master *) master;
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_comp_master));
+  cinfo->master = (struct jpeg_comp_master *)master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
   master->pub.finish_pass = finish_pass_master;
diff --git a/jcomapi.c b/jcomapi.c
index 6e5bf3d..efbb835 100644
--- a/jcomapi.c
+++ b/jcomapi.c
@@ -29,7 +29,7 @@
  */
 
 GLOBAL(void)
-jpeg_abort (j_common_ptr cinfo)
+jpeg_abort(j_common_ptr cinfo)
 {
   int pool;
 
@@ -40,7 +40,7 @@
   /* Releasing pools in reverse order might help avoid fragmentation
    * with some (brain-damaged) malloc libraries.
    */
-  for (pool = JPOOL_NUMPOOLS-1; pool > JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool > JPOOL_PERMANENT; pool--) {
     (*cinfo->mem->free_pool) (cinfo, pool);
   }
 
@@ -50,7 +50,7 @@
     /* Try to keep application from accessing now-deleted marker list.
      * A bit kludgy to do it here, but this is the most central place.
      */
-    ((j_decompress_ptr) cinfo)->marker_list = NULL;
+    ((j_decompress_ptr)cinfo)->marker_list = NULL;
   } else {
     cinfo->global_state = CSTATE_START;
   }
@@ -69,7 +69,7 @@
  */
 
 GLOBAL(void)
-jpeg_destroy (j_common_ptr cinfo)
+jpeg_destroy(j_common_ptr cinfo)
 {
   /* We need only tell the memory manager to release everything. */
   /* NB: mem pointer is NULL if memory mgr failed to initialize. */
@@ -86,7 +86,7 @@
  */
 
 GLOBAL(JQUANT_TBL *)
-jpeg_alloc_quant_table (j_common_ptr cinfo)
+jpeg_alloc_quant_table(j_common_ptr cinfo)
 {
   JQUANT_TBL *tbl;
 
@@ -98,7 +98,7 @@
 
 
 GLOBAL(JHUFF_TBL *)
-jpeg_alloc_huff_table (j_common_ptr cinfo)
+jpeg_alloc_huff_table(j_common_ptr cinfo)
 {
   JHUFF_TBL *tbl;
 
diff --git a/jconfig.h.in b/jconfig.h.in
index 02c12cc..3a47c18 100644
--- a/jconfig.h.in
+++ b/jconfig.h.in
@@ -1,19 +1,25 @@
 /* Version ID for the JPEG library.
  * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
  */
-#define JPEG_LIB_VERSION  62	/* Version 6b */
+#define JPEG_LIB_VERSION @JPEG_LIB_VERSION@
 
 /* libjpeg-turbo version */
-#define LIBJPEG_TURBO_VERSION 0
+#define LIBJPEG_TURBO_VERSION @VERSION@
 
 /* libjpeg-turbo version in integer form */
-#define LIBJPEG_TURBO_VERSION_NUMBER 0
+#define LIBJPEG_TURBO_VERSION_NUMBER @LIBJPEG_TURBO_VERSION_NUMBER@
 
 /* Support arithmetic encoding */
-#undef C_ARITH_CODING_SUPPORTED
+#cmakedefine C_ARITH_CODING_SUPPORTED
 
 /* Support arithmetic decoding */
-#undef D_ARITH_CODING_SUPPORTED
+#cmakedefine D_ARITH_CODING_SUPPORTED
+
+/* Support in-memory source/destination managers */
+#cmakedefine MEM_SRCDST_SUPPORTED
+
+/* Use accelerated SIMD routines. */
+#cmakedefine WITH_SIMD
 
 /*
  * Define BITS_IN_JSAMPLE as either
@@ -24,50 +30,44 @@
  * We do not support run-time selection of data precision, sorry.
  */
 
-#define BITS_IN_JSAMPLE  8      /* use 8 or 12 */
+#define BITS_IN_JSAMPLE  @BITS_IN_JSAMPLE@      /* use 8 or 12 */
 
 /* Define to 1 if you have the <locale.h> header file. */
-#undef HAVE_LOCALE_H
+#cmakedefine HAVE_LOCALE_H
 
 /* Define to 1 if you have the <stddef.h> header file. */
-#undef HAVE_STDDEF_H
+#cmakedefine HAVE_STDDEF_H
 
 /* Define to 1 if you have the <stdlib.h> header file. */
-#undef HAVE_STDLIB_H
+#cmakedefine HAVE_STDLIB_H
 
-/* Define to 1 if the system has the type `unsigned char'. */
-#undef HAVE_UNSIGNED_CHAR
-
-/* Define to 1 if the system has the type `unsigned short'. */
-#undef HAVE_UNSIGNED_SHORT
-
-/* Compiler does not support pointers to undefined structures. */
-#undef INCOMPLETE_TYPES_BROKEN
-
-/* Support in-memory source/destination managers */
-#undef MEM_SRCDST_SUPPORTED
+/* Define if you need to include <sys/types.h> to get size_t. */
+#cmakedefine NEED_SYS_TYPES_H
 
 /* Define if you have BSD-like bzero and bcopy in <strings.h> rather than
    memset/memcpy in <string.h>. */
-#undef NEED_BSD_STRINGS
+#cmakedefine NEED_BSD_STRINGS
 
-/* Define if you need to include <sys/types.h> to get size_t. */
-#undef NEED_SYS_TYPES_H
+/* Define to 1 if the system has the type `unsigned char'. */
+#cmakedefine HAVE_UNSIGNED_CHAR
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#cmakedefine HAVE_UNSIGNED_SHORT
+
+/* Compiler does not support pointers to undefined structures. */
+#cmakedefine INCOMPLETE_TYPES_BROKEN
 
 /* Define if your (broken) compiler shifts signed values as if they were
    unsigned. */
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-/* Use accelerated SIMD routines. */
-#undef WITH_SIMD
+#cmakedefine RIGHT_SHIFT_IS_UNSIGNED
 
 /* Define to 1 if type `char' is unsigned and you are not using gcc.  */
 #ifndef __CHAR_UNSIGNED__
-# undef __CHAR_UNSIGNED__
+  #cmakedefine __CHAR_UNSIGNED__
 #endif
 
 /* Define to empty if `const' does not conform to ANSI C. */
-#undef const
+/* #undef const */
 
 /* Define to `unsigned int' if <sys/types.h> does not define. */
-#undef size_t
+/* #undef size_t */
diff --git a/jconfigint.h.in b/jconfigint.h.in
index 963e760..835f7c4 100644
--- a/jconfigint.h.in
+++ b/jconfigint.h.in
@@ -1,17 +1,21 @@
 /* libjpeg-turbo build number */
-#undef BUILD
+#define BUILD "@BUILD@"
 
 /* Compiler's inline keyword */
 #undef inline
 
 /* How to obtain function inlining. */
-#undef INLINE
+#define INLINE @INLINE@
 
 /* Define to the full name of this package. */
-#undef PACKAGE_NAME
+#define PACKAGE_NAME "@CMAKE_PROJECT_NAME@"
 
 /* Version number of package */
-#undef VERSION
+#define VERSION "@VERSION@"
+
+#ifndef _WIN32
 
 /* The size of `size_t', as computed by sizeof. */
-#undef SIZEOF_SIZE_T
+#define SIZEOF_SIZE_T @SIZE_T@
+
+#endif
diff --git a/jcparam.c b/jcparam.c
index 18b2d48..bcea927 100644
--- a/jcparam.c
+++ b/jcparam.c
@@ -25,9 +25,9 @@
  */
 
 GLOBAL(void)
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-                      const unsigned int *basic_table,
-                      int scale_factor, boolean force_baseline)
+jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                     const unsigned int *basic_table, int scale_factor,
+                     boolean force_baseline)
 /* Define a quantization table equal to the basic_table times
  * a scale factor (given as a percentage).
  * If force_baseline is TRUE, the computed quantization table entries
@@ -45,19 +45,19 @@
   if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS)
     ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl);
 
-  qtblptr = & cinfo->quant_tbl_ptrs[which_tbl];
+  qtblptr = &cinfo->quant_tbl_ptrs[which_tbl];
 
   if (*qtblptr == NULL)
-    *qtblptr = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+    *qtblptr = jpeg_alloc_quant_table((j_common_ptr)cinfo);
 
   for (i = 0; i < DCTSIZE2; i++) {
-    temp = ((long) basic_table[i] * scale_factor + 50L) / 100L;
+    temp = ((long)basic_table[i] * scale_factor + 50L) / 100L;
     /* limit the values to the valid range */
     if (temp <= 0L) temp = 1L;
     if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */
     if (force_baseline && temp > 255L)
       temp = 255L;              /* limit to baseline range if requested */
-    (*qtblptr)->quantval[i] = (UINT16) temp;
+    (*qtblptr)->quantval[i] = (UINT16)temp;
   }
 
   /* Initialize sent_table FALSE so table will be written to JPEG file. */
@@ -93,7 +93,7 @@
 
 #if JPEG_LIB_VERSION >= 70
 GLOBAL(void)
-jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables
  * and straight percentage-scaling quality scales.
  * This entry point allows different scalings for luminance and chrominance.
@@ -109,8 +109,8 @@
 
 
 GLOBAL(void)
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-                         boolean force_baseline)
+jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                        boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables
  * and a straight percentage-scaling quality scale.  In most cases it's better
  * to use jpeg_set_quality (below); this entry point is provided for
@@ -126,7 +126,7 @@
 
 
 GLOBAL(int)
-jpeg_quality_scaling (int quality)
+jpeg_quality_scaling(int quality)
 /* Convert a user-specified quality rating to a percentage scaling factor
  * for an underlying quantization table, using our recommended scaling curve.
  * The input 'quality' factor should be 0 (terrible) to 100 (very good).
@@ -145,14 +145,14 @@
   if (quality < 50)
     quality = 5000 / quality;
   else
-    quality = 200 - quality*2;
+    quality = 200 - quality * 2;
 
   return quality;
 }
 
 
 GLOBAL(void)
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+jpeg_set_quality(j_compress_ptr cinfo, int quality, boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables.
  * This is the standard quality-adjusting entry point for typical user
  * interfaces; only those who want detailed control over quantization tables
@@ -178,7 +178,7 @@
  */
 
 GLOBAL(void)
-jpeg_set_defaults (j_compress_ptr cinfo)
+jpeg_set_defaults(j_compress_ptr cinfo)
 {
   int i;
 
@@ -192,7 +192,7 @@
    */
   if (cinfo->comp_info == NULL)
     cinfo->comp_info = (jpeg_component_info *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   MAX_COMPONENTS * sizeof(jpeg_component_info));
 
   /* Initialize everything not dependent on the color space */
@@ -205,7 +205,7 @@
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
   /* Set up two Huffman tables */
-  std_huff_tables((j_common_ptr) cinfo);
+  std_huff_tables((j_common_ptr)cinfo);
 
   /* Initialize default arithmetic coding conditioning */
   for (i = 0; i < NUM_ARITH_TBLS; i++) {
@@ -278,7 +278,7 @@
  */
 
 GLOBAL(void)
-jpeg_default_colorspace (j_compress_ptr cinfo)
+jpeg_default_colorspace(j_compress_ptr cinfo)
 {
   switch (cinfo->in_color_space) {
   case JCS_GRAYSCALE:
@@ -320,12 +320,12 @@
  */
 
 GLOBAL(void)
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
 {
   jpeg_component_info *compptr;
   int ci;
 
-#define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl)  \
+#define SET_COMP(index, id, hsamp, vsamp, quant, dctbl, actbl) \
   (compptr = &cinfo->comp_info[index], \
    compptr->component_id = (id), \
    compptr->h_samp_factor = (hsamp), \
@@ -352,39 +352,39 @@
     cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
     cinfo->num_components = 1;
     /* JFIF specifies component ID 1 */
-    SET_COMP(0, 1, 1,1, 0, 0,0);
+    SET_COMP(0, 1, 1, 1, 0, 0, 0);
     break;
   case JCS_RGB:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
     cinfo->num_components = 3;
-    SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0);
-    SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0);
+    SET_COMP(0, 0x52 /* 'R' */, 1, 1, 0, 0, 0);
+    SET_COMP(1, 0x47 /* 'G' */, 1, 1, 0, 0, 0);
+    SET_COMP(2, 0x42 /* 'B' */, 1, 1, 0, 0, 0);
     break;
   case JCS_YCbCr:
     cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
     cinfo->num_components = 3;
     /* JFIF specifies component IDs 1,2,3 */
     /* We default to 2x2 subsamples of chrominance */
-    SET_COMP(0, 1, 2,2, 0, 0,0);
-    SET_COMP(1, 2, 1,1, 1, 1,1);
-    SET_COMP(2, 3, 1,1, 1, 1,1);
+    SET_COMP(0, 1, 2, 2, 0, 0, 0);
+    SET_COMP(1, 2, 1, 1, 1, 1, 1);
+    SET_COMP(2, 3, 1, 1, 1, 1, 1);
     break;
   case JCS_CMYK:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */
     cinfo->num_components = 4;
-    SET_COMP(0, 0x43 /* 'C' */, 1,1, 0, 0,0);
-    SET_COMP(1, 0x4D /* 'M' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x59 /* 'Y' */, 1,1, 0, 0,0);
-    SET_COMP(3, 0x4B /* 'K' */, 1,1, 0, 0,0);
+    SET_COMP(0, 0x43 /* 'C' */, 1, 1, 0, 0, 0);
+    SET_COMP(1, 0x4D /* 'M' */, 1, 1, 0, 0, 0);
+    SET_COMP(2, 0x59 /* 'Y' */, 1, 1, 0, 0, 0);
+    SET_COMP(3, 0x4B /* 'K' */, 1, 1, 0, 0, 0);
     break;
   case JCS_YCCK:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */
     cinfo->num_components = 4;
-    SET_COMP(0, 1, 2,2, 0, 0,0);
-    SET_COMP(1, 2, 1,1, 1, 1,1);
-    SET_COMP(2, 3, 1,1, 1, 1,1);
-    SET_COMP(3, 4, 2,2, 0, 0,0);
+    SET_COMP(0, 1, 2, 2, 0, 0, 0);
+    SET_COMP(1, 2, 1, 1, 1, 1, 1);
+    SET_COMP(2, 3, 1, 1, 1, 1, 1);
+    SET_COMP(3, 4, 2, 2, 0, 0, 0);
     break;
   case JCS_UNKNOWN:
     cinfo->num_components = cinfo->input_components;
@@ -392,7 +392,7 @@
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
                MAX_COMPONENTS);
     for (ci = 0; ci < cinfo->num_components; ci++) {
-      SET_COMP(ci, ci, 1,1, 0, 0,0);
+      SET_COMP(ci, ci, 1, 1, 0, 0, 0);
     }
     break;
   default:
@@ -404,8 +404,7 @@
 #ifdef C_PROGRESSIVE_SUPPORTED
 
 LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info *scanptr, int ci,
-             int Ss, int Se, int Ah, int Al)
+fill_a_scan(jpeg_scan_info *scanptr, int ci, int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for specified component */
 {
   scanptr->comps_in_scan = 1;
@@ -419,8 +418,7 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info *scanptr, int ncomps,
-            int Ss, int Se, int Ah, int Al)
+fill_scans(jpeg_scan_info *scanptr, int ncomps, int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for each component */
 {
   int ci;
@@ -438,7 +436,7 @@
 }
 
 LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans(jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
 /* Support routine: generate interleaved DC scan if possible, else N scans */
 {
   int ci;
@@ -466,7 +464,7 @@
  */
 
 GLOBAL(void)
-jpeg_simple_progression (j_compress_ptr cinfo)
+jpeg_simple_progression(j_compress_ptr cinfo)
 {
   int ncomps = cinfo->num_components;
   int nscans;
@@ -498,7 +496,7 @@
   if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) {
     cinfo->script_space_size = MAX(nscans, 10);
     cinfo->script_space = (jpeg_scan_info *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                         cinfo->script_space_size * sizeof(jpeg_scan_info));
   }
   scanptr = cinfo->script_space;
diff --git a/jcphuff.c b/jcphuff.c
index 046e2e1..68c8858 100644
--- a/jcphuff.c
+++ b/jcphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -19,9 +19,41 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jchuff.h"             /* Declarations shared with jchuff.c */
+#include <limits.h>
 
 #ifdef C_PROGRESSIVE_SUPPORTED
 
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table.  This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.)  This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on ARM processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it.  When building for ARMv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
+#if !defined __thumb__ || defined __thumb2__
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
+
+
 /* Expanded entropy encoder object for progressive Huffman encoding. */
 
 typedef struct {
@@ -79,26 +111,26 @@
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define ISHIFT_TEMPS    int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-        ((ishift_temp = (x)) < 0 ? \
-         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-         (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
-METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
 
 
 /*
@@ -106,9 +138,9 @@
  */
 
 METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
   jpeg_component_info *compptr;
@@ -134,7 +166,7 @@
       /* AC refinement needs a correction bit buffer */
       if (entropy->bit_buffer == NULL)
         entropy->bit_buffer = (char *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       MAX_CORR_BITS * sizeof(char));
     }
   }
@@ -167,14 +199,14 @@
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->count_ptrs[tbl] == NULL)
         entropy->count_ptrs[tbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
       MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
     } else {
       /* Compute derived values for Huffman table */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-                              & entropy->derived_tbls[tbl]);
+                              &entropy->derived_tbls[tbl]);
     }
   }
 
@@ -198,19 +230,20 @@
  */
 
 /* Emit a byte */
-#define emit_byte(entropy,val)  \
-        { *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-          if (--(entropy)->free_in_buffer == 0)  \
-            dump_buffer(entropy); }
+#define emit_byte(entropy, val) { \
+  *(entropy)->next_output_byte++ = (JOCTET)(val); \
+  if (--(entropy)->free_in_buffer == 0) \
+    dump_buffer(entropy); \
+}
 
 
 LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
+dump_buffer(phuff_entropy_ptr entropy)
 /* Empty the output buffer; we do not support suspension in this module. */
 {
   struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
 
-  if (! (*dest->empty_output_buffer) (entropy->cinfo))
+  if (!(*dest->empty_output_buffer) (entropy->cinfo))
     ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
   /* After a successful buffer dump, must reset buffer pointers */
   entropy->next_output_byte = dest->next_output_byte;
@@ -227,11 +260,11 @@
  */
 
 LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
 /* Emit some bits, unless we are in gather mode */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
-  register size_t put_buffer = (size_t) code;
+  register size_t put_buffer = (size_t)code;
   register int put_bits = entropy->put_bits;
 
   /* if size is 0, caller used an invalid Huffman table entry */
@@ -241,7 +274,7 @@
   if (entropy->gather_statistics)
     return;                     /* do nothing if we're only getting stats */
 
-  put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+  put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
 
   put_bits += size;             /* new number of bits in buffer */
 
@@ -250,7 +283,7 @@
   put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
 
   while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
+    int c = (int)((put_buffer >> 16) & 0xFF);
 
     emit_byte(entropy, c);
     if (c == 0xFF) {            /* need to stuff a zero byte? */
@@ -266,7 +299,7 @@
 
 
 LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
+flush_bits(phuff_entropy_ptr entropy)
 {
   emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
   entropy->put_buffer = 0;     /* and reset bit-buffer to empty */
@@ -279,7 +312,7 @@
  */
 
 LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
 {
   if (entropy->gather_statistics)
     entropy->count_ptrs[tbl_no][symbol]++;
@@ -295,14 +328,14 @@
  */
 
 LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
-                    unsigned int nbits)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+                   unsigned int nbits)
 {
   if (entropy->gather_statistics)
     return;                     /* no real work */
 
   while (nbits > 0) {
-    emit_bits(entropy, (unsigned int) (*bufstart), 1);
+    emit_bits(entropy, (unsigned int)(*bufstart), 1);
     bufstart++;
     nbits--;
   }
@@ -314,15 +347,13 @@
  */
 
 LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
+emit_eobrun(phuff_entropy_ptr entropy)
 {
   register int temp, nbits;
 
   if (entropy->EOBRUN > 0) {    /* if there is any pending EOBRUN */
     temp = entropy->EOBRUN;
-    nbits = 0;
-    while ((temp >>= 1))
-      nbits++;
+    nbits = JPEG_NBITS_NONZERO(temp) - 1;
     /* safety check: shouldn't happen given limited correction-bit buffer */
     if (nbits > 14)
       ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
@@ -345,13 +376,13 @@
  */
 
 LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
 {
   int ci;
 
   emit_eobrun(entropy);
 
-  if (! entropy->gather_statistics) {
+  if (!entropy->gather_statistics) {
     flush_bits(entropy);
     emit_byte(entropy, 0xFF);
     emit_byte(entropy, JPEG_RST0 + restart_num);
@@ -375,10 +406,10 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  register int temp, temp2, temp3;
   register int nbits;
   int blkn, ci;
   int Al = cinfo->Al;
@@ -403,31 +434,31 @@
     /* Compute the DC value after the required point transform by Al.
      * This is simply an arithmetic right shift.
      */
-    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+    temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
 
     /* DC differences are figured on the point-transformed values. */
     temp = temp2 - entropy->last_dc_val[ci];
     entropy->last_dc_val[ci] = temp2;
 
     /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
-    if (temp < 0) {
-      temp = -temp;             /* temp is abs value of input */
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
-      /* This code assumes we are on a two's complement machine */
-      temp2--;
-    }
+
+    /* This is a well-known technique for obtaining the absolute value without
+     * a branch.  It is derived from an assembly language technique presented
+     * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+     * 1997 by Agner Fog.
+     */
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
+    /* For a negative input, want temp2 = bitwise complement of abs(input) */
+    temp2 = temp ^ temp3;
 
     /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
+    nbits = JPEG_NBITS(temp);
     /* Check for out-of-range coefficient values.
      * Since we're encoding a difference, the range limit is twice as much.
      */
-    if (nbits > MAX_COEF_BITS+1)
+    if (nbits > MAX_COEF_BITS + 1)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
     /* Count/emit the Huffman-coded symbol for the number of bits */
@@ -436,7 +467,7 @@
     /* Emit that number of bits of the value, if positive, */
     /* or the complement of its magnitude, if negative. */
     if (nbits)                  /* emit_bits rejects calls with size 0 */
-      emit_bits(entropy, (unsigned int) temp2, nbits);
+      emit_bits(entropy, (unsigned int)temp2, nbits);
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -462,10 +493,10 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  register int temp, temp2, temp3;
   register int nbits;
   register int r, k;
   int Se = cinfo->Se;
@@ -497,15 +528,12 @@
      * in C, we shift after obtaining the absolute value; so the code is
      * interwoven with finding the abs value (temp) and output bits (temp2).
      */
-    if (temp < 0) {
-      temp = -temp;             /* temp is abs value of input */
-      temp >>= Al;              /* apply the point transform */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-      temp2 = ~temp;
-    } else {
-      temp >>= Al;              /* apply the point transform */
-      temp2 = temp;
-    }
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
+    temp >>= Al;                /* apply the point transform */
+    /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
+    temp2 = temp ^ temp3;
     /* Watch out for case that nonzero coef is zero after point transform */
     if (temp == 0) {
       r++;
@@ -522,9 +550,7 @@
     }
 
     /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;                  /* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
+    nbits = JPEG_NBITS_NONZERO(temp);  /* there must be at least one 1 bit */
     /* Check for out-of-range coefficient values */
     if (nbits > MAX_COEF_BITS)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
@@ -534,7 +560,7 @@
 
     /* Emit that number of bits of the value, if positive, */
     /* or the complement of its magnitude, if negative. */
-    emit_bits(entropy, (unsigned int) temp2, nbits);
+    emit_bits(entropy, (unsigned int)temp2, nbits);
 
     r = 0;                      /* reset zero run length */
   }
@@ -569,9 +595,9 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   register int temp;
   int blkn;
   int Al = cinfo->Al;
@@ -591,7 +617,7 @@
 
     /* We simply emit the Al'th bit of the DC coefficient value. */
     temp = (*block)[0];
-    emit_bits(entropy, (unsigned int) (temp >> Al), 1);
+    emit_bits(entropy, (unsigned int)(temp >> Al), 1);
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -616,10 +642,10 @@
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  register int temp, temp3;
   register int r, k;
   int EOB;
   char *BR_buffer;
@@ -650,8 +676,9 @@
      * is an integer division with rounding towards 0.  To do this portably
      * in C, we shift after obtaining the absolute value.
      */
-    if (temp < 0)
-      temp = -temp;             /* temp is abs value of input */
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
     temp >>= Al;                /* apply the point transform */
     absvalues[k] = temp;        /* save abs value for main pass */
     if (temp == 1)
@@ -690,7 +717,7 @@
      */
     if (temp > 1) {
       /* The correction bit is the next bit of the absolute value. */
-      BR_buffer[BR++] = (char) (temp & 1);
+      BR_buffer[BR++] = (char)(temp & 1);
       continue;
     }
 
@@ -702,7 +729,7 @@
 
     /* Emit output bit for newly-nonzero coef */
     temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
-    emit_bits(entropy, (unsigned int) temp, 1);
+    emit_bits(entropy, (unsigned int)temp, 1);
 
     /* Emit buffered correction bits that must be associated with this code */
     emit_buffered_bits(entropy, BR_buffer, BR);
@@ -718,7 +745,8 @@
      * 1. overflow of the EOB counter;
      * 2. overflow of the correction bit buffer during the next MCU.
      */
-    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+    if (entropy->EOBRUN == 0x7FFF ||
+        entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
       emit_eobrun(entropy);
   }
 
@@ -744,9 +772,9 @@
  */
 
 METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
+finish_pass_phuff(j_compress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -765,9 +793,9 @@
  */
 
 METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
   jpeg_component_info *compptr;
@@ -793,13 +821,13 @@
     } else {
       tbl = compptr->ac_tbl_no;
     }
-    if (! did[tbl]) {
+    if (!did[tbl]) {
       if (is_DC_band)
-        htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+        htblptr = &cinfo->dc_huff_tbl_ptrs[tbl];
       else
-        htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+        htblptr = &cinfo->ac_huff_tbl_ptrs[tbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
       did[tbl] = TRUE;
     }
@@ -812,15 +840,15 @@
  */
 
 GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
+jinit_phuff_encoder(j_compress_ptr cinfo)
 {
   phuff_entropy_ptr entropy;
   int i;
 
   entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(phuff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass_phuff;
 
   /* Mark tables unallocated */
diff --git a/jcprepct.c b/jcprepct.c
index e72ebd8..d59713a 100644
--- a/jcprepct.c
+++ b/jcprepct.c
@@ -78,9 +78,9 @@
  */
 
 METHODDEF(void)
-start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
 
   if (pass_mode != JBUF_PASS_THRU)
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -106,14 +106,14 @@
  */
 
 LOCAL(void)
-expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
-                    int input_rows, int output_rows)
+expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+                   int output_rows)
 {
   register int row;
 
   for (row = input_rows; row < output_rows; row++) {
-    jcopy_sample_rows(image_data, input_rows-1, image_data, row,
-                      1, num_cols);
+    jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
+                      num_cols);
   }
 }
 
@@ -128,13 +128,12 @@
  */
 
 METHODDEF(void)
-pre_process_data (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                  JDIMENSION in_rows_avail,
-                  JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-                  JDIMENSION out_row_groups_avail)
+pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                 JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                 JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                 JDIMENSION out_row_groups_avail)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int numrows, ci;
   JDIMENSION inrows;
   jpeg_component_info *compptr;
@@ -144,10 +143,10 @@
     /* Do color conversion to fill the conversion buffer. */
     inrows = in_rows_avail - *in_row_ctr;
     numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
-    numrows = (int) MIN((JDIMENSION) numrows, inrows);
+    numrows = (int)MIN((JDIMENSION)numrows, inrows);
     (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
                                        prep->color_buf,
-                                       (JDIMENSION) prep->next_buf_row,
+                                       (JDIMENSION)prep->next_buf_row,
                                        numrows);
     *in_row_ctr += numrows;
     prep->next_buf_row += numrows;
@@ -164,7 +163,7 @@
     /* If we've filled the conversion buffer, empty it. */
     if (prep->next_buf_row == cinfo->max_v_samp_factor) {
       (*cinfo->downsample->downsample) (cinfo,
-                                        prep->color_buf, (JDIMENSION) 0,
+                                        prep->color_buf, (JDIMENSION)0,
                                         output_buf, *out_row_group_ctr);
       prep->next_buf_row = 0;
       (*out_row_group_ctr)++;
@@ -172,14 +171,12 @@
     /* If at bottom of image, pad the output to a full iMCU height.
      * Note we assume the caller is providing a one-iMCU-height output buffer!
      */
-    if (prep->rows_to_go == 0 &&
-        *out_row_group_ctr < out_row_groups_avail) {
+    if (prep->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
            ci++, compptr++) {
-        expand_bottom_edge(output_buf[ci],
-                           compptr->width_in_blocks * DCTSIZE,
-                           (int) (*out_row_group_ctr * compptr->v_samp_factor),
-                           (int) (out_row_groups_avail * compptr->v_samp_factor));
+        expand_bottom_edge(output_buf[ci], compptr->width_in_blocks * DCTSIZE,
+                           (int)(*out_row_group_ctr * compptr->v_samp_factor),
+                           (int)(out_row_groups_avail * compptr->v_samp_factor));
       }
       *out_row_group_ctr = out_row_groups_avail;
       break;                    /* can exit outer loop without test */
@@ -195,13 +192,12 @@
  */
 
 METHODDEF(void)
-pre_process_context (j_compress_ptr cinfo,
-                     JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                     JDIMENSION in_rows_avail,
-                     JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-                     JDIMENSION out_row_groups_avail)
+pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                    JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                    JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                    JDIMENSION out_row_groups_avail)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int numrows, ci;
   int buf_height = cinfo->max_v_samp_factor * 3;
   JDIMENSION inrows;
@@ -211,19 +207,18 @@
       /* Do color conversion to fill the conversion buffer. */
       inrows = in_rows_avail - *in_row_ctr;
       numrows = prep->next_buf_stop - prep->next_buf_row;
-      numrows = (int) MIN((JDIMENSION) numrows, inrows);
+      numrows = (int)MIN((JDIMENSION)numrows, inrows);
       (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
                                          prep->color_buf,
-                                         (JDIMENSION) prep->next_buf_row,
+                                         (JDIMENSION)prep->next_buf_row,
                                          numrows);
       /* Pad at top of image, if first time through */
       if (prep->rows_to_go == cinfo->image_height) {
         for (ci = 0; ci < cinfo->num_components; ci++) {
           int row;
           for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
-            jcopy_sample_rows(prep->color_buf[ci], 0,
-                              prep->color_buf[ci], -row,
-                              1, cinfo->image_width);
+            jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+                              -row, 1, cinfo->image_width);
           }
         }
       }
@@ -245,9 +240,8 @@
     }
     /* If we've gotten enough data, downsample a row group. */
     if (prep->next_buf_row == prep->next_buf_stop) {
-      (*cinfo->downsample->downsample) (cinfo,
-                                        prep->color_buf,
-                                        (JDIMENSION) prep->this_row_group,
+      (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
+                                        (JDIMENSION)prep->this_row_group,
                                         output_buf, *out_row_group_ctr);
       (*out_row_group_ctr)++;
       /* Advance pointers with wraparound as necessary. */
@@ -267,9 +261,9 @@
  */
 
 LOCAL(void)
-create_context_buffer (j_compress_ptr cinfo)
+create_context_buffer(j_compress_ptr cinfo)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int rgroup_height = cinfo->max_v_samp_factor;
   int ci, i;
   jpeg_component_info *compptr;
@@ -279,7 +273,7 @@
    * we need five row groups' worth of pointers for each component.
    */
   fake_buffer = (JSAMPARRAY)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (cinfo->num_components * 5 * rgroup_height) *
                                 sizeof(JSAMPROW));
 
@@ -290,10 +284,10 @@
      * horizontally within the buffer, if it so chooses.
      */
     true_buffer = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
-       (JDIMENSION) (3 * rgroup_height));
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+                     cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+       (JDIMENSION)(3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
     MEMCOPY(fake_buffer + rgroup_height, true_buffer,
             3 * rgroup_height * sizeof(JSAMPROW));
@@ -315,7 +309,7 @@
  */
 
 GLOBAL(void)
-jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_prep_ptr prep;
   int ci;
@@ -325,9 +319,9 @@
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 
   prep = (my_prep_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_prep_controller));
-  cinfo->prep = (struct jpeg_c_prep_controller *) prep;
+  cinfo->prep = (struct jpeg_c_prep_controller *)prep;
   prep->pub.start_pass = start_pass_prep;
 
   /* Allocate the color conversion buffer.
@@ -348,10 +342,10 @@
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-         (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-                        cinfo->max_h_samp_factor) / compptr->h_samp_factor),
-         (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
+         (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+                       cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+         (JDIMENSION)cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/jcsample.c b/jcsample.c
index c4b4991..a36f5bb 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -79,7 +79,7 @@
  */
 
 METHODDEF(void)
-start_pass_downsample (j_compress_ptr cinfo)
+start_pass_downsample(j_compress_ptr cinfo)
 {
   /* no work for now */
 }
@@ -91,14 +91,14 @@
  */
 
 LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
-                   JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
   register int count;
   int row;
-  int numcols = (int) (output_cols - input_cols);
+  int numcols = (int)(output_cols - input_cols);
 
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
@@ -118,11 +118,11 @@
  */
 
 METHODDEF(void)
-sep_downsample (j_compress_ptr cinfo,
-                JSAMPIMAGE input_buf, JDIMENSION in_row_index,
-                JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
+sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+               JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+               JDIMENSION out_row_group_index)
 {
-  my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
+  my_downsample_ptr downsample = (my_downsample_ptr)cinfo->downsample;
   int ci;
   jpeg_component_info *compptr;
   JSAMPARRAY in_ptr, out_ptr;
@@ -144,8 +144,8 @@
  */
 
 METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                JSAMPARRAY input_data, JSAMPARRAY output_data)
+int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;  /* outcol_h == outcol*h_expand */
@@ -156,14 +156,14 @@
   h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
   v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
   numpix = h_expand * v_expand;
-  numpix2 = numpix/2;
+  numpix2 = numpix / 2;
 
   /* Expand input data enough to let all the output samples be generated
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * h_expand);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * h_expand);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
@@ -172,12 +172,12 @@
          outcol++, outcol_h += h_expand) {
       outvalue = 0;
       for (v = 0; v < v_expand; v++) {
-        inptr = input_data[inrow+v] + outcol_h;
+        inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG) GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)GETJSAMPLE(*inptr++);
         }
       }
-      *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
+      *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
     }
     inrow += v_expand;
   }
@@ -191,15 +191,15 @@
  */
 
 METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   /* Copy the data */
-  jcopy_sample_rows(input_data, 0, output_data, 0,
-                    cinfo->max_v_samp_factor, cinfo->image_width);
+  jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
+                    cinfo->image_width);
   /* Edge-expand */
-  expand_right_edge(output_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+  expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    compptr->width_in_blocks * DCTSIZE);
 }
 
 
@@ -216,8 +216,8 @@
  */
 
 METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION outcol;
@@ -229,16 +229,16 @@
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * 2);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * 2);
 
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
-                              + bias) >> 1);
+      *outptr++ =
+        (JSAMPLE)((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -253,8 +253,8 @@
  */
 
 METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION outcol;
@@ -266,19 +266,19 @@
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * 2);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * 2);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow+1];
+    inptr1 = input_data[inrow + 1];
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                              GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
-                              + bias) >> 2);
+      *outptr++ =
+        (JSAMPLE)((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
+                   GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2; inptr1 += 2;
     }
@@ -296,8 +296,8 @@
  */
 
 METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                        JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION colctr;
@@ -332,9 +332,9 @@
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow+1];
-    above_ptr = input_data[inrow-1];
-    below_ptr = input_data[inrow+2];
+    inptr1 = input_data[inrow + 1];
+    above_ptr = input_data[inrow - 1];
+    below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
     membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
@@ -347,7 +347,7 @@
     neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
@@ -367,7 +367,7 @@
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
-      *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
+      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
       inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
     }
 
@@ -382,7 +382,7 @@
     neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
                 GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
     inrow += 2;
   }
@@ -396,8 +396,8 @@
  */
 
 METHODDEF(void)
-fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                            JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                           JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION colctr;
@@ -425,8 +425,8 @@
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr = input_data[outrow];
-    above_ptr = input_data[outrow-1];
-    below_ptr = input_data[outrow+1];
+    above_ptr = input_data[outrow - 1];
+    below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
     colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
@@ -436,7 +436,7 @@
                  GETJSAMPLE(*inptr);
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum; colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
@@ -446,7 +446,7 @@
                    GETJSAMPLE(*inptr);
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
-      *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
+      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
       lastcolsum = colsum; colsum = nextcolsum;
     }
 
@@ -454,7 +454,7 @@
     membersum = GETJSAMPLE(*inptr);
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
   }
 }
@@ -468,7 +468,7 @@
  */
 
 GLOBAL(void)
-jinit_downsampler (j_compress_ptr cinfo)
+jinit_downsampler(j_compress_ptr cinfo)
 {
   my_downsample_ptr downsample;
   int ci;
@@ -476,9 +476,9 @@
   boolean smoothok = TRUE;
 
   downsample = (my_downsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_downsampler));
-  cinfo->downsample = (struct jpeg_downsampler *) downsample;
+  cinfo->downsample = (struct jpeg_downsampler *)downsample;
   downsample->pub.start_pass = start_pass_downsample;
   downsample->pub.downsample = sep_downsample;
   downsample->pub.need_context_rows = FALSE;
diff --git a/jcstest.c b/jcstest.c
index 11883b5..1c8b7c8 100644
--- a/jcstest.c
+++ b/jcstest.c
@@ -51,13 +51,13 @@
 static void my_error_exit(j_common_ptr cinfo)
 {
   error_mgr *myerr = (error_mgr *)cinfo->err;
-  (*cinfo->err->output_message)(cinfo);
+  (*cinfo->err->output_message) (cinfo);
   longjmp(myerr->jb, 1);
 }
 
 static void my_output_message(j_common_ptr cinfo)
 {
-  (*cinfo->err->format_message)(cinfo, lasterror);
+  (*cinfo->err->format_message) (cinfo, lasterror);
 }
 
 int main(void)
@@ -67,11 +67,11 @@
   error_mgr jerr;
 
   printf("libjpeg-turbo colorspace extensions:\n");
-  #if JCS_EXTENSIONS
+#if JCS_EXTENSIONS
   printf("  Present at compile time\n");
-  #else
+#else
   printf("  Not present at compile time\n");
-  #endif
+#endif
 
   cinfo.err = jpeg_std_error(&jerr.pub);
   jerr.pub.error_exit = my_error_exit;
@@ -90,7 +90,7 @@
   jpeg_default_colorspace(&cinfo);
   jcs_valid = 1;
 
-  done:
+done:
   if (jcs_valid)
     printf("  Working properly\n");
   else
@@ -98,11 +98,11 @@
            lasterror);
 
   printf("libjpeg-turbo alpha colorspace extensions:\n");
-  #if JCS_ALPHA_EXTENSIONS
+#if JCS_ALPHA_EXTENSIONS
   printf("  Present at compile time\n");
-  #else
+#else
   printf("  Not present at compile time\n");
-  #endif
+#endif
 
   if (setjmp(jerr.jb)) {
     /* this will execute if libjpeg has an error */
@@ -114,7 +114,7 @@
   jpeg_default_colorspace(&cinfo);
   jcs_alpha_valid = 1;
 
-  done2:
+done2:
   if (jcs_alpha_valid)
     printf("  Working properly\n");
   else
diff --git a/jctrans.c b/jctrans.c
index 6f16b05..ce70a30 100644
--- a/jctrans.c
+++ b/jctrans.c
@@ -20,10 +20,10 @@
 
 
 /* Forward declarations */
-LOCAL(void) transencode_master_selection
-        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
-LOCAL(void) transencode_coef_controller
-        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_master_selection(j_compress_ptr cinfo,
+                                         jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
+                                        jvirt_barray_ptr *coef_arrays);
 
 
 /*
@@ -39,14 +39,14 @@
  */
 
 GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
+jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   /* Mark all tables to be written */
   jpeg_suppress_tables(cinfo, FALSE);
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Perform master selection of active modules */
   transencode_master_selection(cinfo, coef_arrays);
@@ -64,8 +64,7 @@
  */
 
 GLOBAL(void)
-jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
-                               j_compress_ptr dstinfo)
+jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
 {
   JQUANT_TBL **qtblptr;
   jpeg_component_info *incomp, *outcomp;
@@ -97,11 +96,10 @@
   /* Copy the source's quantization tables. */
   for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
     if (srcinfo->quant_tbl_ptrs[tblno] != NULL) {
-      qtblptr = & dstinfo->quant_tbl_ptrs[tblno];
+      qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
       if (*qtblptr == NULL)
-        *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
-      MEMCOPY((*qtblptr)->quantval,
-              srcinfo->quant_tbl_ptrs[tblno]->quantval,
+        *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
+      MEMCOPY((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
               sizeof((*qtblptr)->quantval));
       (*qtblptr)->sent_table = FALSE;
     }
@@ -165,8 +163,8 @@
  */
 
 LOCAL(void)
-transencode_master_selection (j_compress_ptr cinfo,
-                              jvirt_barray_ptr *coef_arrays)
+transencode_master_selection(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays)
 {
   /* Although we don't actually use input_components for transcoding,
    * jcmaster.c's initial_setup will complain if input_components is 0.
@@ -199,7 +197,7 @@
   jinit_marker_writer(cinfo);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Write the datastream header (SOI, JFIF) immediately.
    * Frame and scan headers are postponed till later.
@@ -238,10 +236,10 @@
 
 
 LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -250,7 +248,7 @@
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -266,9 +264,9 @@
  */
 
 METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   if (pass_mode != JBUF_CRANK_DEST)
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -289,9 +287,9 @@
  */
 
 METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -306,9 +304,9 @@
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
   }
 
   /* Loop to process one whole iMCU row */
@@ -321,13 +319,13 @@
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
-        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                : compptr->last_col_width;
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+                                                  compptr->last_col_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
           if (coef->iMCU_row_num < last_iMCU_row ||
-              yindex+yoffset < compptr->last_row_height) {
+              yindex + yoffset < compptr->last_row_height) {
             /* Fill in pointers to real blocks in this row */
-            buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+            buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
             for (xindex = 0; xindex < blockcnt; xindex++)
               MCU_buffer[blkn++] = buffer_ptr++;
           } else {
@@ -342,13 +340,13 @@
            */
           for (; xindex < compptr->MCU_width; xindex++) {
             MCU_buffer[blkn] = coef->dummy_buffer[blkn];
-            MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
+            MCU_buffer[blkn][0][0] = MCU_buffer[blkn - 1][0][0];
             blkn++;
           }
         }
       }
       /* Try to write the MCU. */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -374,17 +372,17 @@
  */
 
 LOCAL(void)
-transencode_coef_controller (j_compress_ptr cinfo,
-                             jvirt_barray_ptr *coef_arrays)
+transencode_coef_controller(j_compress_ptr cinfo,
+                            jvirt_barray_ptr *coef_arrays)
 {
   my_coef_ptr coef;
   JBLOCKROW buffer;
   int i;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_c_coef_controller *)coef;
   coef->pub.start_pass = start_pass_coef;
   coef->pub.compress_data = compress_output;
 
@@ -393,9 +391,9 @@
 
   /* Allocate and pre-zero space for dummy DCT blocks. */
   buffer = (JBLOCKROW)
-    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
-  jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+  jzero_far((void *)buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
   for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
     coef->dummy_buffer[i] = buffer + i;
   }
diff --git a/jdapimin.c b/jdapimin.c
index f80a146..21a41d2 100644
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -31,7 +31,7 @@
  */
 
 GLOBAL(void)
-jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
 {
   int i;
 
@@ -41,7 +41,7 @@
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
   if (structsize != sizeof(struct jpeg_decompress_struct))
     ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
-             (int) sizeof(struct jpeg_decompress_struct), (int) structsize);
+             (int)sizeof(struct jpeg_decompress_struct), (int)structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -50,8 +50,8 @@
    * complain here.
    */
   {
-    struct jpeg_error_mgr * err = cinfo->err;
-    void * client_data = cinfo->client_data; /* ignore Purify complaint here */
+    struct jpeg_error_mgr *err = cinfo->err;
+    void *client_data = cinfo->client_data; /* ignore Purify complaint here */
     MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
@@ -59,7 +59,7 @@
   cinfo->is_decompressor = TRUE;
 
   /* Initialize a memory manager instance for this object */
-  jinit_memory_mgr((j_common_ptr) cinfo);
+  jinit_memory_mgr((j_common_ptr)cinfo);
 
   /* Zero out pointers to permanent structures. */
   cinfo->progress = NULL;
@@ -89,8 +89,8 @@
    * here.
    */
   cinfo->master = (struct jpeg_decomp_master *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-                                  sizeof(my_decomp_master));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(my_decomp_master));
   MEMZERO(cinfo->master, sizeof(my_decomp_master));
 }
 
@@ -100,9 +100,9 @@
  */
 
 GLOBAL(void)
-jpeg_destroy_decompress (j_decompress_ptr cinfo)
+jpeg_destroy_decompress(j_decompress_ptr cinfo)
 {
-  jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+  jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -112,9 +112,9 @@
  */
 
 GLOBAL(void)
-jpeg_abort_decompress (j_decompress_ptr cinfo)
+jpeg_abort_decompress(j_decompress_ptr cinfo)
 {
-  jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+  jpeg_abort((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -123,7 +123,7 @@
  */
 
 LOCAL(void)
-default_decompress_parms (j_decompress_ptr cinfo)
+default_decompress_parms(j_decompress_ptr cinfo)
 {
   /* Guess the input colorspace, and set output colorspace accordingly. */
   /* (Wish JPEG committee had provided a real way to specify this...) */
@@ -250,7 +250,7 @@
  */
 
 GLOBAL(int)
-jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
+jpeg_read_header(j_decompress_ptr cinfo, boolean require_image)
 {
   int retcode;
 
@@ -271,7 +271,7 @@
      * call jpeg_abort, but we can't change it now for compatibility reasons.
      * A side effect is to free any temporary memory (there shouldn't be any).
      */
-    jpeg_abort((j_common_ptr) cinfo); /* sets state = DSTATE_START */
+    jpeg_abort((j_common_ptr)cinfo); /* sets state = DSTATE_START */
     retcode = JPEG_HEADER_TABLES_ONLY;
     break;
   case JPEG_SUSPENDED:
@@ -296,7 +296,7 @@
  */
 
 GLOBAL(int)
-jpeg_consume_input (j_decompress_ptr cinfo)
+jpeg_consume_input(j_decompress_ptr cinfo)
 {
   int retcode = JPEG_SUSPENDED;
 
@@ -343,7 +343,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_input_complete (j_decompress_ptr cinfo)
+jpeg_input_complete(j_decompress_ptr cinfo)
 {
   /* Check for valid jpeg object */
   if (cinfo->global_state < DSTATE_START ||
@@ -358,7 +358,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_has_multiple_scans (j_decompress_ptr cinfo)
+jpeg_has_multiple_scans(j_decompress_ptr cinfo)
 {
   /* Only valid after jpeg_read_header completes */
   if (cinfo->global_state < DSTATE_READY ||
@@ -378,10 +378,10 @@
  */
 
 GLOBAL(boolean)
-jpeg_finish_decompress (j_decompress_ptr cinfo)
+jpeg_finish_decompress(j_decompress_ptr cinfo)
 {
   if ((cinfo->global_state == DSTATE_SCANNING ||
-       cinfo->global_state == DSTATE_RAW_OK) && ! cinfo->buffered_image) {
+       cinfo->global_state == DSTATE_RAW_OK) && !cinfo->buffered_image) {
     /* Terminate final pass of non-buffered mode */
     if (cinfo->output_scanline < cinfo->output_height)
       ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
@@ -395,13 +395,13 @@
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   }
   /* Read until EOI */
-  while (! cinfo->inputctl->eoi_reached) {
+  while (!cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return FALSE;             /* Suspend, come back later */
   }
   /* Do final cleanup */
   (*cinfo->src->term_source) (cinfo);
   /* We can use jpeg_abort to release memory and reset global_state */
-  jpeg_abort((j_common_ptr) cinfo);
+  jpeg_abort((j_common_ptr)cinfo);
   return TRUE;
 }
diff --git a/jdapistd.c b/jdapistd.c
index 105121d..0e0642b 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -25,7 +25,7 @@
 #include "jmemsys.h"
 
 /* Forward declarations */
-LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
+LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
 
 
 /*
@@ -40,7 +40,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_start_decompress (j_decompress_ptr cinfo)
+jpeg_start_decompress(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state == DSTATE_READY) {
     /* First call: initialize master control, select active modules */
@@ -60,7 +60,7 @@
         int retcode;
         /* Call progress monitor hook if present */
         if (cinfo->progress != NULL)
-          (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+          (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
         /* Absorb some more input */
         retcode = (*cinfo->inputctl->consume_input) (cinfo);
         if (retcode == JPEG_SUSPENDED)
@@ -72,7 +72,7 @@
             (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
           if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
             /* jdmaster underestimated number of scans; ratchet up one scan */
-            cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+            cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
           }
         }
       }
@@ -97,7 +97,7 @@
  */
 
 LOCAL(boolean)
-output_pass_setup (j_decompress_ptr cinfo)
+output_pass_setup(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state != DSTATE_PRESCAN) {
     /* First call: do pass setup */
@@ -113,14 +113,14 @@
       JDIMENSION last_scanline;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL) {
-        cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-        cinfo->progress->pass_limit = (long) cinfo->output_height;
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+        cinfo->progress->pass_limit = (long)cinfo->output_height;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       }
       /* Process some data */
       last_scanline = cinfo->output_scanline;
-      (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL,
-                                    &cinfo->output_scanline, (JDIMENSION) 0);
+      (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+                                    &cinfo->output_scanline, (JDIMENSION)0);
       if (cinfo->output_scanline == last_scanline)
         return FALSE;           /* No progress made, must suspend */
     }
@@ -150,8 +150,8 @@
  */
 
 GLOBAL(void)
-jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
-                    JDIMENSION *width)
+jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                   JDIMENSION *width)
 {
   int ci, align, orig_downsampled_width;
   JDIMENSION input_xoffset;
@@ -210,11 +210,10 @@
   /* Set the first and last iMCU columns that we must decompress.  These values
    * will be used in single-scan decompressions.
    */
-  cinfo->master->first_iMCU_col =
-    (JDIMENSION) (long) (*xoffset) / (long) align;
+  cinfo->master->first_iMCU_col = (JDIMENSION)(long)(*xoffset) / (long)align;
   cinfo->master->last_iMCU_col =
-    (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
-                               (long) align) - 1;
+    (JDIMENSION)jdiv_round_up((long)(*xoffset + cinfo->output_width),
+                              (long)align) - 1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -224,9 +223,9 @@
     /* Set downsampled_width to the new output width. */
     orig_downsampled_width = compptr->downsampled_width;
     compptr->downsampled_width =
-      (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
-                                         compptr->h_samp_factor),
-                                 (long) cinfo->max_h_samp_factor);
+      (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
+                                       compptr->h_samp_factor),
+                                (long)cinfo->max_h_samp_factor);
     if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
       reinit_upsampler = TRUE;
 
@@ -234,11 +233,10 @@
      * values will be used in multi-scan decompressions.
      */
     cinfo->master->first_MCU_col[ci] =
-      (JDIMENSION) (long) (*xoffset * hsf) / (long) align;
+      (JDIMENSION)(long)(*xoffset * hsf) / (long)align;
     cinfo->master->last_MCU_col[ci] =
-      (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
-                                         hsf),
-                                 (long) align) - 1;
+      (JDIMENSION)jdiv_round_up((long)((*xoffset + cinfo->output_width) * hsf),
+                                (long)align) - 1;
   }
 
   if (reinit_upsampler) {
@@ -263,8 +261,8 @@
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
-                     JDIMENSION max_lines)
+jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                    JDIMENSION max_lines)
 {
   JDIMENSION row_ctr;
 
@@ -277,9 +275,9 @@
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->output_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->output_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Process some data */
@@ -292,16 +290,16 @@
 
 /* Dummy color convert function used by jpeg_skip_scanlines() */
 LOCAL(void)
-noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-              JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
 }
 
 
 /* Dummy quantize function used by jpeg_skip_scanlines() */
 LOCAL(void)
-noop_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-               JSAMPARRAY output_buf, int num_rows)
+noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+              JSAMPARRAY output_buf, int num_rows)
 {
 }
 
@@ -315,7 +313,7 @@
  */
 
 LOCAL(void)
-read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   JDIMENSION n;
   void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
@@ -346,10 +344,10 @@
  */
 
 LOCAL(void)
-increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
 {
   JDIMENSION rows_left;
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Increment the counter to the next row group after the skipped rows. */
   main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
@@ -375,11 +373,11 @@
  */
 
 GLOBAL(JDIMENSION)
-jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   JDIMENSION i, x;
   int y;
   JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
@@ -544,8 +542,8 @@
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
-                    JDIMENSION max_lines)
+jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                   JDIMENSION max_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
@@ -558,9 +556,9 @@
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->output_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->output_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Verify that at least one iMCU row can be returned. */
@@ -569,7 +567,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Decompress directly into user's buffer. */
-  if (! (*cinfo->coef->decompress_data) (cinfo, data))
+  if (!(*cinfo->coef->decompress_data) (cinfo, data))
     return 0;                   /* suspension forced, can do nothing more */
 
   /* OK, we processed one iMCU row. */
@@ -587,7 +585,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
+jpeg_start_output(j_decompress_ptr cinfo, int scan_number)
 {
   if (cinfo->global_state != DSTATE_BUFIMAGE &&
       cinfo->global_state != DSTATE_PRESCAN)
@@ -595,8 +593,7 @@
   /* Limit scan number to valid range */
   if (scan_number <= 0)
     scan_number = 1;
-  if (cinfo->inputctl->eoi_reached &&
-      scan_number > cinfo->input_scan_number)
+  if (cinfo->inputctl->eoi_reached && scan_number > cinfo->input_scan_number)
     scan_number = cinfo->input_scan_number;
   cinfo->output_scan_number = scan_number;
   /* Perform any dummy output passes, and set up for the real pass */
@@ -612,7 +609,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_finish_output (j_decompress_ptr cinfo)
+jpeg_finish_output(j_decompress_ptr cinfo)
 {
   if ((cinfo->global_state == DSTATE_SCANNING ||
        cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) {
@@ -626,7 +623,7 @@
   }
   /* Read markers looking for SOS or EOI */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-         ! cinfo->inputctl->eoi_reached) {
+         !cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return FALSE;             /* Suspend, come back later */
   }
diff --git a/jdarith.c b/jdarith.c
index 0f560f6..333f392 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -68,13 +68,13 @@
 
 
 LOCAL(int)
-get_byte (j_decompress_ptr cinfo)
+get_byte(j_decompress_ptr cinfo)
 /* Read next input byte; we do not support suspension in this module. */
 {
   struct jpeg_source_mgr *src = cinfo->src;
 
   if (src->bytes_in_buffer == 0)
-    if (! (*src->fill_input_buffer) (cinfo))
+    if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
   return GETJOCTET(*src->next_input_byte++);
@@ -109,9 +109,9 @@
  */
 
 LOCAL(int)
-arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+arith_decode(j_decompress_ptr cinfo, unsigned char *st)
 {
-  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   register unsigned char nl, nm;
   register JLONG qe, temp;
   register int sv, data;
@@ -156,8 +156,8 @@
    */
   sv = *st;
   qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
-  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
-  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
+  nl = qe & 0xFF;  qe >>= 8;    /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF;  qe >>= 8;    /* Next_Index_MPS */
 
   /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
   temp = e->a - qe;
@@ -193,14 +193,14 @@
  */
 
 LOCAL(void)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci;
   jpeg_component_info *compptr;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     ERREXIT(cinfo, JERR_CANT_SUSPEND);
 
   /* Re-initialize statistics areas */
@@ -244,9 +244,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, sign;
@@ -280,7 +280,7 @@
       /* Figure F.21: Decoding nonzero value v */
       /* Figure F.22: Decoding the sign of v */
       sign = arith_decode(cinfo, st + 1);
-      st += 2; st += sign;
+      st += 2;  st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
         st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
@@ -294,9 +294,9 @@
         }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;               /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
         entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
@@ -305,12 +305,12 @@
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
+      v += 1;  if (sign) v = -v;
       entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
     }
 
     /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
-    (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
+    (*block)[0] = (JCOEF)LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
   }
 
   return TRUE;
@@ -323,9 +323,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, sign, k;
@@ -351,7 +351,7 @@
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
     if (arith_decode(cinfo, st)) break;         /* EOB flag */
     while (arith_decode(cinfo, st + 1) == 0) {
-      st += 3; k++;
+      st += 3;  k++;
       if (k > cinfo->Se) {
         WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
         entropy->ct = -1;                       /* spectral overflow */
@@ -383,9 +383,9 @@
     st += 14;
     while (m >>= 1)
       if (arith_decode(cinfo, st)) v |= m;
-    v += 1; if (sign) v = -v;
+    v += 1;  if (sign) v = -v;
     /* Scale and output coefficient in natural (dezigzagged) order */
-    (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
+    (*block)[jpeg_natural_order[k]] = (JCOEF)((unsigned)v << cinfo->Al);
   }
 
   return TRUE;
@@ -397,9 +397,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   unsigned char *st;
   int p1, blkn;
 
@@ -430,9 +430,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   JCOEFPTR thiscoef;
   unsigned char *st;
@@ -481,7 +481,7 @@
           *thiscoef = p1;
         break;
       }
-      st += 3; k++;
+      st += 3;  k++;
       if (k > cinfo->Se) {
         WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
         entropy->ct = -1;                       /* spectral overflow */
@@ -499,9 +499,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
@@ -538,7 +538,7 @@
       /* Figure F.21: Decoding nonzero value v */
       /* Figure F.22: Decoding the sign of v */
       sign = arith_decode(cinfo, st + 1);
-      st += 2; st += sign;
+      st += 2;  st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
         st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
@@ -552,9 +552,9 @@
         }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;               /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
         entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
@@ -563,12 +563,12 @@
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
+      v += 1;  if (sign) v = -v;
       entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
     }
 
     if (block)
-      (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+      (*block)[0] = (JCOEF)entropy->last_dc_val[ci];
 
     /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
 
@@ -579,7 +579,7 @@
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
       if (arith_decode(cinfo, st)) break;       /* EOB flag */
       while (arith_decode(cinfo, st + 1) == 0) {
-        st += 3; k++;
+        st += 3;  k++;
         if (k > DCTSIZE2 - 1) {
           WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
           entropy->ct = -1;                     /* spectral overflow */
@@ -611,9 +611,9 @@
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
+      v += 1;  if (sign) v = -v;
       if (block)
-        (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+        (*block)[jpeg_natural_order[k]] = (JCOEF)v;
     }
   }
 
@@ -626,9 +626,9 @@
  */
 
 METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci, tbl;
   jpeg_component_info *compptr;
 
@@ -647,11 +647,11 @@
     }
     if (cinfo->Ah != 0) {
       /* Successive approximation refinement scan: must have Al = Ah-1. */
-      if (cinfo->Ah-1 != cinfo->Al)
+      if (cinfo->Ah - 1 != cinfo->Al)
         goto bad;
     }
     if (cinfo->Al > 13) {       /* need not check for < 0 */
-      bad:
+bad:
       ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
                cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
     }
@@ -661,7 +661,7 @@
      */
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
-      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
@@ -702,8 +702,8 @@
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
       MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
@@ -714,8 +714,8 @@
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
       MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
     }
   }
@@ -735,15 +735,15 @@
  */
 
 GLOBAL(void)
-jinit_arith_decoder (j_decompress_ptr cinfo)
+jinit_arith_decoder(j_decompress_ptr cinfo)
 {
   arith_entropy_ptr entropy;
   int i;
 
   entropy = (arith_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(arith_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass;
 
   /* Mark tables unallocated */
@@ -759,9 +759,10 @@
     /* Create progression status table */
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components*DCTSIZE2*sizeof(int));
-    coef_bit_ptr = & cinfo->coef_bits[0][0];
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  cinfo->num_components * DCTSIZE2 *
+                                  sizeof(int));
+    coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
       for (i = 0; i < DCTSIZE2; i++)
         *coef_bit_ptr++ = -1;
diff --git a/jdatadst-tj.c b/jdatadst-tj.c
index a2219df..0bd961b 100644
--- a/jdatadst-tj.c
+++ b/jdatadst-tj.c
@@ -24,8 +24,8 @@
 #include "jerror.h"
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
+extern void *malloc(size_t size);
+extern void free(void *ptr);
 #endif
 
 
@@ -54,7 +54,7 @@
  */
 
 METHODDEF(void)
-init_mem_destination (j_compress_ptr cinfo)
+init_mem_destination(j_compress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -84,17 +84,17 @@
  */
 
 METHODDEF(boolean)
-empty_mem_output_buffer (j_compress_ptr cinfo)
+empty_mem_output_buffer(j_compress_ptr cinfo)
 {
   size_t nextsize;
   JOCTET *nextbuffer;
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   if (!dest->alloc) ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Try to allocate new buffer with double size */
   nextsize = dest->bufsize * 2;
-  nextbuffer = (JOCTET *) malloc(nextsize);
+  nextbuffer = (JOCTET *)malloc(nextsize);
 
   if (nextbuffer == NULL)
     ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
@@ -126,9 +126,9 @@
  */
 
 METHODDEF(void)
-term_mem_destination (j_compress_ptr cinfo)
+term_mem_destination(j_compress_ptr cinfo)
 {
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   if (dest->alloc) *dest->outbuffer = dest->buffer;
   *dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
@@ -147,9 +147,8 @@
  */
 
 GLOBAL(void)
-jpeg_mem_dest_tj (j_compress_ptr cinfo,
-               unsigned char **outbuffer, unsigned long *outsize,
-               boolean alloc)
+jpeg_mem_dest_tj(j_compress_ptr cinfo, unsigned char **outbuffer,
+                 unsigned long *outsize, boolean alloc)
 {
   boolean reused = FALSE;
   my_mem_dest_ptr dest;
@@ -162,9 +161,9 @@
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_mem_destination_mgr));
-    dest = (my_mem_dest_ptr) cinfo->dest;
+    dest = (my_mem_dest_ptr)cinfo->dest;
     dest->newbuffer = NULL;
     dest->buffer = NULL;
   } else if (cinfo->dest->init_destination != init_mem_destination) {
@@ -174,7 +173,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest = (my_mem_dest_ptr)cinfo->dest;
   dest->pub.init_destination = init_mem_destination;
   dest->pub.empty_output_buffer = empty_mem_output_buffer;
   dest->pub.term_destination = term_mem_destination;
@@ -187,12 +186,12 @@
   if (*outbuffer == NULL || *outsize == 0) {
     if (alloc) {
       /* Allocate initial buffer */
-      dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+      dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
       if (dest->newbuffer == NULL)
         ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
       *outsize = OUTPUT_BUF_SIZE;
-    }
-    else ERREXIT(cinfo, JERR_BUFFER_SIZE);
+    } else
+      ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
   dest->pub.next_output_byte = dest->buffer = *outbuffer;
diff --git a/jdatadst.c b/jdatadst.c
index dcaf6f0..3168b96 100644
--- a/jdatadst.c
+++ b/jdatadst.c
@@ -24,8 +24,8 @@
 #include "jerror.h"
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
+extern void *malloc(size_t size);
+extern void free(void *ptr);
 #endif
 
 
@@ -66,14 +66,14 @@
  */
 
 METHODDEF(void)
-init_destination (j_compress_ptr cinfo)
+init_destination(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
 
   /* Allocate the output buffer --- it will be released when done with image */
   dest->buffer = (JOCTET *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  OUTPUT_BUF_SIZE * sizeof(JOCTET));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                OUTPUT_BUF_SIZE * sizeof(JOCTET));
 
   dest->pub.next_output_byte = dest->buffer;
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
@@ -81,7 +81,7 @@
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-init_mem_destination (j_compress_ptr cinfo)
+init_mem_destination(j_compress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -112,12 +112,12 @@
  */
 
 METHODDEF(boolean)
-empty_output_buffer (j_compress_ptr cinfo)
+empty_output_buffer(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
 
   if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
-      (size_t) OUTPUT_BUF_SIZE)
+      (size_t)OUTPUT_BUF_SIZE)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 
   dest->pub.next_output_byte = dest->buffer;
@@ -128,15 +128,15 @@
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
-empty_mem_output_buffer (j_compress_ptr cinfo)
+empty_mem_output_buffer(j_compress_ptr cinfo)
 {
   size_t nextsize;
   JOCTET *nextbuffer;
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   /* Try to allocate new buffer with double size */
   nextsize = dest->bufsize * 2;
-  nextbuffer = (JOCTET *) malloc(nextsize);
+  nextbuffer = (JOCTET *)malloc(nextsize);
 
   if (nextbuffer == NULL)
     ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
@@ -169,9 +169,9 @@
  */
 
 METHODDEF(void)
-term_destination (j_compress_ptr cinfo)
+term_destination(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
   size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
 
   /* Write any data remaining in the buffer */
@@ -187,9 +187,9 @@
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-term_mem_destination (j_compress_ptr cinfo)
+term_mem_destination(j_compress_ptr cinfo)
 {
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   *dest->outbuffer = dest->buffer;
   *dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
@@ -204,7 +204,7 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
+jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
 {
   my_dest_ptr dest;
 
@@ -213,7 +213,7 @@
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_destination_mgr));
   } else if (cinfo->dest->init_destination != init_destination) {
     /* It is unsafe to reuse the existing destination manager unless it was
@@ -225,7 +225,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  dest = (my_dest_ptr) cinfo->dest;
+  dest = (my_dest_ptr)cinfo->dest;
   dest->pub.init_destination = init_destination;
   dest->pub.empty_output_buffer = empty_output_buffer;
   dest->pub.term_destination = term_destination;
@@ -249,8 +249,8 @@
  */
 
 GLOBAL(void)
-jpeg_mem_dest (j_compress_ptr cinfo,
-               unsigned char **outbuffer, unsigned long *outsize)
+jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+              unsigned long *outsize)
 {
   my_mem_dest_ptr dest;
 
@@ -262,7 +262,7 @@
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_mem_destination_mgr));
   } else if (cinfo->dest->init_destination != init_mem_destination) {
     /* It is unsafe to reuse the existing destination manager unless it was
@@ -271,7 +271,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest = (my_mem_dest_ptr)cinfo->dest;
   dest->pub.init_destination = init_mem_destination;
   dest->pub.empty_output_buffer = empty_mem_output_buffer;
   dest->pub.term_destination = term_mem_destination;
@@ -281,7 +281,7 @@
 
   if (*outbuffer == NULL || *outsize == 0) {
     /* Allocate initial buffer */
-    dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+    dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
     if (dest->newbuffer == NULL)
       ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
     *outsize = OUTPUT_BUF_SIZE;
diff --git a/jdatasrc-tj.c b/jdatasrc-tj.c
index 05456c8..1c71307 100644
--- a/jdatasrc-tj.c
+++ b/jdatasrc-tj.c
@@ -30,7 +30,7 @@
  */
 
 METHODDEF(void)
-init_mem_source (j_decompress_ptr cinfo)
+init_mem_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -70,10 +70,10 @@
  */
 
 METHODDEF(boolean)
-fill_mem_input_buffer (j_decompress_ptr cinfo)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
 {
   static const JOCTET mybuffer[4] = {
-    (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+    (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
   };
 
   /* The whole JPEG data is expected to reside in the supplied memory
@@ -104,7 +104,7 @@
  */
 
 METHODDEF(void)
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
 {
   struct jpeg_source_mgr *src = cinfo->src;
 
@@ -113,15 +113,15 @@
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->bytes_in_buffer) {
-      num_bytes -= (long) src->bytes_in_buffer;
-      (void) (*src->fill_input_buffer) (cinfo);
+    while (num_bytes > (long)src->bytes_in_buffer) {
+      num_bytes -= (long)src->bytes_in_buffer;
+      (void)(*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->next_input_byte += (size_t) num_bytes;
-    src->bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t)num_bytes;
+    src->bytes_in_buffer -= (size_t)num_bytes;
   }
 }
 
@@ -145,7 +145,7 @@
  */
 
 METHODDEF(void)
-term_source (j_decompress_ptr cinfo)
+term_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -157,8 +157,8 @@
  */
 
 GLOBAL(void)
-jpeg_mem_src_tj (j_decompress_ptr cinfo,
-                 const unsigned char *inbuffer, unsigned long insize)
+jpeg_mem_src_tj(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                unsigned long insize)
 {
   struct jpeg_source_mgr *src;
 
@@ -171,7 +171,7 @@
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(struct jpeg_source_mgr));
   } else if (cinfo->src->init_source != init_mem_source) {
     /* It is unsafe to reuse the existing source manager unless it was created
@@ -186,6 +186,6 @@
   src->skip_input_data = skip_input_data;
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
-  src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (const JOCTET *) inbuffer;
+  src->bytes_in_buffer = (size_t)insize;
+  src->next_input_byte = (const JOCTET *)inbuffer;
 }
diff --git a/jdatasrc.c b/jdatasrc.c
index c83183f..eadb4a2 100644
--- a/jdatasrc.c
+++ b/jdatasrc.c
@@ -45,9 +45,9 @@
  */
 
 METHODDEF(void)
-init_source (j_decompress_ptr cinfo)
+init_source(j_decompress_ptr cinfo)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  my_src_ptr src = (my_src_ptr)cinfo->src;
 
   /* We reset the empty-input-file flag for each image,
    * but we don't clear the input buffer.
@@ -58,7 +58,7 @@
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-init_mem_source (j_decompress_ptr cinfo)
+init_mem_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -99,9 +99,9 @@
  */
 
 METHODDEF(boolean)
-fill_input_buffer (j_decompress_ptr cinfo)
+fill_input_buffer(j_decompress_ptr cinfo)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  my_src_ptr src = (my_src_ptr)cinfo->src;
   size_t nbytes;
 
   nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
@@ -111,8 +111,8 @@
       ERREXIT(cinfo, JERR_INPUT_EMPTY);
     WARNMS(cinfo, JWRN_JPEG_EOF);
     /* Insert a fake EOI marker */
-    src->buffer[0] = (JOCTET) 0xFF;
-    src->buffer[1] = (JOCTET) JPEG_EOI;
+    src->buffer[0] = (JOCTET)0xFF;
+    src->buffer[1] = (JOCTET)JPEG_EOI;
     nbytes = 2;
   }
 
@@ -125,10 +125,10 @@
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
-fill_mem_input_buffer (j_decompress_ptr cinfo)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
 {
   static const JOCTET mybuffer[4] = {
-    (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+    (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
   };
 
   /* The whole JPEG data is expected to reside in the supplied memory
@@ -160,7 +160,7 @@
  */
 
 METHODDEF(void)
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
 {
   struct jpeg_source_mgr *src = cinfo->src;
 
@@ -169,15 +169,15 @@
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->bytes_in_buffer) {
-      num_bytes -= (long) src->bytes_in_buffer;
-      (void) (*src->fill_input_buffer) (cinfo);
+    while (num_bytes > (long)src->bytes_in_buffer) {
+      num_bytes -= (long)src->bytes_in_buffer;
+      (void)(*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->next_input_byte += (size_t) num_bytes;
-    src->bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t)num_bytes;
+    src->bytes_in_buffer -= (size_t)num_bytes;
   }
 }
 
@@ -201,7 +201,7 @@
  */
 
 METHODDEF(void)
-term_source (j_decompress_ptr cinfo)
+term_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -214,7 +214,7 @@
  */
 
 GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
+jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
 {
   my_src_ptr src;
 
@@ -225,11 +225,11 @@
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_source_mgr));
-    src = (my_src_ptr) cinfo->src;
+    src = (my_src_ptr)cinfo->src;
     src->buffer = (JOCTET *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   INPUT_BUF_SIZE * sizeof(JOCTET));
   } else if (cinfo->src->init_source != init_source) {
     /* It is unsafe to reuse the existing source manager unless it was created
@@ -241,7 +241,7 @@
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  src = (my_src_ptr) cinfo->src;
+  src = (my_src_ptr)cinfo->src;
   src->pub.init_source = init_source;
   src->pub.fill_input_buffer = fill_input_buffer;
   src->pub.skip_input_data = skip_input_data;
@@ -260,8 +260,8 @@
  */
 
 GLOBAL(void)
-jpeg_mem_src (j_decompress_ptr cinfo,
-              const unsigned char *inbuffer, unsigned long insize)
+jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+             unsigned long insize)
 {
   struct jpeg_source_mgr *src;
 
@@ -274,7 +274,7 @@
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(struct jpeg_source_mgr));
   } else if (cinfo->src->init_source != init_mem_source) {
     /* It is unsafe to reuse the existing source manager unless it was created
@@ -289,7 +289,7 @@
   src->skip_input_data = skip_input_data;
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
-  src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (const JOCTET *) inbuffer;
+  src->bytes_in_buffer = (size_t)insize;
+  src->next_input_byte = (const JOCTET *)inbuffer;
 }
 #endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 1a48969..723a9ac 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -25,16 +25,15 @@
 
 
 /* Forward declarations */
-METHODDEF(int) decompress_onepass
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
+                                  JSAMPIMAGE output_buf);
 #ifdef D_MULTISCAN_FILES_SUPPORTED
-METHODDEF(int) decompress_data
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
 #endif
 #ifdef BLOCK_SMOOTHING_SUPPORTED
-LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo);
-METHODDEF(int) decompress_smooth_data
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
+METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
+                                      JSAMPIMAGE output_buf);
 #endif
 
 
@@ -43,7 +42,7 @@
  */
 
 METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
 {
   cinfo->input_iMCU_row = 0;
   start_iMCU_row(cinfo);
@@ -55,10 +54,10 @@
  */
 
 METHODDEF(void)
-start_output_pass (j_decompress_ptr cinfo)
+start_output_pass(j_decompress_ptr cinfo)
 {
 #ifdef BLOCK_SMOOTHING_SUPPORTED
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* If multipass, check to see whether to use block smoothing on this pass */
   if (coef->pub.coef_arrays != NULL) {
@@ -83,9 +82,9 @@
  */
 
 METHODDEF(int)
-decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -101,9 +100,9 @@
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
          MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void *) coef->MCU_buffer[0],
-                (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK)));
-      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+      jzero_far((void *)coef->MCU_buffer[0],
+                (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->MCU_ctr = MCU_col_num;
@@ -120,28 +119,28 @@
          * incremented past them!).  Note the inner loop relies on having
          * allocated the MCU_buffer[] blocks sequentially.
          */
-        blkn = 0;                 /* index of current DCT block within MCU */
+        blkn = 0;               /* index of current DCT block within MCU */
         for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
           compptr = cinfo->cur_comp_info[ci];
           /* Don't bother to IDCT an uninteresting component. */
-          if (! compptr->component_needed) {
+          if (!compptr->component_needed) {
             blkn += compptr->MCU_blocks;
             continue;
           }
           inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-          useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                      : compptr->last_col_width;
+          useful_width = (MCU_col_num < last_MCU_col) ?
+                         compptr->MCU_width : compptr->last_col_width;
           output_ptr = output_buf[compptr->component_index] +
-            yoffset * compptr->_DCT_scaled_size;
+                       yoffset * compptr->_DCT_scaled_size;
           start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
-              compptr->MCU_sample_width;
+                      compptr->MCU_sample_width;
           for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
             if (cinfo->input_iMCU_row < last_iMCU_row ||
-                yoffset+yindex < compptr->last_row_height) {
+                yoffset + yindex < compptr->last_row_height) {
               output_col = start_col;
               for (xindex = 0; xindex < useful_width; xindex++) {
                 (*inverse_DCT) (cinfo, compptr,
-                                (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+                                (JCOEFPTR)coef->MCU_buffer[blkn + xindex],
                                 output_ptr, output_col);
                 output_col += compptr->_DCT_scaled_size;
               }
@@ -172,7 +171,7 @@
  */
 
 METHODDEF(int)
-dummy_consume_data (j_decompress_ptr cinfo)
+dummy_consume_data(j_decompress_ptr cinfo)
 {
   return JPEG_SUSPENDED;        /* Always indicate nothing was done */
 }
@@ -188,9 +187,9 @@
  */
 
 METHODDEF(int)
-consume_data (j_decompress_ptr cinfo)
+consume_data(j_decompress_ptr cinfo)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
@@ -202,9 +201,9 @@
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        cinfo->input_iMCU_row * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, TRUE);
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
     /* Note: entropy decoder expects buffer to be zeroed,
      * but this is handled automatically by the memory manager
      * because we requested a pre-zeroed array.
@@ -222,14 +221,14 @@
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
           for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
             coef->MCU_buffer[blkn++] = buffer_ptr++;
           }
         }
       }
       /* Try to fetch the MCU. */
-      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->MCU_ctr = MCU_col_num;
@@ -259,9 +258,9 @@
  */
 
 METHODDEF(int)
-decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION block_num;
   int ci, block_row, block_rows;
@@ -276,7 +275,7 @@
   while (cinfo->input_scan_number < cinfo->output_scan_number ||
          (cinfo->input_scan_number == cinfo->output_scan_number &&
           cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
-    if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
   }
 
@@ -284,19 +283,19 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Don't bother to IDCT an uninteresting component. */
-    if (! compptr->component_needed)
+    if (!compptr->component_needed)
       continue;
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[ci],
+      ((j_common_ptr)cinfo, coef->whole_image[ci],
        cinfo->output_iMCU_row * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
     /* Count non-dummy DCT block rows in this iMCU row. */
     if (cinfo->output_iMCU_row < last_iMCU_row)
       block_rows = compptr->v_samp_factor;
     else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
     }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
@@ -307,8 +306,8 @@
       output_col = 0;
       for (block_num = cinfo->master->first_MCU_col[ci];
            block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
-        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
-                        output_ptr, output_col);
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)buffer_ptr, output_ptr,
+                        output_col);
         buffer_ptr++;
         output_col += compptr->_DCT_scaled_size;
       }
@@ -350,9 +349,9 @@
  */
 
 LOCAL(boolean)
-smoothing_ok (j_decompress_ptr cinfo)
+smoothing_ok(j_decompress_ptr cinfo)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   boolean smoothing_useful = FALSE;
   int ci, coefi;
   jpeg_component_info *compptr;
@@ -360,13 +359,13 @@
   int *coef_bits;
   int *coef_bits_latch;
 
-  if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
+  if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
 
   /* Allocate latch area if not already done */
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   cinfo->num_components *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
@@ -406,9 +405,9 @@
  */
 
 METHODDEF(int)
-decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION block_num, last_block_column;
   int ci, block_row, block_rows, access_rows;
@@ -422,8 +421,8 @@
   JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
-  int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
+  JLONG Q00, Q01, Q02, Q10, Q11, Q20, num;
+  int DC1, DC2, DC3, DC4, DC5, DC6, DC7, DC8, DC9;
   int Al, pred;
 
   /* Keep a local variable to avoid looking it up more than once */
@@ -431,7 +430,7 @@
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-         ! cinfo->inputctl->eoi_reached) {
+         !cinfo->inputctl->eoi_reached) {
     if (cinfo->input_scan_number == cinfo->output_scan_number) {
       /* If input is working on current scan, we ordinarily want it to
        * have completed the current row.  But if input scan is DC,
@@ -439,10 +438,10 @@
        * values are up to date.
        */
       JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
-      if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta)
+      if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
         break;
     }
-    if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
   }
 
@@ -450,7 +449,7 @@
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Don't bother to IDCT an uninteresting component. */
-    if (! compptr->component_needed)
+    if (!compptr->component_needed)
       continue;
     /* Count non-dummy DCT block rows in this iMCU row. */
     if (cinfo->output_iMCU_row < last_iMCU_row) {
@@ -459,7 +458,7 @@
       last_row = FALSE;
     } else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
       access_rows = block_rows; /* this iMCU row only */
       last_row = TRUE;
@@ -468,15 +467,15 @@
     if (cinfo->output_iMCU_row > 0) {
       access_rows += compptr->v_samp_factor; /* prior iMCU row too */
       buffer = (*cinfo->mem->access_virt_barray)
-        ((j_common_ptr) cinfo, coef->whole_image[ci],
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
          (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
-         (JDIMENSION) access_rows, FALSE);
+         (JDIMENSION)access_rows, FALSE);
       buffer += compptr->v_samp_factor; /* point to current iMCU row */
       first_row = FALSE;
     } else {
       buffer = (*cinfo->mem->access_virt_barray)
-        ((j_common_ptr) cinfo, coef->whole_image[ci],
-         (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
+         (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
       first_row = TRUE;
     }
     /* Fetch component-dependent info */
@@ -496,115 +495,115 @@
       if (first_row && block_row == 0)
         prev_block_row = buffer_ptr;
       else
-        prev_block_row = buffer[block_row-1];
-      if (last_row && block_row == block_rows-1)
+        prev_block_row = buffer[block_row - 1];
+      if (last_row && block_row == block_rows - 1)
         next_block_row = buffer_ptr;
       else
-        next_block_row = buffer[block_row+1];
+        next_block_row = buffer[block_row + 1];
       /* We fetch the surrounding DC values using a sliding-register approach.
        * Initialize all nine here so as to do the right thing on narrow pics.
        */
-      DC1 = DC2 = DC3 = (int) prev_block_row[0][0];
-      DC4 = DC5 = DC6 = (int) buffer_ptr[0][0];
-      DC7 = DC8 = DC9 = (int) next_block_row[0][0];
+      DC1 = DC2 = DC3 = (int)prev_block_row[0][0];
+      DC4 = DC5 = DC6 = (int)buffer_ptr[0][0];
+      DC7 = DC8 = DC9 = (int)next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
       for (block_num = cinfo->master->first_MCU_col[ci];
            block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
         /* Fetch current DCT block into workspace so we can modify it. */
-        jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+        jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
         /* Update DC values */
         if (block_num < last_block_column) {
-          DC3 = (int) prev_block_row[1][0];
-          DC6 = (int) buffer_ptr[1][0];
-          DC9 = (int) next_block_row[1][0];
+          DC3 = (int)prev_block_row[1][0];
+          DC6 = (int)buffer_ptr[1][0];
+          DC9 = (int)next_block_row[1][0];
         }
         /* Compute coefficient estimates per K.8.
          * An estimate is applied only if coefficient is still zero,
          * and is not known to be fully accurate.
          */
         /* AC01 */
-        if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
+        if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
           num = 36 * Q00 * (DC4 - DC6);
           if (num >= 0) {
-            pred = (int) (((Q01<<7) + num) / (Q01<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q01<<7) - num) / (Q01<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q01 << 7) - num) / (Q01 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[1] = (JCOEF) pred;
+          workspace[1] = (JCOEF)pred;
         }
         /* AC10 */
-        if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
+        if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
           num = 36 * Q00 * (DC2 - DC8);
           if (num >= 0) {
-            pred = (int) (((Q10<<7) + num) / (Q10<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q10<<7) - num) / (Q10<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q10 << 7) - num) / (Q10 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[8] = (JCOEF) pred;
+          workspace[8] = (JCOEF)pred;
         }
         /* AC20 */
-        if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
-          num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
+        if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
+          num = 9 * Q00 * (DC2 + DC8 - 2 * DC5);
           if (num >= 0) {
-            pred = (int) (((Q20<<7) + num) / (Q20<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q20<<7) - num) / (Q20<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q20 << 7) - num) / (Q20 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[16] = (JCOEF) pred;
+          workspace[16] = (JCOEF)pred;
         }
         /* AC11 */
-        if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
+        if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
           num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
           if (num >= 0) {
-            pred = (int) (((Q11<<7) + num) / (Q11<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q11<<7) - num) / (Q11<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q11 << 7) - num) / (Q11 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[9] = (JCOEF) pred;
+          workspace[9] = (JCOEF)pred;
         }
         /* AC02 */
-        if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
-          num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
+        if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
+          num = 9 * Q00 * (DC4 + DC6 - 2 * DC5);
           if (num >= 0) {
-            pred = (int) (((Q02<<7) + num) / (Q02<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q02<<7) - num) / (Q02<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q02 << 7) - num) / (Q02 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[2] = (JCOEF) pred;
+          workspace[2] = (JCOEF)pred;
         }
         /* OK, do the IDCT */
-        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
-                        output_ptr, output_col);
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
+                        output_col);
         /* Advance for next column */
-        DC1 = DC2; DC2 = DC3;
-        DC4 = DC5; DC5 = DC6;
-        DC7 = DC8; DC8 = DC9;
+        DC1 = DC2;  DC2 = DC3;
+        DC4 = DC5;  DC5 = DC6;
+        DC7 = DC8;  DC8 = DC9;
         buffer_ptr++, prev_block_row++, next_block_row++;
         output_col += compptr->_DCT_scaled_size;
       }
@@ -625,14 +624,14 @@
  */
 
 GLOBAL(void)
-jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_d_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_d_coef_controller *)coef;
   coef->pub.start_input_pass = start_input_pass;
   coef->pub.start_output_pass = start_output_pass;
 #ifdef BLOCK_SMOOTHING_SUPPORTED
@@ -657,12 +656,12 @@
         access_rows *= 3;
 #endif
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
-         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-                                (long) compptr->h_samp_factor),
-         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                                (long) compptr->v_samp_factor),
-         (JDIMENSION) access_rows);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)access_rows);
     }
     coef->pub.consume_data = consume_data;
     coef->pub.decompress_data = decompress_data;
@@ -676,7 +675,7 @@
     int i;
 
     buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
@@ -688,6 +687,6 @@
 
   /* Allocate the workspace buffer */
   coef->workspace = (JCOEF *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(JCOEF) * DCTSIZE2);
 }
diff --git a/jdcoefct.h b/jdcoefct.h
index bf6beb2..c4d1943 100644
--- a/jdcoefct.h
+++ b/jdcoefct.h
@@ -59,10 +59,10 @@
 
 
 LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
+start_iMCU_row(j_decompress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row (input side) */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -71,7 +71,7 @@
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
diff --git a/jdcol565.c b/jdcol565.c
index 349fce4..40068ef 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -17,22 +17,22 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
-                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-                             JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -53,7 +53,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -88,7 +88,7 @@
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -96,22 +96,22 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
@@ -134,7 +134,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -174,7 +174,7 @@
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -182,9 +182,9 @@
 
 INLINE
 LOCAL(void)
-rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
-                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-                             JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
@@ -206,7 +206,7 @@
       g = GETJSAMPLE(*inptr1++);
       b = GETJSAMPLE(*inptr2++);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -229,7 +229,7 @@
       g = GETJSAMPLE(*inptr1);
       b = GETJSAMPLE(*inptr2);
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -237,14 +237,14 @@
 
 INLINE
 LOCAL(void)
-rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
@@ -263,7 +263,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -288,7 +288,7 @@
       g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
       b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -296,9 +296,9 @@
 
 INLINE
 LOCAL(void)
-gray_rgb565_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
@@ -313,7 +313,7 @@
     if (PACK_NEED_ALIGNMENT(outptr)) {
       g = *inptr++;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -328,7 +328,7 @@
     if (num_cols & 1) {
       g = *inptr;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -336,13 +336,13 @@
 
 INLINE
 LOCAL(void)
-gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf, JDIMENSION input_row,
-                               JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION input_row, JSAMPARRAY output_buf,
+                              int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
 
@@ -356,7 +356,7 @@
       g = *inptr++;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -378,7 +378,7 @@
       g = *inptr;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
diff --git a/jdcolext.c b/jdcolext.c
index 59b676c..72a5301 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -28,22 +28,22 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb_convert_internal (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -59,8 +59,8 @@
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
-                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-                                                 SCALEBITS))];
+                              ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -81,9 +81,9 @@
 
 INLINE
 LOCAL(void)
-gray_rgb_convert_internal (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION input_row,
-                           JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION input_row, JSAMPARRAY output_buf,
+                          int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
@@ -112,9 +112,9 @@
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
   register JSAMPROW inptr0, inptr1, inptr2;
   register JSAMPROW outptr;
diff --git a/jdcolor.c b/jdcolor.c
index 05cbf4d..5d6fa90 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -74,8 +74,8 @@
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 /* We allocate one big table for RGB->Y conversion and divide it up into
  * three parts, instead of doing three alloc_small requests.  This lets us
@@ -85,9 +85,9 @@
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
-#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
-#define TABLE_SIZE      (3*(MAXJSAMPLE+1))
+#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
+#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
+#define TABLE_SIZE      (3 * (MAXJSAMPLE + 1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -208,25 +208,25 @@
  */
 
 LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   int i;
   JLONG x;
   SHIFT_TEMPS
 
   cconvert->Cr_r_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cb_b_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cr_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
   cconvert->Cb_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -238,10 +238,10 @@
     cconvert->Cb_b_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
-    cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+    cconvert->Cr_g_tab[i] = (-FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
     /* We also add in ONE_HALF so that need not do it in inner loop */
-    cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+    cconvert->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
   }
 }
 
@@ -251,43 +251,42 @@
  */
 
 METHODDEF(void)
-ycc_rgb_convert (j_decompress_ptr cinfo,
-                 JSAMPIMAGE input_buf, JDIMENSION input_row,
-                 JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    default:
-      ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  default:
+    ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                             num_rows);
+    break;
   }
 }
 
@@ -300,21 +299,21 @@
  */
 
 LOCAL(void)
-build_rgb_y_table (j_decompress_ptr cinfo)
+build_rgb_y_table(j_decompress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_y_tab;
   JLONG i;
 
   /* Allocate and fill in the conversion tables. */
   cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
-    rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
-    rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
-    rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+    rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+    rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+    rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
   }
 }
 
@@ -324,11 +323,10 @@
  */
 
 METHODDEF(void)
-rgb_gray_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_y_tab;
   register JSAMPROW outptr;
@@ -347,9 +345,8 @@
       g = GETJSAMPLE(inptr1[col]);
       b = GETJSAMPLE(inptr2[col]);
       /* Y */
-      outptr[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                               ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -361,9 +358,8 @@
  */
 
 METHODDEF(void)
-null_convert (j_decompress_ptr cinfo,
-              JSAMPIMAGE input_buf, JDIMENSION input_row,
-              JSAMPARRAY output_buf, int num_rows)
+null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
   register JDIMENSION col;
@@ -423,12 +419,11 @@
  */
 
 METHODDEF(void)
-grayscale_convert (j_decompress_ptr cinfo,
-                   JSAMPIMAGE input_buf, JDIMENSION input_row,
-                   JSAMPARRAY output_buf, int num_rows)
+grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
-                    num_rows, cinfo->output_width);
+  jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
+                    cinfo->output_width);
 }
 
 
@@ -437,43 +432,42 @@
  */
 
 METHODDEF(void)
-gray_rgb_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_BGR:
-      gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    default:
-      gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                num_rows);
-      break;
+  case JCS_EXT_RGB:
+    gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_BGR:
+    gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  default:
+    gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                              num_rows);
+    break;
   }
 }
 
@@ -483,43 +477,42 @@
  */
 
 METHODDEF(void)
-rgb_rgb_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    default:
-      rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  default:
+    rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                             num_rows);
+    break;
   }
 }
 
@@ -532,11 +525,10 @@
  */
 
 METHODDEF(void)
-ycck_cmyk_convert (j_decompress_ptr cinfo,
-                   JSAMPIMAGE input_buf, JDIMENSION input_row,
-                   JSAMPARRAY output_buf, int num_rows)
+ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2, inptr3;
@@ -564,7 +556,7 @@
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
-                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                              ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
@@ -579,16 +571,16 @@
  * RGB565 conversion
  */
 
-#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
-                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
-                                      (((g) << 11) & 0xE000) |  \
-                                      (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b)  ((((r) << 8) & 0xF800) | \
+                                     (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b)  (((r) & 0xF8) | ((g) >> 5) | \
+                                     (((g) << 11) & 0xE000) | \
+                                     (((b) << 5) & 0x1F00))
 
-#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)     (((size_t)(ptr)) & 3)
+#define PACK_NEED_ALIGNMENT(ptr)    (((size_t)(ptr)) & 3)
 
 #define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(int *)(addr)) = pixels)
 
@@ -662,9 +654,8 @@
 
 
 METHODDEF(void)
-ycc_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -674,9 +665,8 @@
 
 
 METHODDEF(void)
-ycc_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -686,9 +676,8 @@
 
 
 METHODDEF(void)
-rgb_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -698,9 +687,8 @@
 
 
 METHODDEF(void)
-rgb_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -710,9 +698,8 @@
 
 
 METHODDEF(void)
-gray_rgb565_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -722,9 +709,8 @@
 
 
 METHODDEF(void)
-gray_rgb565D_convert (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION input_row,
-                      JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -738,7 +724,7 @@
  */
 
 METHODDEF(void)
-start_pass_dcolor (j_decompress_ptr cinfo)
+start_pass_dcolor(j_decompress_ptr cinfo)
 {
   /* no work needed */
 }
@@ -749,15 +735,15 @@
  */
 
 GLOBAL(void)
-jinit_color_deconverter (j_decompress_ptr cinfo)
+jinit_color_deconverter(j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
   int ci;
 
   cconvert = (my_cconvert_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_deconverter));
-  cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
+  cinfo->cconvert = (struct jpeg_color_deconverter *)cconvert;
   cconvert->pub.start_pass = start_pass_dcolor;
 
   /* Make sure num_components agrees with jpeg_color_space */
@@ -843,11 +829,11 @@
     cinfo->out_color_components = 3;
     if (cinfo->dither_mode == JDITHER_NONE) {
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
-         if (jsimd_can_ycc_rgb565())
-           cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
-         else {
-           cconvert->pub.color_convert = ycc_rgb565_convert;
-           build_ycc_rgb_table(cinfo);
+        if (jsimd_can_ycc_rgb565())
+          cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+        else {
+          cconvert->pub.color_convert = ycc_rgb565_convert;
+          build_ycc_rgb_table(cinfo);
         }
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
         cconvert->pub.color_convert = gray_rgb565_convert;
diff --git a/jdct.h b/jdct.h
index faf8e1c..dbcdb71 100644
--- a/jdct.h
+++ b/jdct.h
@@ -36,7 +36,7 @@
 typedef unsigned int UDCTELEM;
 typedef unsigned long long UDCTELEM2;
 #else
-typedef short DCTELEM;  /* prefer 16 bit with SIMD for parellelism */
+typedef short DCTELEM;          /* prefer 16 bit with SIMD for parallelism */
 typedef unsigned short UDCTELEM;
 typedef unsigned int UDCTELEM2;
 #endif
@@ -63,15 +63,15 @@
  * Each IDCT routine has its own ideas about the best dct_table element type.
  */
 
-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE;  /* short or int, whichever is faster */
 #if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
-#define IFAST_SCALE_BITS  2     /* fractional bits in scale factors */
+typedef MULTIPLIER IFAST_MULT_TYPE;  /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS  2          /* fractional bits in scale factors */
 #else
-typedef JLONG IFAST_MULT_TYPE;  /* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS  13    /* fractional bits in scale factors */
+typedef JLONG IFAST_MULT_TYPE;       /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS  13         /* fractional bits in scale factors */
 #endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE;  /* preferred floating type */
 
 
 /*
@@ -90,64 +90,64 @@
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
-EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
-EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
 
-EXTERN(void) jpeg_idct_islow
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_ifast
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_float
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_7x7
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_6x6
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_5x5
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_4x4
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_3x3
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_2x2
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_1x1
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_9x9
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_10x10
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_11x11
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_12x12
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_13x13
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_14x14
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_15x15
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_16x16
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
 /*
@@ -160,7 +160,7 @@
  * and may differ from one module to the next.
  */
 
-#define ONE     ((JLONG) 1)
+#define ONE         ((JLONG)1)
 #define CONST_SCALE (ONE << CONST_BITS)
 
 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
@@ -168,14 +168,14 @@
  * thus causing a lot of useless floating-point operations at run time.
  */
 
-#define FIX(x)  ((JLONG) ((x) * CONST_SCALE + 0.5))
+#define FIX(x)  ((JLONG)((x) * CONST_SCALE + 0.5))
 
 /* Descale and correctly round a JLONG value that's scaled by N bits.
  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  * the fudge factor is correct for either sign of X.
  */
 
-#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
+#define DESCALE(x, n)  RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)
 
 /* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * This macro is used only when the two inputs will actually be no more than
@@ -187,22 +187,22 @@
  */
 
 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
+#define MULTIPLY16C16(var, const)  (((INT16)(var)) * ((INT16)(const)))
 #endif
 #ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((JLONG) (const)))
+#define MULTIPLY16C16(var, const)  (((INT16)(var)) * ((JLONG)(const)))
 #endif
 
 #ifndef MULTIPLY16C16           /* default definition */
-#define MULTIPLY16C16(var,const)  ((var) * (const))
+#define MULTIPLY16C16(var, const)  ((var) * (const))
 #endif
 
 /* Same except both inputs are variables. */
 
 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
-#define MULTIPLY16V16(var1,var2)  (((INT16) (var1)) * ((INT16) (var2)))
+#define MULTIPLY16V16(var1, var2)  (((INT16)(var1)) * ((INT16)(var2)))
 #endif
 
 #ifndef MULTIPLY16V16           /* default definition */
-#define MULTIPLY16V16(var1,var2)  ((var1) * (var2))
+#define MULTIPLY16V16(var1, var2)  ((var1) * (var2))
 #endif
diff --git a/jddctmgr.c b/jddctmgr.c
index 3a5ba7e..e0a8598 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -94,9 +94,9 @@
  */
 
 METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
 {
-  my_idct_ptr idct = (my_idct_ptr) cinfo->idct;
+  my_idct_ptr idct = (my_idct_ptr)cinfo->idct;
   int ci, i;
   jpeg_component_info *compptr;
   int method = 0;
@@ -233,7 +233,7 @@
      * multiplier table all-zero; we'll be reading zeroes from the
      * coefficient controller's buffer anyway.
      */
-    if (! compptr->component_needed || idct->cur_method[ci] == method)
+    if (!compptr->component_needed || idct->cur_method[ci] == method)
       continue;
     qtbl = compptr->quant_table;
     if (qtbl == NULL)           /* happens if no data yet for component */
@@ -246,9 +246,9 @@
         /* For LL&M IDCT method, multipliers are equal to raw quantization
          * coefficients, but are stored as ints to ensure access efficiency.
          */
-        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *)compptr->dct_table;
         for (i = 0; i < DCTSIZE2; i++) {
-          ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
+          ismtbl[i] = (ISLOW_MULT_TYPE)qtbl->quantval[i];
         }
       }
       break;
@@ -263,7 +263,7 @@
          * For integer operation, the multiplier table is to be scaled by
          * IFAST_SCALE_BITS.
          */
-        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *)compptr->dct_table;
 #define CONST_BITS 14
         static const INT16 aanscales[DCTSIZE2] = {
           /* precomputed values scaled up by 14 bits */
@@ -280,9 +280,9 @@
 
         for (i = 0; i < DCTSIZE2; i++) {
           ifmtbl[i] = (IFAST_MULT_TYPE)
-            DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                  (JLONG) aanscales[i]),
-                    CONST_BITS-IFAST_SCALE_BITS);
+            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                  (JLONG)aanscales[i]),
+                    CONST_BITS - IFAST_SCALE_BITS);
         }
       }
       break;
@@ -295,7 +295,7 @@
          *   scalefactor[0] = 1
          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
          */
-        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *)compptr->dct_table;
         int row, col;
         static const double aanscalefactor[DCTSIZE] = {
           1.0, 1.387039845, 1.306562965, 1.175875602,
@@ -306,7 +306,7 @@
         for (row = 0; row < DCTSIZE; row++) {
           for (col = 0; col < DCTSIZE; col++) {
             fmtbl[i] = (FLOAT_MULT_TYPE)
-              ((double) qtbl->quantval[i] *
+              ((double)qtbl->quantval[i] *
                aanscalefactor[row] * aanscalefactor[col]);
             i++;
           }
@@ -327,23 +327,23 @@
  */
 
 GLOBAL(void)
-jinit_inverse_dct (j_decompress_ptr cinfo)
+jinit_inverse_dct(j_decompress_ptr cinfo)
 {
   my_idct_ptr idct;
   int ci;
   jpeg_component_info *compptr;
 
   idct = (my_idct_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_idct_controller));
-  cinfo->idct = (struct jpeg_inverse_dct *) idct;
+  cinfo->idct = (struct jpeg_inverse_dct *)idct;
   idct->pub.start_pass = start_pass;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
     compptr->dct_table =
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(multiplier_table));
     MEMZERO(compptr->dct_table, sizeof(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
diff --git a/jdhuff.c b/jdhuff.c
index bb2b848..003cee7 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -42,14 +42,14 @@
  */
 
 #ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
+#define ASSIGN_STATE(dest, src)  ((dest) = (src))
 #else
 #if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
+#define ASSIGN_STATE(dest, src) \
+  ((dest).last_dc_val[0] = (src).last_dc_val[0], \
+   (dest).last_dc_val[1] = (src).last_dc_val[1], \
+   (dest).last_dc_val[2] = (src).last_dc_val[2], \
+   (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -88,9 +88,9 @@
  */
 
 METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
+start_pass_huff_decoder(j_decompress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, blkn, dctbl, actbl;
   d_derived_tbl **pdtbl;
   jpeg_component_info *compptr;
@@ -99,7 +99,7 @@
    * This ought to be an error condition, but we make it a warning because
    * there are some baseline files out there with all zeroes in these bytes.
    */
-  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
+  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
       cinfo->Ah != 0 || cinfo->Al != 0)
     WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
 
@@ -152,8 +152,8 @@
  */
 
 GLOBAL(void)
-jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
-                         d_derived_tbl **pdtbl)
+jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
+                        d_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   d_derived_tbl *dtbl;
@@ -178,7 +178,7 @@
   /* Allocate a workspace if we haven't already done so. */
   if (*pdtbl == NULL)
     *pdtbl = (d_derived_tbl *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(d_derived_tbl));
   dtbl = *pdtbl;
   dtbl->pub = htbl;             /* fill in back link */
@@ -187,11 +187,11 @@
 
   p = 0;
   for (l = 1; l <= 16; l++) {
-    i = (int) htbl->bits[l];
+    i = (int)htbl->bits[l];
     if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
-      huffsize[p++] = (char) l;
+      huffsize[p++] = (char)l;
   }
   huffsize[p] = 0;
   numsymbols = p;
@@ -203,14 +203,14 @@
   si = huffsize[0];
   p = 0;
   while (huffsize[p]) {
-    while (((int) huffsize[p]) == si) {
+    while (((int)huffsize[p]) == si) {
       huffcode[p++] = code;
       code++;
     }
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((JLONG) code) >= (((JLONG) 1) << si))
+    if (((JLONG)code) >= (((JLONG)1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -224,9 +224,9 @@
       /* valoffset[l] = huffval[] index of 1st symbol of code length l,
        * minus the minimum code of length l
        */
-      dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
+      dtbl->valoffset[l] = (JLONG)p - (JLONG)huffcode[p];
       p += htbl->bits[l];
-      dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
+      dtbl->maxcode[l] = huffcode[p - 1]; /* maximum code of length l */
     } else {
       dtbl->maxcode[l] = -1;    /* -1 if no codes of this length */
     }
@@ -241,16 +241,16 @@
    * with that code.
    */
 
-   for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
-     dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
+  for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+    dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
 
   p = 0;
   for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
-    for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
+    for (i = 1; i <= (int)htbl->bits[l]; i++, p++) {
       /* l = current code's length, p = its index in huffcode[] & huffval[]. */
       /* Generate left-justified code followed by all possible bit sequences */
-      lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
-      for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
+      lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+      for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--) {
         dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
         lookbits++;
       }
@@ -291,14 +291,14 @@
 #ifdef SLOW_SHIFT_32
 #define MIN_GET_BITS  15        /* minimum allowable value */
 #else
-#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
+#define MIN_GET_BITS  (BIT_BUF_SIZE - 7)
 #endif
 
 
 GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state *state,
-                      register bit_buf_type get_buffer, register int bits_left,
-                      int nbits)
+jpeg_fill_bit_buffer(bitread_working_state *state,
+                     register bit_buf_type get_buffer, register int bits_left,
+                     int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
   /* Copy heavily used state fields into locals (hopefully registers) */
@@ -316,7 +316,7 @@
 
       /* Attempt to read a byte */
       if (bytes_in_buffer == 0) {
-        if (! (*cinfo->src->fill_input_buffer) (cinfo))
+        if (!(*cinfo->src->fill_input_buffer) (cinfo))
           return FALSE;
         next_input_byte = cinfo->src->next_input_byte;
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
@@ -333,7 +333,7 @@
          */
         do {
           if (bytes_in_buffer == 0) {
-            if (! (*cinfo->src->fill_input_buffer) (cinfo))
+            if (!(*cinfo->src->fill_input_buffer) (cinfo))
               return FALSE;
             next_input_byte = cinfo->src->next_input_byte;
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
@@ -365,7 +365,7 @@
       bits_left += 8;
     } /* end while */
   } else {
-  no_more_bytes:
+no_more_bytes:
     /* We get here if we've read the marker that terminates the compressed
      * data segment.  There should be enough bits in the buffer register
      * to satisfy the request; if so, no problem.
@@ -376,7 +376,7 @@
        * We use a nonvolatile flag to ensure that only one warning message
        * appears per data segment.
        */
-      if (! cinfo->entropy->insufficient_data) {
+      if (!cinfo->entropy->insufficient_data) {
         WARNMS(cinfo, JWRN_HIT_MARKER);
         cinfo->entropy->insufficient_data = TRUE;
       }
@@ -400,8 +400,7 @@
    handle markers.  We have to hand off any blocks with markers to the
    slower routines. */
 
-#define GET_BYTE \
-{ \
+#define GET_BYTE { \
   register int c0, c1; \
   c0 = GETJOCTET(*buffer++); \
   c1 = GETJOCTET(*buffer); \
@@ -421,7 +420,7 @@
   } \
 }
 
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -446,9 +445,9 @@
  */
 
 GLOBAL(int)
-jpeg_huff_decode (bitread_working_state *state,
-                  register bit_buf_type get_buffer, register int bits_left,
-                  d_derived_tbl *htbl, int min_bits)
+jpeg_huff_decode(bitread_working_state *state,
+                 register bit_buf_type get_buffer, register int bits_left,
+                 d_derived_tbl *htbl, int min_bits)
 {
   register int l = min_bits;
   register JLONG code;
@@ -480,7 +479,7 @@
     return 0;                   /* fake a zero as the safest result */
   }
 
-  return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ];
+  return htbl->pub->huffval[(int)(code + htbl->valoffset[l])];
 }
 
 
@@ -493,21 +492,25 @@
 #ifdef AVOID_TABLES
 
 #define NEG_1 ((unsigned int)-1)
-#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1)))
+#define HUFF_EXTEND(x, s) \
+  ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
 
 #else
 
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
 
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = {   /* entry n is 2**(n-1) */
+  0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+  0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
 
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+  0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+  ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+  ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+  ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
 
 #endif /* AVOID_TABLES */
 
@@ -518,9 +521,9 @@
  */
 
 LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci;
 
   /* Throw away any unused bits remaining in bit buffer; */
@@ -529,7 +532,7 @@
   entropy->bitstate.bits_left = 0;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     return FALSE;
 
   /* Re-initialize DC predictions to 0 */
@@ -552,16 +555,16 @@
 
 
 LOCAL(boolean)
-decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   BITREAD_STATE_VARS;
   int blkn;
   savable_state state;
   /* Outer loop handles each block in the MCU */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
   ASSIGN_STATE(state, entropy->saved);
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
@@ -587,7 +590,7 @@
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
-        (*block)[0] = (JCOEF) s;
+        (*block)[0] = (JCOEF)s;
       }
     }
 
@@ -610,7 +613,7 @@
            * Note: the extra entries in jpeg_natural_order[] will save us
            * if k >= DCTSIZE2, which could happen if the data is corrupted.
            */
-          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+          (*block)[jpeg_natural_order[k]] = (JCOEF)s;
         } else {
           if (r != 15)
             break;
@@ -642,16 +645,16 @@
   }
 
   /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
   ASSIGN_STATE(entropy->saved, state);
   return TRUE;
 }
 
 
 LOCAL(boolean)
-decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   BITREAD_STATE_VARS;
   JOCTET *buffer;
   int blkn;
@@ -659,8 +662,8 @@
   /* Outer loop handles each block in the MCU */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-  buffer = (JOCTET *) br_state.next_input_byte;
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+  buffer = (JOCTET *)br_state.next_input_byte;
   ASSIGN_STATE(state, entropy->saved);
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
@@ -681,7 +684,7 @@
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block)
-        (*block)[0] = (JCOEF) s;
+        (*block)[0] = (JCOEF)s;
     }
 
     if (entropy->ac_needed[blkn] && block) {
@@ -696,7 +699,7 @@
           FILL_BIT_BUFFER_FAST
           r = GET_BITS(s);
           s = HUFF_EXTEND(r, s);
-          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+          (*block)[jpeg_natural_order[k]] = (JCOEF)s;
         } else {
           if (r != 15) break;
           k += 15;
@@ -729,7 +732,7 @@
 
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
   ASSIGN_STATE(entropy->saved, state);
   return TRUE;
 }
@@ -753,33 +756,32 @@
 #define BUFSIZE (DCTSIZE2 * 8)
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int usefast = 1;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
     usefast = 0;
   }
 
-  if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU
-    || cinfo->unread_marker != 0)
+  if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU ||
+      cinfo->unread_marker != 0)
     usefast = 0;
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     if (usefast) {
       if (!decode_mcu_fast(cinfo, MCU_data)) goto use_slow;
-    }
-    else {
-      use_slow:
+    } else {
+use_slow:
       if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
     }
 
@@ -797,7 +799,7 @@
  */
 
 GLOBAL(void)
-jinit_huff_decoder (j_decompress_ptr cinfo)
+jinit_huff_decoder(j_decompress_ptr cinfo)
 {
   huff_entropy_ptr entropy;
   int i;
@@ -806,12 +808,12 @@
      are the default tables.  Thus, if the tables are not set by the time
      the Huffman decoder is initialized (usually within the body of
      jpeg_start_decompress()), we set them to default values. */
-  std_huff_tables((j_common_ptr) cinfo);
+  std_huff_tables((j_common_ptr)cinfo);
 
   entropy = (huff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(huff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass_huff_decoder;
   entropy->pub.decode_mcu = decode_mcu;
 
diff --git a/jdhuff.h b/jdhuff.h
index 87d4465..b2a4668 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -43,13 +43,12 @@
    * if too long.  The next 8 bits of each entry contain the
    * symbol.
    */
-  int lookup[1<<HUFF_LOOKAHEAD];
+  int lookup[1 << HUFF_LOOKAHEAD];
 } d_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
-        (j_decompress_ptr cinfo, boolean isDC, int tblno,
-         d_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
+                                     int tblno, d_derived_tbl **pdtbl);
 
 
 /*
@@ -74,7 +73,7 @@
 #error Cannot determine word size
 #endif
 
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
 
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
@@ -113,23 +112,23 @@
 } bitread_working_state;
 
 /* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS  \
-        register bit_buf_type get_buffer;  \
-        register int bits_left;  \
-        bitread_working_state br_state
+#define BITREAD_STATE_VARS \
+  register bit_buf_type get_buffer; \
+  register int bits_left; \
+  bitread_working_state br_state
 
-#define BITREAD_LOAD_STATE(cinfop,permstate)  \
-        br_state.cinfo = cinfop; \
-        br_state.next_input_byte = cinfop->src->next_input_byte; \
-        br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-        get_buffer = permstate.get_buffer; \
-        bits_left = permstate.bits_left;
+#define BITREAD_LOAD_STATE(cinfop, permstate) \
+  br_state.cinfo = cinfop; \
+  br_state.next_input_byte = cinfop->src->next_input_byte; \
+  br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+  get_buffer = permstate.get_buffer; \
+  bits_left = permstate.bits_left;
 
-#define BITREAD_SAVE_STATE(cinfop,permstate)  \
-        cinfop->src->next_input_byte = br_state.next_input_byte; \
-        cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-        permstate.get_buffer = get_buffer; \
-        permstate.bits_left = bits_left
+#define BITREAD_SAVE_STATE(cinfop, permstate) \
+  cinfop->src->next_input_byte = br_state.next_input_byte; \
+  cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+  permstate.get_buffer = get_buffer; \
+  permstate.bits_left = bits_left
 
 /*
  * These macros provide the in-line portion of bit fetching.
@@ -137,7 +136,7 @@
  * before using GET_BITS, PEEK_BITS, or DROP_BITS.
  * The variables get_buffer and bits_left are assumed to be locals,
  * but the state struct might not be (jpeg_huff_decode needs this).
- *      CHECK_BIT_BUFFER(state,n,action);
+ *      CHECK_BIT_BUFFER(state, n, action);
  *              Ensure there are N bits in get_buffer; if suspend, take action.
  *      val = GET_BITS(n);
  *              Fetch next N bits.
@@ -149,25 +148,27 @@
  * is evaluated multiple times.
  */
 
-#define CHECK_BIT_BUFFER(state,nbits,action) \
-        { if (bits_left < (nbits)) {  \
-            if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-              { action; }  \
-            get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+#define CHECK_BIT_BUFFER(state, nbits, action) { \
+  if (bits_left < (nbits)) { \
+    if (!jpeg_fill_bit_buffer(&(state), get_buffer, bits_left, nbits)) \
+      { action; } \
+    get_buffer = (state).get_buffer; bits_left = (state).bits_left; \
+  } \
+}
 
 #define GET_BITS(nbits) \
-        (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+  (((int)(get_buffer >> (bits_left -= (nbits)))) & ((1 << (nbits)) - 1))
 
 #define PEEK_BITS(nbits) \
-        (((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
+  (((int)(get_buffer >> (bits_left -  (nbits)))) & ((1 << (nbits)) - 1))
 
 #define DROP_BITS(nbits) \
-        (bits_left -= (nbits))
+  (bits_left -= (nbits))
 
 /* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
-        (bitread_working_state *state, register bit_buf_type get_buffer,
-         register int bits_left, int nbits);
+EXTERN(boolean) jpeg_fill_bit_buffer(bitread_working_state *state,
+                                     register bit_buf_type get_buffer,
+                                     register int bits_left, int nbits);
 
 
 /*
@@ -187,10 +188,11 @@
  * 3. jpeg_huff_decode returns -1 if forced to suspend.
  */
 
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
+#define HUFF_DECODE(result, state, htbl, failaction, slowlabel) { \
+  register int nb, look; \
   if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
+    if (!jpeg_fill_bit_buffer(&state, get_buffer, bits_left, 0)) \
+      { failaction; } \
     get_buffer = state.get_buffer; bits_left = state.bits_left; \
     if (bits_left < HUFF_LOOKAHEAD) { \
       nb = 1; goto slowlabel; \
@@ -202,13 +204,14 @@
     result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
   } else { \
 slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-        { failaction; } \
+    if ((result = \
+         jpeg_huff_decode(&state, get_buffer, bits_left, htbl, nb)) < 0) \
+      { failaction; } \
     get_buffer = state.get_buffer; bits_left = state.bits_left; \
   } \
 }
 
-#define HUFF_DECODE_FAST(s,nb,htbl) \
+#define HUFF_DECODE_FAST(s, nb, htbl) \
   FILL_BIT_BUFFER_FAST; \
   s = PEEK_BITS(HUFF_LOOKAHEAD); \
   s = htbl->lookup[s]; \
@@ -225,10 +228,11 @@
       s |= GET_BITS(1); \
       nb++; \
     } \
-    s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \
+    s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
   }
 
 /* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
-        (bitread_working_state *state, register bit_buf_type get_buffer,
-         register int bits_left, d_derived_tbl *htbl, int min_bits);
+EXTERN(int) jpeg_huff_decode(bitread_working_state *state,
+                             register bit_buf_type get_buffer,
+                             register int bits_left, d_derived_tbl *htbl,
+                             int min_bits);
diff --git a/jdicc.c b/jdicc.c
new file mode 100644
index 0000000..7224695
--- /dev/null
+++ b/jdicc.c
@@ -0,0 +1,171 @@
+/*
+ * jdicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to read International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files.  The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers.  The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to get the profile data from a JPEG file while reading it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc() */
+extern void *malloc(size_t size);
+#endif
+
+
+#define ICC_MARKER  (JPEG_APP0 + 2)     /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN  14            /* size of non-profile data in APP2 */
+
+
+/*
+ * Test whether a saved marker is an APP2 marker tagged "ICC_PROFILE\0".
+ */
+
+LOCAL(boolean)
+marker_is_icc(jpeg_saved_marker_ptr marker)
+{
+  return
+    marker->marker == ICC_MARKER &&
+    marker->data_length >= ICC_OVERHEAD_LEN &&  /* 12-byte ID + 2 bytes */
+    /* verify the identifying string (ASCII "ICC_PROFILE" plus a NUL) */
+    GETJOCTET(marker->data[0]) == 0x49 &&       /* 'I' */
+    GETJOCTET(marker->data[1]) == 0x43 &&       /* 'C' */
+    GETJOCTET(marker->data[2]) == 0x43 &&       /* 'C' */
+    GETJOCTET(marker->data[3]) == 0x5F &&       /* '_' */
+    GETJOCTET(marker->data[4]) == 0x50 &&       /* 'P' */
+    GETJOCTET(marker->data[5]) == 0x52 &&       /* 'R' */
+    GETJOCTET(marker->data[6]) == 0x4F &&       /* 'O' */
+    GETJOCTET(marker->data[7]) == 0x46 &&       /* 'F' */
+    GETJOCTET(marker->data[8]) == 0x49 &&       /* 'I' */
+    GETJOCTET(marker->data[9]) == 0x4C &&       /* 'L' */
+    GETJOCTET(marker->data[10]) == 0x45 &&      /* 'E' */
+    GETJOCTET(marker->data[11]) == 0x0;         /* terminating NUL */
+}
+
+
+/*
+ * See if there was an ICC profile in the JPEG file being read; if so,
+ * reassemble and return the profile data.
+ *
+ * TRUE is returned if a complete ICC profile was found; *icc_data_ptr then
+ * points to the data and *icc_data_len holds its length.  FALSE is returned
+ * if no ICC markers exist or (after a JWRN_BOGUS_ICC warning) they are bogus.
+ *
+ * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be
+ * freed by the caller with free() when the caller no longer needs it.
+ * (Alternatively, we could write this routine to use the IJG library's memory
+ * allocator, so that the data would be freed implicitly when
+ * jpeg_finish_decompress() is called.  But it seems likely that many
+ * applications will prefer to have the data stick around after decompression
+ * finishes.)
+ */
+
+GLOBAL(boolean)
+jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                      unsigned int *icc_data_len)
+{
+  jpeg_saved_marker_ptr marker;
+  int num_markers = 0;          /* stays 0 until an ICC marker is seen */
+  int seq_no;
+  JOCTET *icc_data;
+  unsigned int total_length;
+#define MAX_SEQ_NO  255         /* sufficient since marker numbers are bytes */
+  char marker_present[MAX_SEQ_NO + 1];      /* 1 if marker found */
+  unsigned int data_length[MAX_SEQ_NO + 1]; /* size of profile data in marker */
+  unsigned int data_offset[MAX_SEQ_NO + 1]; /* offset for data in marker */
+
+  if (icc_data_ptr == NULL || icc_data_len == NULL)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  if (cinfo->global_state < DSTATE_READY)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  *icc_data_ptr = NULL;         /* avoid confusion if FALSE return */
+  *icc_data_len = 0;
+
+  /* This first pass over the saved markers discovers whether there are
+   * any ICC markers and verifies the consistency of the marker numbering.
+   */
+
+  for (seq_no = 1; seq_no <= MAX_SEQ_NO; seq_no++)
+    marker_present[seq_no] = 0;
+
+  for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker_is_icc(marker)) {
+      if (num_markers == 0)
+        num_markers = GETJOCTET(marker->data[13]);  /* total marker count */
+      else if (num_markers != GETJOCTET(marker->data[13])) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
+        return FALSE;
+      }
+      seq_no = GETJOCTET(marker->data[12]);  /* 1-based sequence number */
+      if (seq_no <= 0 || seq_no > num_markers) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
+        return FALSE;
+      }
+      if (marker_present[seq_no]) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* duplicate sequence numbers */
+        return FALSE;
+      }
+      marker_present[seq_no] = 1;
+      data_length[seq_no] = marker->data_length - ICC_OVERHEAD_LEN;
+    }
+  }
+
+  if (num_markers == 0)
+    return FALSE;               /* no ICC markers were found */
+
+  /* Check for missing markers, count total space needed,
+   * compute offset of each marker's part of the data.
+   */
+
+  total_length = 0;
+  for (seq_no = 1; seq_no <= num_markers; seq_no++) {
+    if (marker_present[seq_no] == 0) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* missing sequence number */
+      return FALSE;
+    }
+    data_offset[seq_no] = total_length;
+    total_length += data_length[seq_no];
+  }
+
+  if (total_length == 0) {
+    WARNMS(cinfo, JWRN_BOGUS_ICC);  /* found only empty markers? */
+    return FALSE;
+  }
+
+  /* Allocate space for assembled data */
+  icc_data = (JOCTET *)malloc(total_length * sizeof(JOCTET));
+  if (icc_data == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11);  /* oops, out of memory */
+
+  /* and fill it in */
+  for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker_is_icc(marker)) {
+      JOCTET FAR *src_ptr;
+      JOCTET *dst_ptr;
+      unsigned int length;
+      seq_no = GETJOCTET(marker->data[12]);
+      dst_ptr = icc_data + data_offset[seq_no];
+      src_ptr = marker->data + ICC_OVERHEAD_LEN;  /* skip ID and counters */
+      length = data_length[seq_no];
+      while (length--) {        /* copy this marker's payload byte by byte */
+        *dst_ptr++ = *src_ptr++;
+      }
+    }
+  }
+
+  *icc_data_ptr = icc_data;
+  *icc_data_len = total_length;
+
+  return TRUE;
+}
diff --git a/jdinput.c b/jdinput.c
index 32a6b42..fbf8b29 100644
--- a/jdinput.c
+++ b/jdinput.c
@@ -33,7 +33,7 @@
 
 
 /* Forward declarations */
-METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
+METHODDEF(int) consume_markers(j_decompress_ptr cinfo);
 
 
 /*
@@ -41,16 +41,16 @@
  */
 
 LOCAL(void)
-initial_setup (j_decompress_ptr cinfo)
+initial_setup(j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
 {
   int ci;
   jpeg_component_info *compptr;
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+  if ((long)cinfo->image_height > (long)JPEG_MAX_DIMENSION ||
+      (long)cinfo->image_width > (long)JPEG_MAX_DIMENSION)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
 
   /* For now, precision must match compiled-in value... */
   if (cinfo->data_precision != BITS_IN_JSAMPLE)
@@ -66,8 +66,10 @@
   cinfo->max_v_samp_factor = 1;
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+    if (compptr->h_samp_factor <= 0 ||
+        compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+        compptr->v_samp_factor <= 0 ||
+        compptr->v_samp_factor > MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
                                    compptr->h_samp_factor);
@@ -75,10 +77,10 @@
                                    compptr->v_samp_factor);
   }
 
-#if JPEG_LIB_VERSION >=80
-    cinfo->block_size = DCTSIZE;
-    cinfo->natural_order = jpeg_natural_order;
-    cinfo->lim_Se = DCTSIZE2-1;
+#if JPEG_LIB_VERSION >= 80
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2 - 1;
 #endif
 
   /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
@@ -101,11 +103,11 @@
 #endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
     /* Set the first and last MCU columns to decompress from multi-scan images.
      * By default, decompress all of the MCU columns.
      */
@@ -117,11 +119,11 @@
      */
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-                    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+                    (long)cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-                    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+                    (long)cinfo->max_v_samp_factor);
     /* Mark component needed, until color conversion says otherwise */
     compptr->component_needed = TRUE;
     /* Mark no quantization table yet saved for component */
@@ -130,8 +132,8 @@
 
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
-                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long)cinfo->image_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -142,7 +144,7 @@
 
 
 LOCAL(void)
-per_scan_setup (j_decompress_ptr cinfo)
+per_scan_setup(j_decompress_ptr cinfo)
 /* Do computations that are needed before processing a JPEG scan */
 /* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */
 {
@@ -167,7 +169,7 @@
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
      */
-    tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+    tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
 
@@ -184,11 +186,11 @@
 
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
-                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
     cinfo->blocks_in_MCU = 0;
 
@@ -198,12 +200,13 @@
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width *
+                                  compptr->_DCT_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
-      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+      tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
       compptr->last_col_width = tmp;
-      tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+      tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
       if (tmp == 0) tmp = compptr->MCU_height;
       compptr->last_row_height = tmp;
       /* Prepare array describing MCU composition */
@@ -241,7 +244,7 @@
  */
 
 LOCAL(void)
-latch_quant_tables (j_decompress_ptr cinfo)
+latch_quant_tables(j_decompress_ptr cinfo)
 {
   int ci, qtblno;
   jpeg_component_info *compptr;
@@ -259,7 +262,7 @@
       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
     /* OK, save away the quantization table */
     qtbl = (JQUANT_TBL *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(JQUANT_TBL));
     MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
     compptr->quant_table = qtbl;
@@ -275,7 +278,7 @@
  */
 
 METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
 {
   per_scan_setup(cinfo);
   latch_quant_tables(cinfo);
@@ -292,7 +295,7 @@
  */
 
 METHODDEF(void)
-finish_input_pass (j_decompress_ptr cinfo)
+finish_input_pass(j_decompress_ptr cinfo)
 {
   cinfo->inputctl->consume_input = consume_markers;
 }
@@ -309,9 +312,9 @@
  */
 
 METHODDEF(int)
-consume_markers (j_decompress_ptr cinfo)
+consume_markers(j_decompress_ptr cinfo)
 {
-  my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+  my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
   int val;
 
   if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
@@ -329,7 +332,7 @@
        * responsible for enforcing this sequencing.
        */
     } else {                    /* 2nd or later SOS marker */
-      if (! inputctl->pub.has_multiple_scans)
+      if (!inputctl->pub.has_multiple_scans)
         ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
       start_input_pass(cinfo);
     }
@@ -360,16 +363,16 @@
  */
 
 METHODDEF(void)
-reset_input_controller (j_decompress_ptr cinfo)
+reset_input_controller(j_decompress_ptr cinfo)
 {
-  my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+  my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
 
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
   inputctl->inheaders = TRUE;
   /* Reset other modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->marker->reset_marker_reader) (cinfo);
   /* Reset progression state -- would be cleaner if entropy decoder did this */
   cinfo->coef_bits = NULL;
@@ -382,15 +385,15 @@
  */
 
 GLOBAL(void)
-jinit_input_controller (j_decompress_ptr cinfo)
+jinit_input_controller(j_decompress_ptr cinfo)
 {
   my_inputctl_ptr inputctl;
 
   /* Create subobject in permanent pool */
   inputctl = (my_inputctl_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                 sizeof(my_input_controller));
-  cinfo->inputctl = (struct jpeg_input_controller *) inputctl;
+  cinfo->inputctl = (struct jpeg_input_controller *)inputctl;
   /* Initialize method pointers */
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.reset_input_controller = reset_input_controller;
diff --git a/jdmainct.c b/jdmainct.c
index ebb069b..50301d6 100644
--- a/jdmainct.c
+++ b/jdmainct.c
@@ -112,26 +112,29 @@
 
 
 /* Forward declarations */
-METHODDEF(void) process_data_simple_main
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
-METHODDEF(void) process_data_context_main
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION *out_row_ctr,
+                                         JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
+                                          JSAMPARRAY output_buf,
+                                          JDIMENSION *out_row_ctr,
+                                          JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) process_data_crank_post
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION *out_row_ctr,
+                                        JDIMENSION out_rows_avail);
 #endif
 
 
 LOCAL(void)
-alloc_funny_pointers (j_decompress_ptr cinfo)
+alloc_funny_pointers(j_decompress_ptr cinfo)
 /* Allocate space for the funny pointer lists.
  * This is done only once, not once per pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -141,7 +144,7 @@
    * We alloc both arrays with one call to save a few cycles.
    */
   main_ptr->xbuffer[0] = (JSAMPIMAGE)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 cinfo->num_components * 2 * sizeof(JSAMPARRAY));
   main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
 
@@ -153,7 +156,7 @@
      * We alloc both pointer lists with one call to save a few cycles.
      */
     xbuf = (JSAMPARRAY)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
     xbuf += rgroup;             /* want one row group at negative offsets */
     main_ptr->xbuffer[0][ci] = xbuf;
@@ -164,7 +167,7 @@
 
 
 LOCAL(void)
-make_funny_pointers (j_decompress_ptr cinfo)
+make_funny_pointers(j_decompress_ptr cinfo)
 /* Create the funny pointer lists discussed in the comments above.
  * The actual workspace is already allocated (in main_ptr->buffer),
  * and the space for the pointer lists is allocated too.
@@ -172,7 +175,7 @@
  * This will be repeated at the beginning of each pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -191,8 +194,8 @@
     }
     /* In the second list, put the last four row groups in swapped order */
     for (i = 0; i < rgroup * 2; i++) {
-      xbuf1[rgroup*(M-2) + i] = buf[rgroup*M + i];
-      xbuf1[rgroup*M + i] = buf[rgroup*(M-2) + i];
+      xbuf1[rgroup * (M - 2) + i] = buf[rgroup * M + i];
+      xbuf1[rgroup * M + i] = buf[rgroup * (M - 2) + i];
     }
     /* The wraparound pointers at top and bottom will be filled later
      * (see set_wraparound_pointers, below).  Initially we want the "above"
@@ -207,13 +210,13 @@
 
 
 LOCAL(void)
-set_bottom_pointers (j_decompress_ptr cinfo)
+set_bottom_pointers(j_decompress_ptr cinfo)
 /* Change the pointer lists to duplicate the last sample row at the bottom
  * of the image.  whichptr indicates which xbuffer holds the final iMCU row.
  * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup, iMCUheight, rows_left;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
@@ -224,20 +227,20 @@
     iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
     rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
     /* Count nondummy sample rows remaining for this component */
-    rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
+    rows_left = (int)(compptr->downsampled_height % (JDIMENSION)iMCUheight);
     if (rows_left == 0) rows_left = iMCUheight;
     /* Count nondummy row groups.  Should get same answer for each component,
      * so we need only do it once.
      */
     if (ci == 0) {
-      main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+      main_ptr->rowgroups_avail = (JDIMENSION)((rows_left - 1) / rgroup + 1);
     }
     /* Duplicate the last real sample row rgroup*2 times; this pads out the
      * last partial rowgroup and ensures at least one full rowgroup of context.
      */
     xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
     for (i = 0; i < rgroup * 2; i++) {
-      xbuf[rows_left + i] = xbuf[rows_left-1];
+      xbuf[rows_left + i] = xbuf[rows_left - 1];
     }
   }
 }
@@ -248,9 +251,9 @@
  */
 
 METHODDEF(void)
-start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
@@ -286,22 +289,21 @@
  */
 
 METHODDEF(void)
-process_data_simple_main (j_decompress_ptr cinfo,
-                          JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                          JDIMENSION out_rows_avail)
+process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   JDIMENSION rowgroups_avail;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+  if (!main_ptr->buffer_full) {
+    if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION)cinfo->_min_DCT_scaled_size;
   /* Note: at the bottom of the image, we may pass extra garbage row groups
    * to the postprocessor.  The postprocessor has to check for bottom
    * of image anyway (at row resolution), so no point in us doing it too.
@@ -326,16 +328,15 @@
  */
 
 METHODDEF(void)
-process_data_context_main (j_decompress_ptr cinfo,
-                           JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                           JDIMENSION out_rows_avail)
+process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                          JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo,
-                                           main_ptr->xbuffer[main_ptr->whichptr]))
+  if (!main_ptr->buffer_full) {
+    if (!(*cinfo->coef->decompress_data) (cinfo,
+                                          main_ptr->xbuffer[main_ptr->whichptr]))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
     main_ptr->iMCU_row_ctr++;   /* count rows received */
@@ -349,9 +350,11 @@
   switch (main_ptr->context_state) {
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-                        output_buf, out_row_ctr, out_rows_avail);
+    (*cinfo->post->post_process_data) (cinfo,
+                                       main_ptr->xbuffer[main_ptr->whichptr],
+                                       &main_ptr->rowgroup_ctr,
+                                       main_ptr->rowgroups_avail, output_buf,
+                                       out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
@@ -361,7 +364,7 @@
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
     main_ptr->rowgroup_ctr = 0;
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
+    main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size - 1);
     /* Check for bottom of image: if so, tweak pointers to "duplicate"
      * the last sample row, and adjust rowgroups_avail to ignore padding rows.
      */
@@ -371,9 +374,11 @@
     /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-                        output_buf, out_row_ctr, out_rows_avail);
+    (*cinfo->post->post_process_data) (cinfo,
+                                       main_ptr->xbuffer[main_ptr->whichptr],
+                                       &main_ptr->rowgroup_ctr,
+                                       main_ptr->rowgroups_avail, output_buf,
+                                       out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
@@ -384,8 +389,8 @@
     main_ptr->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
-    main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
+    main_ptr->rowgroup_ctr = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 1);
+    main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 2);
     main_ptr->context_state = CTX_POSTPONED_ROW;
   }
 }
@@ -400,12 +405,11 @@
 #ifdef QUANT_2PASS_SUPPORTED
 
 METHODDEF(void)
-process_data_crank_post (j_decompress_ptr cinfo,
-                         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                         JDIMENSION out_rows_avail)
+process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                        JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
-                                     (JDIMENSION *) NULL, (JDIMENSION) 0,
+  (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL,
+                                     (JDIMENSION *)NULL, (JDIMENSION)0,
                                      output_buf, out_row_ctr, out_rows_avail);
 }
 
@@ -417,16 +421,16 @@
  */
 
 GLOBAL(void)
-jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
   main_ptr = (my_main_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_main_controller));
-  cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
+  cinfo->main = (struct jpeg_d_main_controller *)main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
   if (need_full_buffer)         /* shouldn't happen */
@@ -449,8 +453,8 @@
     rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
       cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                        ((j_common_ptr)cinfo, JPOOL_IMAGE,
                          compptr->width_in_blocks * compptr->_DCT_scaled_size,
-                         (JDIMENSION) (rgroup * ngroups));
+                         (JDIMENSION)(rgroup * ngroups));
   }
 }
diff --git a/jdmainct.h b/jdmainct.h
index 3090301..37b201c 100644
--- a/jdmainct.h
+++ b/jdmainct.h
@@ -44,12 +44,12 @@
 
 
 LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
+set_wraparound_pointers(j_decompress_ptr cinfo)
 /* Set up the "wraparound" pointers at top and bottom of the pointer lists.
  * This changes the pointer list state from top-of-image to the normal state.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -62,10 +62,10 @@
     xbuf0 = main_ptr->xbuffer[0][ci];
     xbuf1 = main_ptr->xbuffer[1][ci];
     for (i = 0; i < rgroup; i++) {
-      xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
-      xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
-      xbuf0[rgroup*(M+2) + i] = xbuf0[i];
-      xbuf1[rgroup*(M+2) + i] = xbuf1[i];
+      xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
+      xbuf1[i - rgroup] = xbuf1[rgroup * (M + 1) + i];
+      xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
+      xbuf1[rgroup * (M + 2) + i] = xbuf1[i];
     }
   }
 }
diff --git a/jdmarker.c b/jdmarker.c
index e3b612c..c9c7ef6 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -119,50 +119,50 @@
  */
 
 /* Declare and initialize local copies of input pointer/count */
-#define INPUT_VARS(cinfo)  \
-        struct jpeg_source_mgr *datasrc = (cinfo)->src;  \
-        const JOCTET *next_input_byte = datasrc->next_input_byte;  \
-        size_t bytes_in_buffer = datasrc->bytes_in_buffer
+#define INPUT_VARS(cinfo) \
+  struct jpeg_source_mgr *datasrc = (cinfo)->src; \
+  const JOCTET *next_input_byte = datasrc->next_input_byte; \
+  size_t bytes_in_buffer = datasrc->bytes_in_buffer
 
 /* Unload the local copies --- do this only at a restart boundary */
-#define INPUT_SYNC(cinfo)  \
-        ( datasrc->next_input_byte = next_input_byte,  \
-          datasrc->bytes_in_buffer = bytes_in_buffer )
+#define INPUT_SYNC(cinfo) \
+  ( datasrc->next_input_byte = next_input_byte, \
+    datasrc->bytes_in_buffer = bytes_in_buffer )
 
 /* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
-#define INPUT_RELOAD(cinfo)  \
-        ( next_input_byte = datasrc->next_input_byte,  \
-          bytes_in_buffer = datasrc->bytes_in_buffer )
+#define INPUT_RELOAD(cinfo) \
+  ( next_input_byte = datasrc->next_input_byte, \
+    bytes_in_buffer = datasrc->bytes_in_buffer )
 
 /* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
  * Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
  * but we must reload the local copies after a successful fill.
  */
-#define MAKE_BYTE_AVAIL(cinfo,action)  \
-        if (bytes_in_buffer == 0) {  \
-          if (! (*datasrc->fill_input_buffer) (cinfo))  \
-            { action; }  \
-          INPUT_RELOAD(cinfo);  \
-        }
+#define MAKE_BYTE_AVAIL(cinfo, action) \
+  if (bytes_in_buffer == 0) { \
+    if (!(*datasrc->fill_input_buffer) (cinfo)) \
+      { action; } \
+    INPUT_RELOAD(cinfo); \
+  }
 
 /* Read a byte into variable V.
  * If must suspend, take the specified action (typically "return FALSE").
  */
-#define INPUT_BYTE(cinfo,V,action)  \
-        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V = GETJOCTET(*next_input_byte++); )
+#define INPUT_BYTE(cinfo, V, action) \
+  MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V = GETJOCTET(*next_input_byte++); )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
  */
-#define INPUT_2BYTES(cinfo,V,action)  \
-        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
-                  MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V += GETJOCTET(*next_input_byte++); )
+#define INPUT_2BYTES(cinfo, V, action) \
+  MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V = ((unsigned int)GETJOCTET(*next_input_byte++)) << 8; \
+            MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V += GETJOCTET(*next_input_byte++); )
 
 
 /*
@@ -197,7 +197,7 @@
 
 
 LOCAL(boolean)
-get_soi (j_decompress_ptr cinfo)
+get_soi(j_decompress_ptr cinfo)
 /* Process an SOI marker */
 {
   int i;
@@ -237,7 +237,7 @@
 
 
 LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
 /* Process a SOFn marker */
 {
   JLONG length;
@@ -258,7 +258,7 @@
   length -= 8;
 
   TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
-           (int) cinfo->image_width, (int) cinfo->image_height,
+           (int)cinfo->image_width, (int)cinfo->image_height,
            cinfo->num_components);
 
   if (cinfo->marker->saw_SOF)
@@ -267,16 +267,16 @@
   /* We don't support files in which the image height is initially specified */
   /* as 0 and is later redefined by DNL.  As long as we have to check that,  */
   /* might as well have a general sanity check. */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
-      || cinfo->num_components <= 0)
+  if (cinfo->image_height <= 0 || cinfo->image_width <= 0 ||
+      cinfo->num_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   if (length != (cinfo->num_components * 3))
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   if (cinfo->comp_info == NULL) /* do only once, even if suspend */
-    cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small)
-                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    cinfo->comp_info = (jpeg_component_info *)(*cinfo->mem->alloc_small)
+                        ((j_common_ptr)cinfo, JPOOL_IMAGE,
                          cinfo->num_components * sizeof(jpeg_component_info));
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -301,7 +301,7 @@
 
 
 LOCAL(boolean)
-get_sos (j_decompress_ptr cinfo)
+get_sos(j_decompress_ptr cinfo)
 /* Process a SOS marker */
 {
   JLONG length;
@@ -309,7 +309,7 @@
   jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
-  if (! cinfo->marker->saw_SOF)
+  if (!cinfo->marker->saw_SOF)
     ERREXIT(cinfo, JERR_SOS_NO_SOF);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
@@ -341,7 +341,7 @@
 
     ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
 
-  id_found:
+id_found:
 
     cinfo->cur_comp_info[i] = compptr;
     compptr->dc_tbl_no = (c >> 4) & 15;
@@ -384,7 +384,7 @@
 #ifdef D_ARITH_CODING_SUPPORTED
 
 LOCAL(boolean)
-get_dac (j_decompress_ptr cinfo)
+get_dac(j_decompress_ptr cinfo)
 /* Process a DAC marker */
 {
   JLONG length;
@@ -402,14 +402,14 @@
 
     TRACEMS2(cinfo, 1, JTRC_DAC, index, val);
 
-    if (index < 0 || index >= (2*NUM_ARITH_TBLS))
+    if (index < 0 || index >= (2 * NUM_ARITH_TBLS))
       ERREXIT1(cinfo, JERR_DAC_INDEX, index);
 
     if (index >= NUM_ARITH_TBLS) { /* define AC table */
-      cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val;
+      cinfo->arith_ac_K[index - NUM_ARITH_TBLS] = (UINT8)val;
     } else {                    /* define DC table */
-      cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F);
-      cinfo->arith_dc_U[index] = (UINT8) (val >> 4);
+      cinfo->arith_dc_L[index] = (UINT8)(val & 0x0F);
+      cinfo->arith_dc_U[index] = (UINT8)(val >> 4);
       if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
         ERREXIT1(cinfo, JERR_DAC_VALUE, val);
     }
@@ -422,7 +422,7 @@
   return TRUE;
 }
 
-#else /* ! D_ARITH_CODING_SUPPORTED */
+#else /* !D_ARITH_CODING_SUPPORTED */
 
 #define get_dac(cinfo)  skip_variable(cinfo)
 
@@ -430,7 +430,7 @@
 
 
 LOCAL(boolean)
-get_dht (j_decompress_ptr cinfo)
+get_dht(j_decompress_ptr cinfo)
 /* Process a DHT marker */
 {
   JLONG length;
@@ -467,7 +467,7 @@
     /* Here we just do minimal validation of the counts to avoid walking
      * off the end of our table space.  jdhuff.c will check more carefully.
      */
-    if (count > 256 || ((JLONG) count) > length)
+    if (count > 256 || ((JLONG)count) > length)
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
     for (i = 0; i < count; i++)
@@ -489,7 +489,7 @@
     }
 
     if (*htblptr == NULL)
-      *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+      *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
 
     MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
     MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
@@ -504,7 +504,7 @@
 
 
 LOCAL(boolean)
-get_dqt (j_decompress_ptr cinfo)
+get_dqt(j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
   JLONG length;
@@ -527,7 +527,7 @@
       ERREXIT1(cinfo, JERR_DQT_INDEX, n);
 
     if (cinfo->quant_tbl_ptrs[n] == NULL)
-      cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+      cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr)cinfo);
     quant_ptr = cinfo->quant_tbl_ptrs[n];
 
     for (i = 0; i < DCTSIZE2; i++) {
@@ -536,20 +536,20 @@
       else
         INPUT_BYTE(cinfo, tmp, return FALSE);
       /* We convert the zigzag-order table to natural array order. */
-      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16)tmp;
     }
 
     if (cinfo->err->trace_level >= 2) {
       for (i = 0; i < DCTSIZE2; i += 8) {
         TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
-                 quant_ptr->quantval[i],   quant_ptr->quantval[i+1],
-                 quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
-                 quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
-                 quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
+                 quant_ptr->quantval[i],     quant_ptr->quantval[i + 1],
+                 quant_ptr->quantval[i + 2], quant_ptr->quantval[i + 3],
+                 quant_ptr->quantval[i + 4], quant_ptr->quantval[i + 5],
+                 quant_ptr->quantval[i + 6], quant_ptr->quantval[i + 7]);
       }
     }
 
-    length -= DCTSIZE2+1;
+    length -= DCTSIZE2 + 1;
     if (prec) length -= DCTSIZE2;
   }
 
@@ -562,7 +562,7 @@
 
 
 LOCAL(boolean)
-get_dri (j_decompress_ptr cinfo)
+get_dri(j_decompress_ptr cinfo)
 /* Process a DRI marker */
 {
   JLONG length;
@@ -598,14 +598,14 @@
 
 
 LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
-              unsigned int datalen, JLONG remaining)
+examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+             JLONG remaining)
 /* Examine first few bytes from an APP0.
  * Take appropriate action if it is a JFIF marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
  */
 {
-  JLONG totallen = (JLONG) datalen + remaining;
+  JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
       GETJOCTET(data[0]) == 0x4A &&
@@ -639,43 +639,43 @@
                GETJOCTET(data[12]), GETJOCTET(data[13]));
     totallen -= APP0_DATA_LEN;
     if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
-      TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
+        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG)3))
+      TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x58 &&
-      GETJOCTET(data[3]) == 0x58 &&
-      GETJOCTET(data[4]) == 0) {
+             GETJOCTET(data[0]) == 0x4A &&
+             GETJOCTET(data[1]) == 0x46 &&
+             GETJOCTET(data[2]) == 0x58 &&
+             GETJOCTET(data[3]) == 0x58 &&
+             GETJOCTET(data[4]) == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
     switch (GETJOCTET(data[5])) {
     case 0x10:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
     case 0x11:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int)totallen);
       break;
     case 0x13:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
       TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int) totallen);
+               GETJOCTET(data[5]), (int)totallen);
       break;
     }
   } else {
     /* Start of APP0 does not match "JFIF" or "JFXX", or too short */
-    TRACEMS1(cinfo, 1, JTRC_APP0, (int) totallen);
+    TRACEMS1(cinfo, 1, JTRC_APP0, (int)totallen);
   }
 }
 
 
 LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
-               unsigned int datalen, JLONG remaining)
+examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+              JLONG remaining)
 /* Examine first few bytes from an APP14.
  * Take appropriate action if it is an Adobe marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -696,16 +696,16 @@
     transform = GETJOCTET(data[11]);
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
-    cinfo->Adobe_transform = (UINT8) transform;
+    cinfo->Adobe_transform = (UINT8)transform;
   } else {
     /* Start of APP14 does not match "Adobe", or too short */
-    TRACEMS1(cinfo, 1, JTRC_APP14, (int) (datalen + remaining));
+    TRACEMS1(cinfo, 1, JTRC_APP14, (int)(datalen + remaining));
   }
 }
 
 
 METHODDEF(boolean)
-get_interesting_appn (j_decompress_ptr cinfo)
+get_interesting_appn(j_decompress_ptr cinfo)
 /* Process an APP0 or APP14 marker without saving it */
 {
   JLONG length;
@@ -720,7 +720,7 @@
   if (length >= APPN_DATA_LEN)
     numtoread = APPN_DATA_LEN;
   else if (length > 0)
-    numtoread = (unsigned int) length;
+    numtoread = (unsigned int)length;
   else
     numtoread = 0;
   for (i = 0; i < numtoread; i++)
@@ -730,10 +730,10 @@
   /* process it */
   switch (cinfo->unread_marker) {
   case M_APP0:
-    examine_app0(cinfo, (JOCTET *) b, numtoread, length);
+    examine_app0(cinfo, (JOCTET *)b, numtoread, length);
     break;
   case M_APP14:
-    examine_app14(cinfo, (JOCTET *) b, numtoread, length);
+    examine_app14(cinfo, (JOCTET *)b, numtoread, length);
     break;
   default:
     /* can't get here unless jpeg_save_markers chooses wrong processor */
@@ -744,7 +744,7 @@
   /* skip any remaining data -- could be lots */
   INPUT_SYNC(cinfo);
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -753,10 +753,10 @@
 #ifdef SAVE_MARKERS_SUPPORTED
 
 METHODDEF(boolean)
-save_marker (j_decompress_ptr cinfo)
+save_marker(j_decompress_ptr cinfo)
 /* Save an APPn or COM marker into the marker list */
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
   unsigned int bytes_read, data_length;
   JOCTET *data;
@@ -770,22 +770,22 @@
     if (length >= 0) {          /* watch out for bogus length word */
       /* figure out how much we want to save */
       unsigned int limit;
-      if (cinfo->unread_marker == (int) M_COM)
+      if (cinfo->unread_marker == (int)M_COM)
         limit = marker->length_limit_COM;
       else
-        limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
-      if ((unsigned int) length < limit)
-        limit = (unsigned int) length;
+        limit = marker->length_limit_APPn[cinfo->unread_marker - (int)M_APP0];
+      if ((unsigned int)length < limit)
+        limit = (unsigned int)length;
       /* allocate and initialize the marker item */
       cur_marker = (jpeg_saved_marker_ptr)
-        (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                     sizeof(struct jpeg_marker_struct) + limit);
       cur_marker->next = NULL;
-      cur_marker->marker = (UINT8) cinfo->unread_marker;
-      cur_marker->original_length = (unsigned int) length;
+      cur_marker->marker = (UINT8)cinfo->unread_marker;
+      cur_marker->original_length = (unsigned int)length;
       cur_marker->data_length = limit;
       /* data area is just beyond the jpeg_marker_struct */
-      data = cur_marker->data = (JOCTET *) (cur_marker + 1);
+      data = cur_marker->data = (JOCTET *)(cur_marker + 1);
       marker->cur_marker = cur_marker;
       marker->bytes_read = 0;
       bytes_read = 0;
@@ -843,14 +843,14 @@
     break;
   default:
     TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
-             (int) (data_length + length));
+             (int)(data_length + length));
     break;
   }
 
   /* skip any remaining data -- could be lots */
   INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -859,7 +859,7 @@
 
 
 METHODDEF(boolean)
-skip_variable (j_decompress_ptr cinfo)
+skip_variable(j_decompress_ptr cinfo)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
   JLONG length;
@@ -868,11 +868,11 @@
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
 
-  TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length);
+  TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int)length);
 
   INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -888,7 +888,7 @@
  */
 
 LOCAL(boolean)
-next_marker (j_decompress_ptr cinfo)
+next_marker(j_decompress_ptr cinfo)
 {
   int c;
   INPUT_VARS(cinfo);
@@ -935,7 +935,7 @@
 
 
 LOCAL(boolean)
-first_marker (j_decompress_ptr cinfo)
+first_marker(j_decompress_ptr cinfo)
 /* Like next_marker, but used to obtain the initial SOI marker. */
 /* For this marker, we do not allow preceding garbage or fill; otherwise,
  * we might well scan an entire input file before realizing it ain't JPEG.
@@ -948,7 +948,7 @@
 
   INPUT_BYTE(cinfo, c, return FALSE);
   INPUT_BYTE(cinfo, c2, return FALSE);
-  if (c != 0xFF || c2 != (int) M_SOI)
+  if (c != 0xFF || c2 != (int)M_SOI)
     ERREXIT2(cinfo, JERR_NO_SOI, c, c2);
 
   cinfo->unread_marker = c2;
@@ -966,18 +966,18 @@
  */
 
 METHODDEF(int)
-read_markers (j_decompress_ptr cinfo)
+read_markers(j_decompress_ptr cinfo)
 {
   /* Outer loop repeats once for each marker. */
   for (;;) {
     /* Collect the marker proper, unless we already did. */
     /* NB: first_marker() enforces the requirement that SOI appear first. */
     if (cinfo->unread_marker == 0) {
-      if (! cinfo->marker->saw_SOI) {
-        if (! first_marker(cinfo))
+      if (!cinfo->marker->saw_SOI) {
+        if (!first_marker(cinfo))
           return JPEG_SUSPENDED;
       } else {
-        if (! next_marker(cinfo))
+        if (!next_marker(cinfo))
           return JPEG_SUSPENDED;
       }
     }
@@ -987,28 +987,28 @@
      */
     switch (cinfo->unread_marker) {
     case M_SOI:
-      if (! get_soi(cinfo))
+      if (!get_soi(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF0:                /* Baseline */
     case M_SOF1:                /* Extended sequential, Huffman */
-      if (! get_sof(cinfo, FALSE, FALSE))
+      if (!get_sof(cinfo, FALSE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF2:                /* Progressive, Huffman */
-      if (! get_sof(cinfo, TRUE, FALSE))
+      if (!get_sof(cinfo, TRUE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF9:                /* Extended sequential, arithmetic */
-      if (! get_sof(cinfo, FALSE, TRUE))
+      if (!get_sof(cinfo, FALSE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF10:               /* Progressive, arithmetic */
-      if (! get_sof(cinfo, TRUE, TRUE))
+      if (!get_sof(cinfo, TRUE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
@@ -1026,7 +1026,7 @@
       break;
 
     case M_SOS:
-      if (! get_sos(cinfo))
+      if (!get_sos(cinfo))
         return JPEG_SUSPENDED;
       cinfo->unread_marker = 0; /* processed the marker */
       return JPEG_REACHED_SOS;
@@ -1037,22 +1037,22 @@
       return JPEG_REACHED_EOI;
 
     case M_DAC:
-      if (! get_dac(cinfo))
+      if (!get_dac(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DHT:
-      if (! get_dht(cinfo))
+      if (!get_dht(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DQT:
-      if (! get_dqt(cinfo))
+      if (!get_dqt(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DRI:
-      if (! get_dri(cinfo))
+      if (!get_dri(cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1072,13 +1072,13 @@
     case M_APP13:
     case M_APP14:
     case M_APP15:
-      if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[
-                cinfo->unread_marker - (int) M_APP0]) (cinfo))
+      if (!(*((my_marker_ptr)cinfo->marker)->process_APPn[
+               cinfo->unread_marker - (int)M_APP0]) (cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_COM:
-      if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
+      if (!(*((my_marker_ptr)cinfo->marker)->process_COM) (cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1095,7 +1095,7 @@
       break;
 
     case M_DNL:                 /* Ignore DNL ... perhaps the wrong thing */
-      if (! skip_variable(cinfo))
+      if (!skip_variable(cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1127,25 +1127,25 @@
  */
 
 METHODDEF(boolean)
-read_restart_marker (j_decompress_ptr cinfo)
+read_restart_marker(j_decompress_ptr cinfo)
 {
   /* Obtain a marker unless we already did. */
   /* Note that next_marker will complain if it skips any data. */
   if (cinfo->unread_marker == 0) {
-    if (! next_marker(cinfo))
+    if (!next_marker(cinfo))
       return FALSE;
   }
 
   if (cinfo->unread_marker ==
-      ((int) M_RST0 + cinfo->marker->next_restart_num)) {
+      ((int)M_RST0 + cinfo->marker->next_restart_num)) {
     /* Normal case --- swallow the marker and let entropy decoder continue */
     TRACEMS1(cinfo, 3, JTRC_RST, cinfo->marker->next_restart_num);
     cinfo->unread_marker = 0;
   } else {
     /* Uh-oh, the restart markers have been messed up. */
     /* Let the data source manager determine how to resync. */
-    if (! (*cinfo->src->resync_to_restart) (cinfo,
-                                            cinfo->marker->next_restart_num))
+    if (!(*cinfo->src->resync_to_restart) (cinfo,
+                                           cinfo->marker->next_restart_num))
       return FALSE;
   }
 
@@ -1206,7 +1206,7 @@
  */
 
 GLOBAL(boolean)
-jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
+jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired)
 {
   int marker = cinfo->unread_marker;
   int action = 1;
@@ -1216,16 +1216,16 @@
 
   /* Outer loop handles repeated decision after scanning forward. */
   for (;;) {
-    if (marker < (int) M_SOF0)
+    if (marker < (int)M_SOF0)
       action = 2;               /* invalid marker */
-    else if (marker < (int) M_RST0 || marker > (int) M_RST7)
+    else if (marker < (int)M_RST0 || marker > (int)M_RST7)
       action = 3;               /* valid non-restart marker */
     else {
-      if (marker == ((int) M_RST0 + ((desired+1) & 7)) ||
-          marker == ((int) M_RST0 + ((desired+2) & 7)))
+      if (marker == ((int)M_RST0 + ((desired + 1) & 7)) ||
+          marker == ((int)M_RST0 + ((desired + 2) & 7)))
         action = 3;             /* one of the next two expected restarts */
-      else if (marker == ((int) M_RST0 + ((desired-1) & 7)) ||
-               marker == ((int) M_RST0 + ((desired-2) & 7)))
+      else if (marker == ((int)M_RST0 + ((desired - 1) & 7)) ||
+               marker == ((int)M_RST0 + ((desired - 2) & 7)))
         action = 2;             /* a prior restart, so advance */
       else
         action = 1;             /* desired restart or too far away */
@@ -1238,7 +1238,7 @@
       return TRUE;
     case 2:
       /* Scan to the next marker, and repeat the decision loop. */
-      if (! next_marker(cinfo))
+      if (!next_marker(cinfo))
         return FALSE;
       marker = cinfo->unread_marker;
       break;
@@ -1256,9 +1256,9 @@
  */
 
 METHODDEF(void)
-reset_marker_reader (j_decompress_ptr cinfo)
+reset_marker_reader(j_decompress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
   cinfo->comp_info = NULL;              /* until allocated by get_sof */
   cinfo->input_scan_number = 0;         /* no SOS seen yet */
@@ -1276,16 +1276,16 @@
  */
 
 GLOBAL(void)
-jinit_marker_reader (j_decompress_ptr cinfo)
+jinit_marker_reader(j_decompress_ptr cinfo)
 {
   my_marker_ptr marker;
   int i;
 
   /* Create subobject in permanent pool */
   marker = (my_marker_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                 sizeof(my_marker_reader));
-  cinfo->marker = (struct jpeg_marker_reader *) marker;
+  cinfo->marker = (struct jpeg_marker_reader *)marker;
   /* Initialize public method pointers */
   marker->pub.reset_marker_reader = reset_marker_reader;
   marker->pub.read_markers = read_markers;
@@ -1314,10 +1314,10 @@
 #ifdef SAVE_MARKERS_SUPPORTED
 
 GLOBAL(void)
-jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
-                   unsigned int length_limit)
+jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                  unsigned int length_limit)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   long maxlength;
   jpeg_marker_parser_method processor;
 
@@ -1325,8 +1325,8 @@
    * (should only be a concern in a 16-bit environment).
    */
   maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
-  if (((long) length_limit) > maxlength)
-    length_limit = (unsigned int) maxlength;
+  if (((long)length_limit) > maxlength)
+    length_limit = (unsigned int)maxlength;
 
   /* Choose processor routine to use.
    * APP0/APP14 have special requirements.
@@ -1334,23 +1334,23 @@
   if (length_limit) {
     processor = save_marker;
     /* If saving APP0/APP14, save at least enough for our internal use. */
-    if (marker_code == (int) M_APP0 && length_limit < APP0_DATA_LEN)
+    if (marker_code == (int)M_APP0 && length_limit < APP0_DATA_LEN)
       length_limit = APP0_DATA_LEN;
-    else if (marker_code == (int) M_APP14 && length_limit < APP14_DATA_LEN)
+    else if (marker_code == (int)M_APP14 && length_limit < APP14_DATA_LEN)
       length_limit = APP14_DATA_LEN;
   } else {
     processor = skip_variable;
     /* If discarding APP0/APP14, use our regular on-the-fly processor. */
-    if (marker_code == (int) M_APP0 || marker_code == (int) M_APP14)
+    if (marker_code == (int)M_APP0 || marker_code == (int)M_APP14)
       processor = get_interesting_appn;
   }
 
-  if (marker_code == (int) M_COM) {
+  if (marker_code == (int)M_COM) {
     marker->process_COM = processor;
     marker->length_limit_COM = length_limit;
-  } else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) {
-    marker->process_APPn[marker_code - (int) M_APP0] = processor;
-    marker->length_limit_APPn[marker_code - (int) M_APP0] = length_limit;
+  } else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15) {
+    marker->process_APPn[marker_code - (int)M_APP0] = processor;
+    marker->length_limit_APPn[marker_code - (int)M_APP0] = length_limit;
   } else
     ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
 }
@@ -1363,15 +1363,15 @@
  */
 
 GLOBAL(void)
-jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code,
-                           jpeg_marker_parser_method routine)
+jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                          jpeg_marker_parser_method routine)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
-  if (marker_code == (int) M_COM)
+  if (marker_code == (int)M_COM)
     marker->process_COM = routine;
-  else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15)
-    marker->process_APPn[marker_code - (int) M_APP0] = routine;
+  else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15)
+    marker->process_APPn[marker_code - (int)M_APP0] = routine;
   else
     ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
 }
diff --git a/jdmaster.c b/jdmaster.c
index 9079dda..b209064 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -31,7 +31,7 @@
  */
 
 LOCAL(boolean)
-use_merged_upsample (j_decompress_ptr cinfo)
+use_merged_upsample(j_decompress_ptr cinfo)
 {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
   /* Merging is the equivalent of plain box-filter upsampling */
@@ -40,22 +40,22 @@
   /* jdmerge.c only supports YCC=>RGB and YCC=>RGB565 color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
       (cinfo->out_color_space != JCS_RGB &&
-      cinfo->out_color_space != JCS_RGB565 &&
-      cinfo->out_color_space != JCS_EXT_RGB &&
-      cinfo->out_color_space != JCS_EXT_RGBX &&
-      cinfo->out_color_space != JCS_EXT_BGR &&
-      cinfo->out_color_space != JCS_EXT_BGRX &&
-      cinfo->out_color_space != JCS_EXT_XBGR &&
-      cinfo->out_color_space != JCS_EXT_XRGB &&
-      cinfo->out_color_space != JCS_EXT_RGBA &&
-      cinfo->out_color_space != JCS_EXT_BGRA &&
-      cinfo->out_color_space != JCS_EXT_ABGR &&
-      cinfo->out_color_space != JCS_EXT_ARGB))
+       cinfo->out_color_space != JCS_RGB565 &&
+       cinfo->out_color_space != JCS_EXT_RGB &&
+       cinfo->out_color_space != JCS_EXT_RGBX &&
+       cinfo->out_color_space != JCS_EXT_BGR &&
+       cinfo->out_color_space != JCS_EXT_BGRX &&
+       cinfo->out_color_space != JCS_EXT_XBGR &&
+       cinfo->out_color_space != JCS_EXT_XRGB &&
+       cinfo->out_color_space != JCS_EXT_RGBA &&
+       cinfo->out_color_space != JCS_EXT_BGRA &&
+       cinfo->out_color_space != JCS_EXT_ABGR &&
+       cinfo->out_color_space != JCS_EXT_ARGB))
     return FALSE;
   if ((cinfo->out_color_space == JCS_RGB565 &&
-      cinfo->out_color_components != 3) ||
+       cinfo->out_color_components != 3) ||
       (cinfo->out_color_space != JCS_RGB565 &&
-      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
+       cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
     return FALSE;
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -100,7 +100,7 @@
 #else
 LOCAL(void)
 #endif
-jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+jpeg_core_output_dimensions(j_decompress_ptr cinfo)
 /* Do computations that are needed before master selection phase.
  * This function is used for transcoding and full decompression.
  */
@@ -113,129 +113,129 @@
   if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
     /* Provide 1/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 1;
     cinfo->_min_DCT_v_scaled_size = 1;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
     /* Provide 2/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 2L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 2L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 2;
     cinfo->_min_DCT_v_scaled_size = 2;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
     /* Provide 3/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 3L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 3L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 3;
     cinfo->_min_DCT_v_scaled_size = 3;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
     /* Provide 4/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 4L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 4L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 4;
     cinfo->_min_DCT_v_scaled_size = 4;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
     /* Provide 5/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 5L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 5L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 5;
     cinfo->_min_DCT_v_scaled_size = 5;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
     /* Provide 6/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 6L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 6L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 6;
     cinfo->_min_DCT_v_scaled_size = 6;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
     /* Provide 7/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 7L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 7L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 7;
     cinfo->_min_DCT_v_scaled_size = 7;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
     /* Provide 8/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 8L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 8L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 8;
     cinfo->_min_DCT_v_scaled_size = 8;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
     /* Provide 9/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 9L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 9L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 9;
     cinfo->_min_DCT_v_scaled_size = 9;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
     /* Provide 10/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 10L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 10L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 10;
     cinfo->_min_DCT_v_scaled_size = 10;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
     /* Provide 11/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 11L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 11L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 11;
     cinfo->_min_DCT_v_scaled_size = 11;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
     /* Provide 12/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 12L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 12L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 12;
     cinfo->_min_DCT_v_scaled_size = 12;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
     /* Provide 13/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 13L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 13L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 13;
     cinfo->_min_DCT_v_scaled_size = 13;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
     /* Provide 14/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 14L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 14L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 14;
     cinfo->_min_DCT_v_scaled_size = 14;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
     /* Provide 15/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 15L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 15L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 15;
     cinfo->_min_DCT_v_scaled_size = 15;
   } else {
     /* Provide 16/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 16L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 16L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 16;
     cinfo->_min_DCT_v_scaled_size = 16;
   }
@@ -268,7 +268,7 @@
  */
 
 GLOBAL(void)
-jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
+jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
 #ifdef IDCT_SCALING_SUPPORTED
@@ -314,13 +314,13 @@
        ci++, compptr++) {
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width *
-                    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width *
+                    (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height *
-                    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height *
+                    (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
   }
 
 #else /* !IDCT_SCALING_SUPPORTED */
@@ -417,30 +417,30 @@
  */
 
 LOCAL(void)
-prepare_range_limit_table (j_decompress_ptr cinfo)
+prepare_range_limit_table(j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
   JSAMPLE *table;
   int i;
 
   table = (JSAMPLE *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
-  table += (MAXJSAMPLE+1);      /* allow negative subscripts of simple table */
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+  table += (MAXJSAMPLE + 1);    /* allow negative subscripts of simple table */
   cinfo->sample_range_limit = table;
   /* First segment of "simple" table: limit[x] = 0 for x < 0 */
-  MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE));
+  MEMZERO(table - (MAXJSAMPLE + 1), (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
   /* Main part of "simple" table: limit[x] = x */
   for (i = 0; i <= MAXJSAMPLE; i++)
-    table[i] = (JSAMPLE) i;
+    table[i] = (JSAMPLE)i;
   table += CENTERJSAMPLE;       /* Point to where post-IDCT table starts */
   /* End of simple table, rest of first half of post-IDCT table */
-  for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++)
+  for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
     table[i] = MAXJSAMPLE;
   /* Second half of post-IDCT table */
-  MEMZERO(table + (2 * (MAXJSAMPLE+1)),
-          (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
-  MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
+  MEMZERO(table + (2 * (MAXJSAMPLE + 1)),
+          (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+  MEMCOPY(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
           cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
 }
 
@@ -457,9 +457,9 @@
  */
 
 LOCAL(void)
-master_selection (j_decompress_ptr cinfo)
+master_selection(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
   boolean use_c_buffer;
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
@@ -469,9 +469,10 @@
   prepare_range_limit_table(cinfo);
 
   /* Width of an output scanline must be representable as JDIMENSION. */
-  samplesperrow = (long) cinfo->output_width * (long) cinfo->out_color_components;
-  jd_samplesperrow = (JDIMENSION) samplesperrow;
-  if ((long) jd_samplesperrow != samplesperrow)
+  samplesperrow = (long)cinfo->output_width *
+                  (long)cinfo->out_color_components;
+  jd_samplesperrow = (JDIMENSION)samplesperrow;
+  if ((long)jd_samplesperrow != samplesperrow)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
 
   /* Initialize my private state */
@@ -482,7 +483,7 @@
   master->quantizer_1pass = NULL;
   master->quantizer_2pass = NULL;
   /* No mode changes if not using buffered-image mode. */
-  if (! cinfo->quantize_colors || ! cinfo->buffered_image) {
+  if (!cinfo->quantize_colors || !cinfo->buffered_image) {
     cinfo->enable_1pass_quant = FALSE;
     cinfo->enable_external_quant = FALSE;
     cinfo->enable_2pass_quant = FALSE;
@@ -528,7 +529,7 @@
   }
 
   /* Post-processing: in particular, color conversion first */
-  if (! cinfo->raw_data_out) {
+  if (!cinfo->raw_data_out) {
     if (master->using_merged_upsample) {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
       jinit_merged_upsampler(cinfo); /* does color conversion too */
@@ -565,11 +566,11 @@
   use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image;
   jinit_d_coef_controller(cinfo, use_c_buffer);
 
-  if (! cinfo->raw_data_out)
+  if (!cinfo->raw_data_out)
     jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
@@ -585,7 +586,7 @@
    * progress monitoring appropriately.  The input step is counted
    * as one pass.
    */
-  if (cinfo->progress != NULL && ! cinfo->buffered_image &&
+  if (cinfo->progress != NULL && !cinfo->buffered_image &&
       cinfo->inputctl->has_multiple_scans) {
     int nscans;
     /* Estimate number of scans to set pass_limit. */
@@ -597,7 +598,7 @@
       nscans = cinfo->num_components;
     }
     cinfo->progress->pass_counter = 0L;
-    cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+    cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
     cinfo->progress->completed_passes = 0;
     cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2);
     /* Count the input pass as done */
@@ -617,9 +618,9 @@
  */
 
 METHODDEF(void)
-prepare_for_output_pass (j_decompress_ptr cinfo)
+prepare_for_output_pass(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (master->pub.is_dummy_pass) {
 #ifdef QUANT_2PASS_SUPPORTED
@@ -645,8 +646,8 @@
     }
     (*cinfo->idct->start_pass) (cinfo);
     (*cinfo->coef->start_output_pass) (cinfo);
-    if (! cinfo->raw_data_out) {
-      if (! master->using_merged_upsample)
+    if (!cinfo->raw_data_out) {
+      if (!master->using_merged_upsample)
         (*cinfo->cconvert->start_pass) (cinfo);
       (*cinfo->upsample->start_pass) (cinfo);
       if (cinfo->quantize_colors)
@@ -665,7 +666,7 @@
     /* In buffered-image mode, we assume one more output pass if EOI not
      * yet reached, but no more passes if EOI has been reached.
      */
-    if (cinfo->buffered_image && ! cinfo->inputctl->eoi_reached) {
+    if (cinfo->buffered_image && !cinfo->inputctl->eoi_reached) {
       cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1);
     }
   }
@@ -677,9 +678,9 @@
  */
 
 METHODDEF(void)
-finish_output_pass (j_decompress_ptr cinfo)
+finish_output_pass(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (cinfo->quantize_colors)
     (*cinfo->cquantize->finish_pass) (cinfo);
@@ -694,9 +695,9 @@
  */
 
 GLOBAL(void)
-jpeg_new_colormap (j_decompress_ptr cinfo)
+jpeg_new_colormap(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   /* Prevent application from calling me at wrong times */
   if (cinfo->global_state != DSTATE_BUFIMAGE)
@@ -722,9 +723,9 @@
  */
 
 GLOBAL(void)
-jinit_master_decompress (j_decompress_ptr cinfo)
+jinit_master_decompress(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   master->pub.prepare_for_output_pass = prepare_for_output_pass;
   master->pub.finish_output_pass = finish_output_pass;
diff --git a/jdmerge.c b/jdmerge.c
index ca6f16c..baf4073 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -76,8 +76,8 @@
 typedef my_upsampler *my_upsample_ptr;
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 
 /* Include inline routines for colorspace extensions */
@@ -187,25 +187,25 @@
  */
 
 LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   int i;
   JLONG x;
   SHIFT_TEMPS
 
   upsample->Cr_r_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cb_b_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cr_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
   upsample->Cb_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -217,10 +217,10 @@
     upsample->Cb_b_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
-    upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+    upsample->Cr_g_tab[i] = (-FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
     /* We also add in ONE_HALF so that need not do it in inner loop */
-    upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+    upsample->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
   }
 }
 
@@ -230,9 +230,9 @@
  */
 
 METHODDEF(void)
-start_pass_merged_upsample (j_decompress_ptr cinfo)
+start_pass_merged_upsample(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
 
   /* Mark the spare buffer empty */
   upsample->spare_full = FALSE;
@@ -248,14 +248,13 @@
  */
 
 METHODDEF(void)
-merged_2v_upsample (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 2:1 vertical sampling case: may need a spare row. */
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   JSAMPROW work_ptrs[2];
   JDIMENSION num_rows;          /* number of rows returned to caller */
 
@@ -264,8 +263,8 @@
     JDIMENSION size = upsample->out_row_width;
     if (cinfo->out_color_space == JCS_RGB565)
       size = cinfo->output_width * 2;
-    jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
-                      1, size);
+    jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
+                      size);
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
@@ -294,20 +293,19 @@
   *out_row_ctr += num_rows;
   upsample->rows_to_go -= num_rows;
   /* When the buffer is emptied, declare this input row group consumed */
-  if (! upsample->spare_full)
+  if (!upsample->spare_full)
     (*in_row_group_ctr)++;
 }
 
 
 METHODDEF(void)
-merged_1v_upsample (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 1:1 vertical sampling case: much easier, never need a spare row. */
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
 
   /* Just do the upsampling. */
   (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
@@ -333,43 +331,42 @@
  */
 
 METHODDEF(void)
-h2v1_merged_upsample (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                      JSAMPARRAY output_buf)
+h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    default:
-      h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                    output_buf);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  default:
+    h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                  output_buf);
+    break;
   }
 }
 
@@ -379,43 +376,42 @@
  */
 
 METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                      JSAMPARRAY output_buf)
+h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    default:
-      h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                    output_buf);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  default:
+    h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                  output_buf);
+    break;
   }
 }
 
@@ -424,24 +420,24 @@
  * RGB565 conversion
  */
 
-#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
-                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
-                                      (((g) << 11) & 0xE000) |  \
-                                      (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b)  ((((r) << 8) & 0xF800) | \
+                                     (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b)  (((r) & 0xF8) | ((g) >> 5) | \
+                                     (((g) << 11) & 0xE000) | \
+                                     (((b) << 5) & 0x1F00))
 
-#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
+#define PACK_NEED_ALIGNMENT(ptr)    (((size_t)(ptr)) & 3)
 
-#define WRITE_TWO_PIXELS_LE(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (INT16)(pixels);  \
-  ((INT16*)(addr))[1] = (INT16)((pixels) >> 16);  \
+#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
+  ((INT16 *)(addr))[0] = (INT16)(pixels); \
+  ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
 }
-#define WRITE_TWO_PIXELS_BE(addr, pixels) {  \
-  ((INT16*)(addr))[1] = (INT16)(pixels);  \
-  ((INT16*)(addr))[0] = (INT16)((pixels) >> 16);  \
+#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
+  ((INT16 *)(addr))[1] = (INT16)(pixels); \
+  ((INT16 *)(addr))[0] = (INT16)((pixels) >> 16); \
 }
 
 #define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
@@ -510,9 +506,8 @@
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                          JSAMPARRAY output_buf)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -520,13 +515,12 @@
   else
     h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
                                 output_buf);
- }
+}
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                           JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -538,9 +532,8 @@
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                          JSAMPARRAY output_buf)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -552,9 +545,8 @@
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                           JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -574,14 +566,14 @@
  */
 
 GLOBAL(void)
-jinit_merged_upsampler (j_decompress_ptr cinfo)
+jinit_merged_upsampler(j_decompress_ptr cinfo)
 {
   my_upsample_ptr upsample;
 
   upsample = (my_upsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_upsampler));
-  cinfo->upsample = (struct jpeg_upsampler *) upsample;
+  cinfo->upsample = (struct jpeg_upsampler *)upsample;
   upsample->pub.start_pass = start_pass_merged_upsample;
   upsample->pub.need_context_rows = FALSE;
 
@@ -602,8 +594,8 @@
     }
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                (size_t) (upsample->out_row_width * sizeof(JSAMPLE)));
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
     if (jsimd_can_h2v1_merged_upsample())
diff --git a/jdmrg565.c b/jdmrg565.c
index a376340..1b87e37 100644
--- a/jdmrg565.c
+++ b/jdmrg565.c
@@ -15,23 +15,22 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
-                                   JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                                  JDIMENSION in_row_group_ctr,
+                                  JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
@@ -47,7 +46,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
@@ -72,37 +71,37 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = (INT16)rgb;
-   }
- }
+    *(INT16 *)outptr = (INT16)rgb;
+  }
+}
 
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
-                                    JSAMPIMAGE input_buf,
-                                    JDIMENSION in_row_group_ctr,
-                                    JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   unsigned int r, g, b;
   JLONG rgb;
@@ -119,7 +118,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
@@ -146,37 +145,36 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = (INT16)rgb;
+    *(INT16 *)outptr = (INT16)rgb;
   }
 }
 
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
-                                   JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                                  JDIMENSION in_row_group_ctr,
+                                  JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
@@ -194,7 +192,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
@@ -234,7 +232,7 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     y  = GETJSAMPLE(*inptr00);
@@ -242,45 +240,45 @@
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = (INT16)rgb;
+    *(INT16 *)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = (INT16)rgb;
+    *(INT16 *)outptr1 = (INT16)rgb;
   }
 }
 
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
-                                    JSAMPIMAGE input_buf,
-                                    JDIMENSION in_row_group_ctr,
-                                    JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  JLONG d1 = dither_matrix[(cinfo->output_scanline + 1) & DITHER_MASK];
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
 
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+  inptr00 = input_buf[0][in_row_group_ctr * 2];
+  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
   inptr1 = input_buf[1][in_row_group_ctr];
   inptr2 = input_buf[2][in_row_group_ctr];
   outptr0 = output_buf[0];
@@ -292,7 +290,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
@@ -336,7 +334,7 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     y  = GETJSAMPLE(*inptr00);
@@ -344,13 +342,13 @@
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = (INT16)rgb;
+    *(INT16 *)outptr0 = (INT16)rgb;
 
     y  = GETJSAMPLE(*inptr01);
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = (INT16)rgb;
+    *(INT16 *)outptr1 = (INT16)rgb;
   }
 }
diff --git a/jdmrgext.c b/jdmrgext.c
index 9d7d2af..b1c27df 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -21,23 +21,22 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf,
-                               JDIMENSION in_row_group_ctr,
-                               JSAMPARRAY output_buf)
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION in_row_group_ctr,
+                              JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -50,7 +49,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
     y  = GETJSAMPLE(*inptr0++);
@@ -75,7 +74,7 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr0);
     outptr[RGB_RED] =   range_limit[y + cred];
@@ -94,27 +93,26 @@
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf,
-                               JDIMENSION in_row_group_ctr,
-                               JSAMPARRAY output_buf)
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION in_row_group_ctr,
+                              JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+  inptr00 = input_buf[0][in_row_group_ctr * 2];
+  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
   inptr1 = input_buf[1][in_row_group_ctr];
   inptr2 = input_buf[2][in_row_group_ctr];
   outptr0 = output_buf[0];
@@ -125,7 +123,7 @@
     cb = GETJSAMPLE(*inptr1++);
     cr = GETJSAMPLE(*inptr2++);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
     y  = GETJSAMPLE(*inptr00++);
@@ -166,7 +164,7 @@
     cb = GETJSAMPLE(*inptr1);
     cr = GETJSAMPLE(*inptr2);
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     y  = GETJSAMPLE(*inptr00);
     outptr0[RGB_RED] =   range_limit[y + cred];
diff --git a/jdphuff.c b/jdphuff.c
index c927ffa..dcb0abe 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -43,15 +43,15 @@
  */
 
 #ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
+#define ASSIGN_STATE(dest, src)  ((dest) = (src))
 #else
 #if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).EOBRUN = (src).EOBRUN, \
-         (dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
+#define ASSIGN_STATE(dest, src) \
+  ((dest).EOBRUN = (src).EOBRUN, \
+   (dest).last_dc_val[0] = (src).last_dc_val[0], \
+   (dest).last_dc_val[1] = (src).last_dc_val[1], \
+   (dest).last_dc_val[2] = (src).last_dc_val[2], \
+   (dest).last_dc_val[3] = (src).last_dc_val[3])
 #endif
 #endif
 
@@ -77,14 +77,14 @@
 typedef phuff_entropy_decoder *phuff_entropy_ptr;
 
 /* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_DC_first(j_decompress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first(j_decompress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine(j_decompress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_AC_refine(j_decompress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
 
 
 /*
@@ -92,9 +92,9 @@
  */
 
 METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
+start_pass_phuff_decoder(j_decompress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
@@ -118,7 +118,7 @@
   }
   if (cinfo->Ah != 0) {
     /* Successive approximation refinement scan: must have Al = Ah-1. */
-    if (cinfo->Al != cinfo->Ah-1)
+    if (cinfo->Al != cinfo->Ah - 1)
       bad = TRUE;
   }
   if (cinfo->Al > 13)           /* need not check for < 0 */
@@ -138,7 +138,7 @@
    */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
-    coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+    coef_bit_ptr = &cinfo->coef_bits[cindex][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
@@ -206,21 +206,25 @@
 #ifdef AVOID_TABLES
 
 #define NEG_1 ((unsigned)-1)
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x))
+#define HUFF_EXTEND(x, s) \
+  ((x) < (1 << ((s) - 1)) ? (x) + (((NEG_1) << (s)) + 1) : (x))
 
 #else
 
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
 
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = {   /* entry n is 2**(n-1) */
+  0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+  0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
 
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+  0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+  ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+  ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+  ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
 
 #endif /* AVOID_TABLES */
 
@@ -231,9 +235,9 @@
  */
 
 LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int ci;
 
   /* Throw away any unused bits remaining in bit buffer; */
@@ -242,7 +246,7 @@
   entropy->bitstate.bits_left = 0;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     return FALSE;
 
   /* Re-initialize DC predictions to 0 */
@@ -289,9 +293,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Al = cinfo->Al;
   register int s, r;
   int blkn, ci;
@@ -304,17 +308,17 @@
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
     ASSIGN_STATE(state, entropy->saved);
 
     /* Outer loop handles each block in the MCU */
@@ -339,11 +343,11 @@
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al);
+      (*block)[0] = (JCOEF)LEFT_SHIFT(s, Al);
     }
 
     /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
     ASSIGN_STATE(entropy->saved, state);
   }
 
@@ -360,9 +364,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Se = cinfo->Se;
   int Al = cinfo->Al;
   register int s, k, r;
@@ -374,14 +378,14 @@
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state.
      * We can avoid loading/saving bitread state if in an EOB run.
@@ -393,7 +397,7 @@
     if (EOBRUN > 0)             /* if it's a band of zeroes... */
       EOBRUN--;                 /* ...process it now (we do nothing) */
     else {
-      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+      BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
       block = MCU_data[0];
       tbl = entropy->ac_derived_tbl;
 
@@ -407,7 +411,7 @@
           r = GET_BITS(s);
           s = HUFF_EXTEND(r, s);
           /* Scale and output coefficient in natural (dezigzagged) order */
-          (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al);
+          (*block)[jpeg_natural_order[k]] = (JCOEF)LEFT_SHIFT(s, Al);
         } else {
           if (r == 15) {        /* ZRL */
             k += 15;            /* skip 15 zeroes in band */
@@ -424,7 +428,7 @@
         }
       }
 
-      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+      BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
     }
 
     /* Completed MCU, so update state */
@@ -445,9 +449,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int p1 = 1 << cinfo->Al;      /* 1 in the bit position being coded */
   int blkn;
   JBLOCKROW block;
@@ -456,7 +460,7 @@
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
@@ -465,7 +469,7 @@
    */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
 
   /* Outer loop handles each block in the MCU */
 
@@ -480,7 +484,7 @@
   }
 
   /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
 
   /* Account for restart interval (no-op if not using restarts) */
   entropy->restarts_to_go--;
@@ -494,9 +498,9 @@
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Se = cinfo->Se;
   int p1 = 1 << cinfo->Al;        /* 1 in the bit position being coded */
   int m1 = (NEG_1) << cinfo->Al;  /* -1 in the bit position being coded */
@@ -512,16 +516,16 @@
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, don't modify the MCU.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
     EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
 
     /* There is always only one block per MCU */
@@ -589,7 +593,7 @@
         if (s) {
           int pos = jpeg_natural_order[k];
           /* Output newly nonzero coefficient */
-          (*block)[pos] = (JCOEF) s;
+          (*block)[pos] = (JCOEF)s;
           /* Remember its position in case we have to suspend */
           newnz_pos[num_newnz++] = pos;
         }
@@ -621,7 +625,7 @@
     }
 
     /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
     entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
   }
 
@@ -644,16 +648,16 @@
  */
 
 GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
+jinit_phuff_decoder(j_decompress_ptr cinfo)
 {
   phuff_entropy_ptr entropy;
   int *coef_bit_ptr;
   int ci, i;
 
   entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(phuff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass_phuff_decoder;
 
   /* Mark derived tables unallocated */
@@ -663,9 +667,10 @@
 
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                cinfo->num_components*DCTSIZE2*sizeof(int));
-  coef_bit_ptr = & cinfo->coef_bits[0][0];
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                cinfo->num_components * DCTSIZE2 *
+                                sizeof(int));
+  coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
     for (i = 0; i < DCTSIZE2; i++)
       *coef_bit_ptr++ = -1;
diff --git a/jdpostct.c b/jdpostct.c
index 601fc2a..6a2cf5c 100644
--- a/jdpostct.c
+++ b/jdpostct.c
@@ -46,22 +46,28 @@
 
 
 /* Forward declarations */
-METHODDEF(void) post_process_1pass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION *in_row_group_ctr,
+                                   JDIMENSION in_row_groups_avail,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION *out_row_ctr,
+                                   JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) post_process_prepass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
-METHODDEF(void) post_process_2pass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION *in_row_group_ctr,
+                                     JDIMENSION in_row_groups_avail,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION *out_row_ctr,
+                                     JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION *in_row_group_ctr,
+                                   JDIMENSION in_row_groups_avail,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION *out_row_ctr,
+                                   JDIMENSION out_rows_avail);
 #endif
 
 
@@ -70,9 +76,9 @@
  */
 
 METHODDEF(void)
-start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
@@ -85,8 +91,8 @@
        */
       if (post->buffer == NULL) {
         post->buffer = (*cinfo->mem->access_virt_sarray)
-          ((j_common_ptr) cinfo, post->whole_image,
-           (JDIMENSION) 0, post->strip_height, TRUE);
+          ((j_common_ptr)cinfo, post->whole_image,
+           (JDIMENSION)0, post->strip_height, TRUE);
       }
     } else {
       /* For single-pass processing without color quantization,
@@ -123,13 +129,12 @@
  */
 
 METHODDEF(void)
-post_process_1pass (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION num_rows, max_rows;
 
   /* Fill the buffer, but not more than what we can dump out in one go. */
@@ -138,12 +143,13 @@
   if (max_rows > post->strip_height)
     max_rows = post->strip_height;
   num_rows = 0;
-  (*cinfo->upsample->upsample) (cinfo,
-                input_buf, in_row_group_ctr, in_row_groups_avail,
-                post->buffer, &num_rows, max_rows);
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer, &num_rows,
+                                max_rows);
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo,
-                post->buffer, output_buf + *out_row_ctr, (int) num_rows);
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
+                                       output_buf + *out_row_ctr,
+                                       (int)num_rows);
   *out_row_ctr += num_rows;
 }
 
@@ -155,34 +161,33 @@
  */
 
 METHODDEF(void)
-post_process_prepass (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                      JDIMENSION in_row_groups_avail,
-                      JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                      JDIMENSION out_rows_avail)
+post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION *in_row_group_ctr,
+                     JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION old_next_row, num_rows;
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, post->whole_image,
+        ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, TRUE);
   }
 
   /* Upsample some data (up to a strip height's worth). */
   old_next_row = post->next_row;
-  (*cinfo->upsample->upsample) (cinfo,
-                input_buf, in_row_group_ctr, in_row_groups_avail,
-                post->buffer, &post->next_row, post->strip_height);
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer,
+                                &post->next_row, post->strip_height);
 
   /* Allow quantizer to scan new data.  No data is emitted, */
   /* but we advance out_row_ctr so outer loop can tell when we're done. */
   if (post->next_row > old_next_row) {
     num_rows = post->next_row - old_next_row;
     (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
-                                         (JSAMPARRAY) NULL, (int) num_rows);
+                                         (JSAMPARRAY)NULL, (int)num_rows);
     *out_row_ctr += num_rows;
   }
 
@@ -199,19 +204,18 @@
  */
 
 METHODDEF(void)
-post_process_2pass (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION num_rows, max_rows;
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, post->whole_image,
+        ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, FALSE);
   }
 
@@ -226,9 +230,9 @@
     num_rows = max_rows;
 
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo,
-                post->buffer + post->next_row, output_buf + *out_row_ctr,
-                (int) num_rows);
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
+                                       output_buf + *out_row_ctr,
+                                       (int)num_rows);
   *out_row_ctr += num_rows;
 
   /* Advance if we filled the strip. */
@@ -247,14 +251,14 @@
  */
 
 GLOBAL(void)
-jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_post_ptr post;
 
   post = (my_post_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_post_controller));
-  cinfo->post = (struct jpeg_d_post_controller *) post;
+  cinfo->post = (struct jpeg_d_post_controller *)post;
   post->pub.start_pass = start_pass_dpost;
   post->whole_image = NULL;     /* flag for no virtual arrays */
   post->buffer = NULL;          /* flag for no strip buffer */
@@ -265,16 +269,16 @@
      * an efficient number of rows for upsampling to return.
      * (In the presence of output rescaling, we might want to be smarter?)
      */
-    post->strip_height = (JDIMENSION) cinfo->max_v_samp_factor;
+    post->strip_height = (JDIMENSION)cinfo->max_v_samp_factor;
     if (need_full_buffer) {
       /* Two-pass color quantization: need full-image storage. */
       /* We round up the number of rows to a multiple of the strip height. */
 #ifdef QUANT_2PASS_SUPPORTED
       post->whole_image = (*cinfo->mem->request_virt_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
          cinfo->output_width * cinfo->out_color_components,
-         (JDIMENSION) jround_up((long) cinfo->output_height,
-                                (long) post->strip_height),
+         (JDIMENSION)jround_up((long)cinfo->output_height,
+                               (long)post->strip_height),
          post->strip_height);
 #else
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -282,7 +286,7 @@
     } else {
       /* One-pass color quantization: just make a strip buffer. */
       post->buffer = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
          cinfo->output_width * cinfo->out_color_components,
          post->strip_height);
     }
diff --git a/jdsample.c b/jdsample.c
index b1378e1..52ee9af 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -36,9 +36,9 @@
  */
 
 METHODDEF(void)
-start_pass_upsample (j_decompress_ptr cinfo)
+start_pass_upsample(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
 
   /* Mark the conversion buffer empty */
   upsample->next_row_out = cinfo->max_v_samp_factor;
@@ -56,13 +56,12 @@
  */
 
 METHODDEF(void)
-sep_upsample (j_decompress_ptr cinfo,
-              JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-              JDIMENSION in_row_groups_avail,
-              JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-              JDIMENSION out_rows_avail)
+sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+             JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+             JDIMENSION out_rows_avail)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   int ci;
   jpeg_component_info *compptr;
   JDIMENSION num_rows;
@@ -84,7 +83,7 @@
   /* Color-convert and emit rows */
 
   /* How many we have in the buffer: */
-  num_rows = (JDIMENSION) (cinfo->max_v_samp_factor - upsample->next_row_out);
+  num_rows = (JDIMENSION)(cinfo->max_v_samp_factor - upsample->next_row_out);
   /* Not more than the distance to the end of the image.  Need this test
    * in case the image height is not a multiple of max_v_samp_factor:
    */
@@ -96,9 +95,8 @@
     num_rows = out_rows_avail;
 
   (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
-                                     (JDIMENSION) upsample->next_row_out,
-                                     output_buf + *out_row_ctr,
-                                     (int) num_rows);
+                                     (JDIMENSION)upsample->next_row_out,
+                                     output_buf + *out_row_ctr, (int)num_rows);
 
   /* Adjust counts */
   *out_row_ctr += num_rows;
@@ -124,8 +122,8 @@
  */
 
 METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
@@ -137,8 +135,8 @@
  */
 
 METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = NULL;      /* safety check */
 }
@@ -156,10 +154,10 @@
  */
 
 METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
@@ -185,8 +183,8 @@
     }
     /* Generate any additional output rows by duplicating the first one */
     if (v_expand > 1) {
-      jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-                        v_expand-1, cinfo->output_width);
+      jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+                        v_expand - 1, cinfo->output_width);
     }
     inrow++;
     outrow += v_expand;
@@ -200,8 +198,8 @@
  */
 
 METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -228,8 +226,8 @@
  */
 
 METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -247,8 +245,8 @@
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
-    jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-                      1, cinfo->output_width);
+    jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+                      cinfo->output_width);
     inrow++;
     outrow += 2;
   }
@@ -271,8 +269,8 @@
  */
 
 METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -285,20 +283,20 @@
     outptr = output_data[inrow];
     /* Special case for first column */
     invalue = GETJSAMPLE(*inptr++);
-    *outptr++ = (JSAMPLE) invalue;
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    *outptr++ = (JSAMPLE)invalue;
+    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
       invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
     }
 
     /* Special case for last column */
     invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
-    *outptr++ = (JSAMPLE) invalue;
+    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
+    *outptr++ = (JSAMPLE)invalue;
   }
 }
 
@@ -311,8 +309,8 @@
  */
 
 METHODDEF(void)
-h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr0, inptr1, outptr;
@@ -330,14 +328,14 @@
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
       if (v == 0)               /* next nearest is row above */
-        inptr1 = input_data[inrow-1];
+        inptr1 = input_data[inrow - 1];
       else                      /* next nearest is row below */
-        inptr1 = input_data[inrow+1];
+        inptr1 = input_data[inrow + 1];
       outptr = output_data[outrow++];
 
-      for(colctr = 0; colctr < compptr->downsampled_width; colctr++) {
+      for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
         thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-        *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2);
+        *outptr++ = (JSAMPLE)((thiscolsum + 1) >> 2);
       }
     }
     inrow++;
@@ -354,8 +352,8 @@
  */
 
 METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr0, inptr1, outptr;
@@ -373,30 +371,30 @@
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
       if (v == 0)               /* next nearest is row above */
-        inptr1 = input_data[inrow-1];
+        inptr1 = input_data[inrow - 1];
       else                      /* next nearest is row below */
-        inptr1 = input_data[inrow+1];
+        inptr1 = input_data[inrow + 1];
       outptr = output_data[outrow++];
 
       /* Special case for first column */
       thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
       nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-      lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+      lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
 
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
         nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-        lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+        *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+        *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+        lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
       }
 
       /* Special case for last column */
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
     }
     inrow++;
   }
@@ -408,7 +406,7 @@
  */
 
 GLOBAL(void)
-jinit_upsampler (j_decompress_ptr cinfo)
+jinit_upsampler(j_decompress_ptr cinfo)
 {
   my_upsample_ptr upsample;
   int ci;
@@ -418,14 +416,14 @@
 
   if (!cinfo->master->jinit_upsampler_no_alloc) {
     upsample = (my_upsample_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(my_upsampler));
-    cinfo->upsample = (struct jpeg_upsampler *) upsample;
+    cinfo->upsample = (struct jpeg_upsampler *)upsample;
     upsample->pub.start_pass = start_pass_upsample;
     upsample->pub.upsample = sep_upsample;
     upsample->pub.need_context_rows = FALSE; /* until we find out differently */
   } else
-    upsample = (my_upsample_ptr) cinfo->upsample;
+    upsample = (my_upsample_ptr)cinfo->upsample;
 
   if (cinfo->CCIR601_sampling)  /* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
@@ -451,7 +449,7 @@
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
     need_buffer = TRUE;
-    if (! compptr->component_needed) {
+    if (!compptr->component_needed) {
       /* Don't bother to upsample an uninteresting component. */
       upsample->methods[ci] = noop_upsample;
       need_buffer = FALSE;
@@ -459,8 +457,7 @@
       /* Fullsize components can be processed without any work. */
       upsample->methods[ci] = fullsize_upsample;
       need_buffer = FALSE;
-    } else if (h_in_group * 2 == h_out_group &&
-               v_in_group == v_out_group) {
+    } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
         if (jsimd_can_h2v1_fancy_upsample())
@@ -502,16 +499,16 @@
       else
 #endif
         upsample->methods[ci] = int_upsample;
-      upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group);
-      upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
+      upsample->h_expand[ci] = (UINT8)(h_out_group / h_in_group);
+      upsample->v_expand[ci] = (UINT8)(v_out_group / v_in_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
     if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
       upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-         (JDIMENSION) jround_up((long) cinfo->output_width,
-                                (long) cinfo->max_h_samp_factor),
-         (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
+         (JDIMENSION)jround_up((long)cinfo->output_width,
+                               (long)cinfo->max_h_samp_factor),
+         (JDIMENSION)cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/jdtrans.c b/jdtrans.c
index cfc85dd..56713ef 100644
--- a/jdtrans.c
+++ b/jdtrans.c
@@ -19,7 +19,7 @@
 
 
 /* Forward declarations */
-LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
+LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
 
 
 /*
@@ -45,7 +45,7 @@
  */
 
 GLOBAL(jvirt_barray_ptr *)
-jpeg_read_coefficients (j_decompress_ptr cinfo)
+jpeg_read_coefficients(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state == DSTATE_READY) {
     /* First call: initialize active modules */
@@ -58,7 +58,7 @@
       int retcode;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL)
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       /* Absorb some more input */
       retcode = (*cinfo->inputctl->consume_input) (cinfo);
       if (retcode == JPEG_SUSPENDED)
@@ -70,7 +70,7 @@
           (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
         if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
           /* startup underestimated number of scans; ratchet up one scan */
-          cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+          cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
         }
       }
     }
@@ -97,7 +97,7 @@
  */
 
 LOCAL(void)
-transdecode_master_selection (j_decompress_ptr cinfo)
+transdecode_master_selection(j_decompress_ptr cinfo)
 {
   /* This is effectively a buffered-image operation. */
   cinfo->buffered_image = TRUE;
@@ -129,7 +129,7 @@
   jinit_d_coef_controller(cinfo, TRUE);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
@@ -148,7 +148,7 @@
       nscans = 1;
     }
     cinfo->progress->pass_counter = 0L;
-    cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+    cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
     cinfo->progress->completed_passes = 0;
     cinfo->progress->total_passes = 1;
   }
diff --git a/jerror.c b/jerror.c
index c31acd9..936c4f5 100644
--- a/jerror.c
+++ b/jerror.c
@@ -44,7 +44,7 @@
  * want to refer to it directly.
  */
 
-#define JMESSAGE(code,string)   string ,
+#define JMESSAGE(code, string)  string,
 
 const char * const jpeg_std_message_table[] = {
 #include "jerror.h"
@@ -66,7 +66,7 @@
  */
 
 METHODDEF(void)
-error_exit (j_common_ptr cinfo)
+error_exit(j_common_ptr cinfo)
 {
   /* Always display the message */
   (*cinfo->err->output_message) (cinfo);
@@ -94,7 +94,7 @@
  */
 
 METHODDEF(void)
-output_message (j_common_ptr cinfo)
+output_message(j_common_ptr cinfo)
 {
   char buffer[JMSG_LENGTH_MAX];
 
@@ -124,7 +124,7 @@
  */
 
 METHODDEF(void)
-emit_message (j_common_ptr cinfo, int msg_level)
+emit_message(j_common_ptr cinfo, int msg_level)
 {
   struct jpeg_error_mgr *err = cinfo->err;
 
@@ -153,7 +153,7 @@
  */
 
 METHODDEF(void)
-format_message (j_common_ptr cinfo, char *buffer)
+format_message(j_common_ptr cinfo, char *buffer)
 {
   struct jpeg_error_mgr *err = cinfo->err;
   int msg_code = err->msg_code;
@@ -208,7 +208,7 @@
  */
 
 METHODDEF(void)
-reset_error_mgr (j_common_ptr cinfo)
+reset_error_mgr(j_common_ptr cinfo)
 {
   cinfo->err->num_warnings = 0;
   /* trace_level is not reset since it is an application-supplied parameter */
@@ -227,7 +227,7 @@
  */
 
 GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr *err)
+jpeg_std_error(struct jpeg_error_mgr *err)
 {
   err->error_exit = error_exit;
   err->emit_message = emit_message;
@@ -241,7 +241,7 @@
 
   /* Initialize message table pointers */
   err->jpeg_message_table = jpeg_std_message_table;
-  err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1;
+  err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1;
 
   err->addon_message_table = NULL;
   err->first_addon_message = 0; /* for safety */
diff --git a/jerror.h b/jerror.h
index 11a07cb..e54cc30 100644
--- a/jerror.h
+++ b/jerror.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -28,7 +28,7 @@
 #define JMAKE_ENUM_LIST
 #else
 /* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
-#define JMESSAGE(code,string)
+#define JMESSAGE(code, string)
 #endif /* JERROR_H */
 #endif /* JMESSAGE */
 
@@ -36,7 +36,7 @@
 
 typedef enum {
 
-#define JMESSAGE(code,string)   code ,
+#define JMESSAGE(code, string)  code,
 
 #endif /* JMAKE_ENUM_LIST */
 
@@ -44,8 +44,7 @@
 
 /* For maintenance convenience, list is alphabetical by message code name */
 #if JPEG_LIB_VERSION < 70
-JMESSAGE(JERR_ARITH_NOTIMPL,
-         "Sorry, arithmetic coding is not implemented")
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
 #endif
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
@@ -154,8 +153,7 @@
 JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d  %d")
 JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
          "Warning: thumbnail image size does not match data length %u")
-JMESSAGE(JTRC_JFIF_EXTENSION,
-         "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
 JMESSAGE(JTRC_JFIF_THUMBNAIL, "    with %d x %d thumbnail image")
 JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
 JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
@@ -208,6 +206,7 @@
 JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
 
 #ifdef JMAKE_ENUM_LIST
 
@@ -228,90 +227,90 @@
 /* The first parameter is either type of cinfo pointer */
 
 /* Fatal errors (print message and exit) */
-#define ERREXIT(cinfo,code)  \
+#define ERREXIT(cinfo, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT1(cinfo,code,p1)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT2(cinfo,code,p1,p2)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT3(cinfo,code,p1,p2,p3)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
    (cinfo)->err->msg_parm.i[2] = (p3), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT4(cinfo,code,p1,p2,p3,p4)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXITS(cinfo,code,str)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 
 #define MAKESTMT(stuff)         do { stuff } while (0)
 
 /* Nonfatal errors (we can keep going, but the data is probably corrupt) */
-#define WARNMS(cinfo,code)  \
+#define WARNMS(cinfo, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS1(cinfo,code,p1)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS2(cinfo,code,p1,p2)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
 
 /* Informational/debugging messages */
-#define TRACEMS(cinfo,lvl,code)  \
+#define TRACEMS(cinfo, lvl, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS1(cinfo,lvl,code,p1)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS2(cinfo,lvl,code,p1,p2)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS3(cinfo,lvl,code,p1,p2,p3)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
            _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
            _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
            _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
            _mp[4] = (p5); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
            _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
            _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMSS(cinfo,lvl,code,str)  \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
 
 #endif /* JERROR_H */
diff --git a/jfdctflt.c b/jfdctflt.c
index b3da3eb..ab6f6d0 100644
--- a/jfdctflt.c
+++ b/jfdctflt.c
@@ -57,7 +57,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT *data)
+jpeg_fdct_float(FAST_FLOAT *data)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -68,7 +68,7 @@
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -88,7 +88,7 @@
     dataptr[0] = tmp10 + tmp11; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
     dataptr[2] = tmp13 + z1;    /* phase 5 */
     dataptr[6] = tmp13 - z1;
 
@@ -99,10 +99,10 @@
     tmp12 = tmp6 + tmp7;
 
     /* The rotator is modified from fig 4-8 to avoid extra negations. */
-    z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
-    z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
-    z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
-    z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+    z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+    z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+    z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
 
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
@@ -118,15 +118,15 @@
   /* Pass 2: process columns. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part */
 
@@ -135,12 +135,12 @@
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
-    dataptr[DCTSIZE*4] = tmp10 - tmp11;
+    dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+    dataptr[DCTSIZE * 4] = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
-    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
-    dataptr[DCTSIZE*6] = tmp13 - z1;
+    z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+    dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+    dataptr[DCTSIZE * 6] = tmp13 - z1;
 
     /* Odd part */
 
@@ -149,18 +149,18 @@
     tmp12 = tmp6 + tmp7;
 
     /* The rotator is modified from fig 4-8 to avoid extra negations. */
-    z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
-    z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
-    z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
-    z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+    z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+    z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+    z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
 
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
-    dataptr[DCTSIZE*3] = z13 - z2;
-    dataptr[DCTSIZE*1] = z11 + z4;
-    dataptr[DCTSIZE*7] = z11 - z4;
+    dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+    dataptr[DCTSIZE * 3] = z13 - z2;
+    dataptr[DCTSIZE * 1] = z11 + z4;
+    dataptr[DCTSIZE * 7] = z11 - z4;
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/jfdctfst.c b/jfdctfst.c
index 5cd83a7..4c9ce0d 100644
--- a/jfdctfst.c
+++ b/jfdctfst.c
@@ -79,10 +79,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_0_382683433  ((JLONG)   98)         /* FIX(0.382683433) */
-#define FIX_0_541196100  ((JLONG)  139)         /* FIX(0.541196100) */
-#define FIX_0_707106781  ((JLONG)  181)         /* FIX(0.707106781) */
-#define FIX_1_306562965  ((JLONG)  334)         /* FIX(1.306562965) */
+#define FIX_0_382683433  ((JLONG)98)            /* FIX(0.382683433) */
+#define FIX_0_541196100  ((JLONG)139)           /* FIX(0.541196100) */
+#define FIX_0_707106781  ((JLONG)181)           /* FIX(0.707106781) */
+#define FIX_1_306562965  ((JLONG)334)           /* FIX(1.306562965) */
 #else
 #define FIX_0_382683433  FIX(0.382683433)
 #define FIX_0_541196100  FIX(0.541196100)
@@ -98,7 +98,7 @@
 
 #ifndef USE_ACCURATE_ROUNDING
 #undef DESCALE
-#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
 #endif
 
 
@@ -106,7 +106,7 @@
  * descale to yield a DCTELEM result.
  */
 
-#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
 
 
 /*
@@ -114,7 +114,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM *data)
+jpeg_fdct_ifast(DCTELEM *data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -126,7 +126,7 @@
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -176,15 +176,15 @@
   /* Pass 2: process columns. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part */
 
@@ -193,12 +193,12 @@
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
-    dataptr[DCTSIZE*4] = tmp10 - tmp11;
+    dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+    dataptr[DCTSIZE * 4] = tmp10 - tmp11;
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
-    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
-    dataptr[DCTSIZE*6] = tmp13 - z1;
+    dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+    dataptr[DCTSIZE * 6] = tmp13 - z1;
 
     /* Odd part */
 
@@ -215,10 +215,10 @@
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
-    dataptr[DCTSIZE*3] = z13 - z2;
-    dataptr[DCTSIZE*1] = z11 + z4;
-    dataptr[DCTSIZE*7] = z11 - z4;
+    dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+    dataptr[DCTSIZE * 3] = z13 - z2;
+    dataptr[DCTSIZE * 1] = z11 + z4;
+    dataptr[DCTSIZE * 7] = z11 - z4;
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/jfdctint.c b/jfdctint.c
index 169bb94..c0391a9 100644
--- a/jfdctint.c
+++ b/jfdctint.c
@@ -93,18 +93,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -129,9 +129,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -140,7 +140,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM *data)
+jpeg_fdct_islow(DCTELEM *data)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   JLONG tmp10, tmp11, tmp12, tmp13;
@@ -154,7 +154,7 @@
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -173,14 +173,14 @@
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
-    dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+    dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-                                   CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-                                   CONST_BITS-PASS1_BITS);
+    dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+                                  CONST_BITS - PASS1_BITS);
+    dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+                                  CONST_BITS - PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
@@ -197,18 +197,18 @@
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
 
-    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+    dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+    dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+    dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
 
     dataptr += DCTSIZE;         /* advance pointer to next row */
   }
@@ -219,15 +219,15 @@
    */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
@@ -238,14 +238,16 @@
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
-    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+    dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-                                           CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE * 2] =
+      (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+                       CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 6] =
+      (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+                       CONST_BITS + PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
@@ -262,22 +264,22 @@
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
 
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
-                                           CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,
+                                            CONST_BITS + PASS1_BITS);
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/jidctflt.c b/jidctflt.c
index 68c521e..4780206 100644
--- a/jidctflt.c
+++ b/jidctflt.c
@@ -61,7 +61,7 @@
  * entry; produce a float result.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((FAST_FLOAT) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((FAST_FLOAT)(coef)) * (quantval))
 
 
 /*
@@ -69,9 +69,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -83,12 +83,12 @@
   JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  #define _0_125 ((FLOAT_MULT_TYPE)0.125)
+#define _0_125 ((FLOAT_MULT_TYPE)0.125)
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;
+  quantptr = (FLOAT_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -100,22 +100,22 @@
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],
-                                    quantptr[DCTSIZE*0] * _0_125);
+      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE * 0],
+                                    quantptr[DCTSIZE * 0] * _0_125);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -125,16 +125,16 @@
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] * _0_125);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] * _0_125);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] * _0_125);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] * _0_125);
 
     tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
 
     tmp13 = tmp1 + tmp3;        /* phases 5-3 */
-    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
+    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT)1.414213562) - tmp13; /* 2*c4 */
 
     tmp0 = tmp10 + tmp13;       /* phase 2 */
     tmp3 = tmp10 - tmp13;
@@ -143,10 +143,10 @@
 
     /* Odd part */
 
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] * _0_125);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] * _0_125);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] * _0_125);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] * _0_125);
 
     z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
@@ -154,24 +154,24 @@
     z12 = tmp4 - tmp7;
 
     tmp7 = z11 + z13;           /* phase 5 */
-    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
+    tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562); /* 2*c4 */
 
-    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
-    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+    z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+    tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 - tmp5;
 
-    wsptr[DCTSIZE*0] = tmp0 + tmp7;
-    wsptr[DCTSIZE*7] = tmp0 - tmp7;
-    wsptr[DCTSIZE*1] = tmp1 + tmp6;
-    wsptr[DCTSIZE*6] = tmp1 - tmp6;
-    wsptr[DCTSIZE*2] = tmp2 + tmp5;
-    wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*3] = tmp3 + tmp4;
-    wsptr[DCTSIZE*4] = tmp3 - tmp4;
+    wsptr[DCTSIZE * 0] = tmp0 + tmp7;
+    wsptr[DCTSIZE * 7] = tmp0 - tmp7;
+    wsptr[DCTSIZE * 1] = tmp1 + tmp6;
+    wsptr[DCTSIZE * 6] = tmp1 - tmp6;
+    wsptr[DCTSIZE * 2] = tmp2 + tmp5;
+    wsptr[DCTSIZE * 5] = tmp2 - tmp5;
+    wsptr[DCTSIZE * 3] = tmp3 + tmp4;
+    wsptr[DCTSIZE * 4] = tmp3 - tmp4;
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -192,12 +192,12 @@
     /* Even part */
 
     /* Apply signed->unsigned and prepare float->int conversion */
-    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
     tmp10 = z5 + wsptr[4];
     tmp11 = z5 - wsptr[4];
 
     tmp13 = wsptr[2] + wsptr[6];
-    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
+    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT)1.414213562) - tmp13;
 
     tmp0 = tmp10 + tmp13;
     tmp3 = tmp10 - tmp13;
@@ -212,11 +212,11 @@
     z12 = wsptr[1] - wsptr[7];
 
     tmp7 = z11 + z13;
-    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
+    tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562);
 
-    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
-    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+    z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+    tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;
     tmp5 = tmp11 - tmp6;
@@ -224,14 +224,14 @@
 
     /* Final output stage: float->int conversion and range-limit */
 
-    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
-    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
-    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
-    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
-    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
-    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
-    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
-    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
+    outptr[0] = range_limit[((int)(tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int)(tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int)(tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int)(tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int)(tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int)(tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int)(tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int)(tmp3 - tmp4)) & RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
diff --git a/jidctfst.c b/jidctfst.c
index 10db739..89a20c9 100644
--- a/jidctfst.c
+++ b/jidctfst.c
@@ -92,10 +92,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_1_082392200  ((JLONG)  277)         /* FIX(1.082392200) */
-#define FIX_1_414213562  ((JLONG)  362)         /* FIX(1.414213562) */
-#define FIX_1_847759065  ((JLONG)  473)         /* FIX(1.847759065) */
-#define FIX_2_613125930  ((JLONG)  669)         /* FIX(2.613125930) */
+#define FIX_1_082392200  ((JLONG)277)           /* FIX(1.082392200) */
+#define FIX_1_414213562  ((JLONG)362)           /* FIX(1.414213562) */
+#define FIX_1_847759065  ((JLONG)473)           /* FIX(1.847759065) */
+#define FIX_2_613125930  ((JLONG)669)           /* FIX(2.613125930) */
 #else
 #define FIX_1_082392200  FIX(1.082392200)
 #define FIX_1_414213562  FIX(1.414213562)
@@ -111,7 +111,7 @@
 
 #ifndef USE_ACCURATE_ROUNDING
 #undef DESCALE
-#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
 #endif
 
 
@@ -119,7 +119,7 @@
  * descale to yield a DCTELEM result.
  */
 
-#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
 
 
 /* Dequantize a coefficient by multiplying it by the multiplier-table
@@ -129,10 +129,10 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((IFAST_MULT_TYPE)(coef)) * (quantval))
 #else
-#define DEQUANTIZE(coef,quantval)  \
-        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#define DEQUANTIZE(coef, quantval) \
+  DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
 #endif
 
 
@@ -147,19 +147,19 @@
 #else
 #define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
 #endif
-#define IRIGHT_SHIFT(x,shft)  \
-    ((ishift_temp = (x)) < 0 ? \
-     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
-     (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
 #ifdef USE_ACCURATE_ROUNDING
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
 #else
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
+#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT(x, n))
 #endif
 
 
@@ -168,9 +168,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -188,7 +188,7 @@
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+  quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -200,21 +200,21 @@
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+      int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -224,10 +224,10 @@
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
@@ -242,10 +242,10 @@
 
     /* Odd part */
 
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
@@ -257,20 +257,20 @@
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+    tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 + tmp5;
 
-    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+    wsptr[DCTSIZE * 0] = (int)(tmp0 + tmp7);
+    wsptr[DCTSIZE * 7] = (int)(tmp0 - tmp7);
+    wsptr[DCTSIZE * 1] = (int)(tmp1 + tmp6);
+    wsptr[DCTSIZE * 6] = (int)(tmp1 - tmp6);
+    wsptr[DCTSIZE * 2] = (int)(tmp2 + tmp5);
+    wsptr[DCTSIZE * 5] = (int)(tmp2 - tmp5);
+    wsptr[DCTSIZE * 4] = (int)(tmp3 + tmp4);
+    wsptr[DCTSIZE * 3] = (int)(tmp3 - tmp4);
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -296,8 +296,8 @@
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval =
+        range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -315,12 +315,12 @@
 
     /* Even part */
 
-    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
+    tmp10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]);
+    tmp11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]);
 
-    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
-            - tmp13;
+    tmp13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]);
+    tmp12 =
+      MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - tmp13;
 
     tmp0 = tmp10 + tmp13;
     tmp3 = tmp10 - tmp13;
@@ -329,17 +329,17 @@
 
     /* Odd part */
 
-    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
+    z13 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3];
+    z10 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3];
+    z11 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7];
+    z12 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7];
 
     tmp7 = z11 + z13;           /* phase 5 */
     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+    tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
@@ -347,22 +347,22 @@
 
     /* Final output stage: scale down by a factor of 8 and range-limit */
 
-    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] =
+      range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[7] =
+      range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[1] =
+      range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[6] =
+      range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[2] =
+      range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[5] =
+      range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[4] =
+      range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[3] =
+      range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + 3) & RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
diff --git a/jidctint.c b/jidctint.c
index 3ac6caf..5557342 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -115,18 +115,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -151,9 +151,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -162,7 +162,7 @@
  * are 16 bits or less, so either int or short multiply will work.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
 
 
 /*
@@ -170,9 +170,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3;
   JLONG tmp10, tmp11, tmp12, tmp13;
@@ -191,7 +191,7 @@
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -203,22 +203,22 @@
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                             quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -229,15 +229,15 @@
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
 
     tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
     tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
@@ -251,10 +251,10 @@
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
@@ -266,10 +266,10 @@
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
@@ -281,14 +281,14 @@
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -314,8 +314,8 @@
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -334,15 +334,15 @@
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
-    tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
+    tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
+    tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
 
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
@@ -353,10 +353,10 @@
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = (JLONG) wsptr[7];
-    tmp1 = (JLONG) wsptr[5];
-    tmp2 = (JLONG) wsptr[3];
-    tmp3 = (JLONG) wsptr[1];
+    tmp0 = (JLONG)wsptr[7];
+    tmp1 = (JLONG)wsptr[5];
+    tmp2 = (JLONG)wsptr[3];
+    tmp3 = (JLONG)wsptr[1];
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
@@ -368,10 +368,10 @@
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
@@ -383,30 +383,30 @@
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -424,9 +424,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
   JLONG z1, z2, z3;
@@ -436,25 +436,25 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[7*7];   /* buffers data between passes */
+  int workspace[7 * 7];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -468,15 +468,15 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
     tmp0 = tmp1 - tmp2;
     tmp1 += tmp2;
-    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
     tmp1 += tmp2;
     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
     tmp0 += z2;
@@ -484,13 +484,13 @@
 
     /* Final output stage */
 
-    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 7 rows from work array, store into output array. */
@@ -502,12 +502,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -521,15 +521,15 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
     tmp0 = tmp1 - tmp2;
     tmp1 += tmp2;
-    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
     tmp1 += tmp2;
     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
     tmp0 += z2;
@@ -537,27 +537,27 @@
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 7;         /* advance pointer to next row */
   }
@@ -573,9 +573,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
@@ -585,35 +585,35 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[6*6];   /* buffers data between passes */
+  int workspace[6 * 6];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
-    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
-    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -621,12 +621,12 @@
 
     /* Final output stage */
 
-    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[6*1] = (int) (tmp11 + tmp1);
-    wsptr[6*4] = (int) (tmp11 - tmp1);
-    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 1] = (int)(tmp11 + tmp1);
+    wsptr[6 * 4] = (int)(tmp11 - tmp1);
+    wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 6 rows from work array, store into output array. */
@@ -638,22 +638,22 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (JLONG) wsptr[4];
+    tmp2 = (JLONG)wsptr[4];
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
     tmp11 = tmp0 - tmp10 - tmp10;
-    tmp10 = (JLONG) wsptr[2];
+    tmp10 = (JLONG)wsptr[2];
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -661,24 +661,24 @@
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 6;         /* advance pointer to next row */
   }
@@ -694,9 +694,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
@@ -706,23 +706,23 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[5*5];   /* buffers data between passes */
+  int workspace[5 * 5];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
@@ -732,8 +732,8 @@
 
     /* Odd part */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -741,11 +741,11 @@
 
     /* Final output stage */
 
-    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 5 rows from work array, store into output array. */
@@ -757,10 +757,10 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
-    tmp0 = (JLONG) wsptr[2];
-    tmp1 = (JLONG) wsptr[4];
+    tmp0 = (JLONG)wsptr[2];
+    tmp1 = (JLONG)wsptr[4];
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
@@ -770,8 +770,8 @@
 
     /* Odd part */
 
-    z2 = (JLONG) wsptr[1];
-    z3 = (JLONG) wsptr[3];
+    z2 = (JLONG)wsptr[1];
+    z3 = (JLONG)wsptr[3];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -779,21 +779,21 @@
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 5;         /* advance pointer to next row */
   }
@@ -809,9 +809,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JCOEFPTR inptr;
@@ -820,36 +820,36 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[3*3];   /* buffers data between passes */
+  int workspace[3 * 3];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
 
-    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 3 rows from work array, store into output array. */
@@ -861,29 +861,29 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (JLONG) wsptr[2];
+    tmp2 = (JLONG)wsptr[2];
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = (JLONG) wsptr[1];
+    tmp12 = (JLONG)wsptr[1];
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 3;         /* advance pointer to next row */
   }
@@ -899,9 +899,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG z1, z2, z3, z4;
@@ -911,25 +911,25 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*9];   /* buffers data between passes */
+  int workspace[8 * 9];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -949,12 +949,12 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
-    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
 
     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
@@ -966,15 +966,15 @@
 
     /* Final output stage */
 
-    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 9 rows from work array, store into output array. */
@@ -986,12 +986,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -1011,12 +1011,12 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
-    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
 
     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
@@ -1028,33 +1028,33 @@
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1070,9 +1070,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
@@ -1083,32 +1083,32 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*10];  /* buffers data between passes */
+  int workspace[8 * 10];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
     tmp11 = z3 - z2;
 
     tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
-                        CONST_BITS-PASS1_BITS);  /* c0 = (c4-c8)*2 */
+                        CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1121,10 +1121,10 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1148,16 +1148,16 @@
 
     /* Final output stage */
 
-    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2] = (int) (tmp22 + tmp12);
-    wsptr[8*7] = (int) (tmp22 - tmp12);
-    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2] = (int)(tmp22 + tmp12);
+    wsptr[8 * 7] = (int)(tmp22 - tmp12);
+    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 10 rows from work array, store into output array. */
@@ -1169,9 +1169,9 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
@@ -1179,8 +1179,8 @@
 
     tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1193,11 +1193,11 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (JLONG) wsptr[7];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1220,36 +1220,36 @@
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1265,9 +1265,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1278,30 +1278,30 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*11];  /* buffers data between passes */
+  int workspace[8 * 11];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
     z4 = z1 + z3;
-    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1316,10 +1316,10 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1331,26 +1331,26 @@
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
-    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
-    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 11 rows from work array, store into output array. */
@@ -1362,17 +1362,17 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
     z4 = z1 + z3;
-    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1387,10 +1387,10 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1402,48 +1402,48 @@
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
-    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
-    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1459,9 +1459,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1472,32 +1472,32 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*12];  /* buffers data between passes */
+  int workspace[8 * 12];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
     z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
@@ -1517,19 +1517,19 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
-    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
 
     tmp10 = z1 + z3;
     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
-    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
@@ -1543,18 +1543,18 @@
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 12 rows from work array, store into output array. */
@@ -1566,19 +1566,19 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
 
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = (JLONG) wsptr[2];
+    z1 = (JLONG)wsptr[2];
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z2 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[6];
     z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
@@ -1598,19 +1598,19 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
-    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
 
     tmp10 = z1 + z3;
     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
-    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
@@ -1624,42 +1624,42 @@
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1675,9 +1675,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1688,25 +1688,25 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*13];  /* buffers data between passes */
+  int workspace[8 * 13];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1721,22 +1721,22 @@
     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
 
     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
-    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
 
     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
 
-    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
-    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
+    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
 
     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
@@ -1744,13 +1744,13 @@
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
-    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
-    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
     tmp11 += tmp14;
     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
-    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
     tmp12 += tmp14;
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
@@ -1763,19 +1763,19 @@
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 13 rows from work array, store into output array. */
@@ -1787,12 +1787,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[4];
-    z4 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[4];
+    z4 = (JLONG)wsptr[6];
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1807,22 +1807,22 @@
     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
 
     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
-    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
 
     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
 
-    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
-    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
+    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
 
     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
@@ -1830,13 +1830,13 @@
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
-    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
-    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
     tmp11 += tmp14;
     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
-    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
     tmp12 += tmp14;
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
@@ -1849,45 +1849,45 @@
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1903,9 +1903,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1916,22 +1916,22 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*14];  /* buffers data between passes */
+  int workspace[8 * 14];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -1941,10 +1941,10 @@
     tmp12 = z1 - z4;
 
     tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
-                        CONST_BITS-PASS1_BITS);  /* c0 = (c4+c12-c8)*2 */
+                        CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
@@ -1962,10 +1962,10 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
     tmp13 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
@@ -1978,7 +1978,7 @@
     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
     tmp16 += tmp15;
     z1    += z4;
-    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    z4    = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13;  /* -c13 */
     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
@@ -1989,20 +1989,20 @@
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) (tmp23 + tmp13);
-    wsptr[8*10] = (int) (tmp23 - tmp13);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)(tmp23 + tmp13);
+    wsptr[8 * 10] = (int)(tmp23 - tmp13);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 14 rows from work array, store into output array. */
@@ -2014,9 +2014,9 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -2027,8 +2027,8 @@
 
     tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[6];
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
@@ -2046,10 +2046,10 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
     z4 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
@@ -2061,7 +2061,7 @@
     z1    -= z2;
     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
     tmp16 += tmp15;
-    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4;     /* -c13 */
     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
@@ -2072,48 +2072,48 @@
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -2129,9 +2129,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2142,25 +2142,25 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*15];  /* buffers data between passes */
+  int workspace[8 * 15];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2195,19 +2195,19 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
 
-    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
-    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
+    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
     z2 = z1 - z4;
     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
 
@@ -2220,21 +2220,21 @@
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 15 rows from work array, store into output array. */
@@ -2246,12 +2246,12 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[4];
-    z4 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[4];
+    z4 = (JLONG)wsptr[6];
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2286,19 +2286,19 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z4 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z4 = (JLONG)wsptr[5];
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = (JLONG) wsptr[7];
+    z4 = (JLONG)wsptr[7];
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
 
-    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
-    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
+    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
     z2 = z1 - z4;
     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
 
@@ -2311,51 +2311,51 @@
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -2371,9 +2371,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2384,23 +2384,23 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*16];  /* buffers data between passes */
+  int workspace[8 * 16];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
+    tmp0 += 1 << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2409,8 +2409,8 @@
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2431,10 +2431,10 @@
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z1 + z3;
 
@@ -2455,13 +2455,13 @@
     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
     z2    += z4;
-    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
     tmp1  += z1;
     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
-    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
     tmp12 += z2;
-    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
     tmp2  += z2;
     tmp3  += z2;
     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
@@ -2470,22 +2470,22 @@
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
-    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
-    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 16 rows from work array, store into output array. */
@@ -2497,10 +2497,10 @@
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (JLONG) wsptr[4];
+    z1 = (JLONG)wsptr[4];
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2509,8 +2509,8 @@
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[6];
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2531,10 +2531,10 @@
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z1 + z3;
 
@@ -2555,13 +2555,13 @@
     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
     z2    += z4;
-    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
     tmp1  += z1;
     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
-    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
     tmp12 += z2;
-    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
     tmp2  += z2;
     tmp3  += z2;
     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
@@ -2570,54 +2570,54 @@
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
diff --git a/jidctred.c b/jidctred.c
index 7a81803..1ff352f 100644
--- a/jidctred.c
+++ b/jidctred.c
@@ -58,20 +58,20 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_211164243  ((JLONG)  1730)        /* FIX(0.211164243) */
-#define FIX_0_509795579  ((JLONG)  4176)        /* FIX(0.509795579) */
-#define FIX_0_601344887  ((JLONG)  4926)        /* FIX(0.601344887) */
-#define FIX_0_720959822  ((JLONG)  5906)        /* FIX(0.720959822) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_850430095  ((JLONG)  6967)        /* FIX(0.850430095) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_061594337  ((JLONG)  8697)        /* FIX(1.061594337) */
-#define FIX_1_272758580  ((JLONG)  10426)       /* FIX(1.272758580) */
-#define FIX_1_451774981  ((JLONG)  11893)       /* FIX(1.451774981) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_2_172734803  ((JLONG)  17799)       /* FIX(2.172734803) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_624509785  ((JLONG)  29692)       /* FIX(3.624509785) */
+#define FIX_0_211164243  ((JLONG)1730)          /* FIX(0.211164243) */
+#define FIX_0_509795579  ((JLONG)4176)          /* FIX(0.509795579) */
+#define FIX_0_601344887  ((JLONG)4926)          /* FIX(0.601344887) */
+#define FIX_0_720959822  ((JLONG)5906)          /* FIX(0.720959822) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_850430095  ((JLONG)6967)          /* FIX(0.850430095) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_061594337  ((JLONG)8697)          /* FIX(1.061594337) */
+#define FIX_1_272758580  ((JLONG)10426)         /* FIX(1.272758580) */
+#define FIX_1_451774981  ((JLONG)11893)         /* FIX(1.451774981) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_2_172734803  ((JLONG)17799)         /* FIX(2.172734803) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_624509785  ((JLONG)29692)         /* FIX(3.624509785) */
 #else
 #define FIX_0_211164243  FIX(0.211164243)
 #define FIX_0_509795579  FIX(0.509795579)
@@ -98,9 +98,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -109,7 +109,7 @@
  * are 16 bits or less, so either int or short multiply will work.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
 
 
 /*
@@ -118,9 +118,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JLONG z1, z2, z3, z4;
@@ -130,69 +130,73 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*4];     /* buffers data between passes */
+  int workspace[DCTSIZE * 4];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
     /* Don't bother to process column 4, because second pass won't use it */
-    if (ctr == DCTSIZE-4)
+    if (ctr == DCTSIZE - 4)
       continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-        inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 5] == 0 &&
+        inptr[DCTSIZE * 6] == 0 && inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                                        quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
 
       continue;
     }
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS + 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
-    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
+    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, -FIX_0_765366865);
 
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
 
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+    tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+           MULTIPLY(z2,  FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+           MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+           MULTIPLY(z4,  FIX_1_061594337);  /* sqrt(2) * ( c5+c7) */
 
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+    tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+           MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+           MULTIPLY(z3,  FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+           MULTIPLY(z4,  FIX_2_562915447);  /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
+    wsptr[DCTSIZE * 0] =
+      (int)DESCALE(tmp10 + tmp2, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 3] =
+      (int)DESCALE(tmp10 - tmp2, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 1] =
+      (int)DESCALE(tmp12 + tmp0, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 2] =
+      (int)DESCALE(tmp12 - tmp0, CONST_BITS - PASS1_BITS + 1);
   }
 
   /* Pass 2: process 4 rows from work array, store into output array. */
@@ -206,8 +210,8 @@
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -221,45 +225,45 @@
 
     /* Even part */
 
-    tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
+    tmp0 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 1);
 
-    tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-         + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
+    tmp2 = MULTIPLY((JLONG)wsptr[2],  FIX_1_847759065) +
+           MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865);
 
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[7];
-    z2 = (JLONG) wsptr[5];
-    z3 = (JLONG) wsptr[3];
-    z4 = (JLONG) wsptr[1];
+    z1 = (JLONG)wsptr[7];
+    z2 = (JLONG)wsptr[5];
+    z3 = (JLONG)wsptr[3];
+    z4 = (JLONG)wsptr[1];
 
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+    tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+           MULTIPLY(z2,  FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+           MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+           MULTIPLY(z4,  FIX_1_061594337);  /* sqrt(2) * ( c5+c7) */
 
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+    tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+           MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+           MULTIPLY(z3,  FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+           MULTIPLY(z4,  FIX_2_562915447);  /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp2,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)DESCALE(tmp10 - tmp2,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp12 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)DESCALE(tmp12 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -272,9 +276,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp10, z1;
   JCOEFPTR inptr;
@@ -283,50 +287,52 @@
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*2];     /* buffers data between passes */
+  int workspace[DCTSIZE * 2];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
     /* Don't bother to process columns 2,4,6 */
-    if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
+    if (ctr == DCTSIZE - 2 || ctr == DCTSIZE - 4 || ctr == DCTSIZE - 6)
       continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 3] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                                        quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
 
       continue;
     }
 
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = LEFT_SHIFT(z1, CONST_BITS+2);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp10 = LEFT_SHIFT(z1, CONST_BITS + 2);
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    tmp0 = MULTIPLY(z1, -FIX_0_720959822);  /* sqrt(2) * ( c7-c5+c3-c1) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp0 += MULTIPLY(z1, FIX_0_850430095);  /* sqrt(2) * (-c1+c3+c5+c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp0 += MULTIPLY(z1, -FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    tmp0 += MULTIPLY(z1, FIX_3_624509785);  /* sqrt(2) * ( c1+c3+c5+c7) */
 
     /* Final output stage */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
+    wsptr[DCTSIZE * 0] =
+      (int)DESCALE(tmp10 + tmp0, CONST_BITS - PASS1_BITS + 2);
+    wsptr[DCTSIZE * 1] =
+      (int)DESCALE(tmp10 - tmp0, CONST_BITS - PASS1_BITS + 2);
   }
 
   /* Pass 2: process 2 rows from work array, store into output array. */
@@ -339,8 +345,8 @@
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -352,23 +358,23 @@
 
     /* Even part */
 
-    tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
+    tmp10 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 2);
 
     /* Odd part */
 
-    tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-         + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-         + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-         + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    tmp0 = MULTIPLY((JLONG)wsptr[7], -FIX_0_720959822) + /* sqrt(2) * ( c7-c5+c3-c1) */
+           MULTIPLY((JLONG)wsptr[5],  FIX_0_850430095) + /* sqrt(2) * (-c1+c3+c5+c7) */
+           MULTIPLY((JLONG)wsptr[3], -FIX_1_272758580) + /* sqrt(2) * (-c1+c3-c5-c7) */
+           MULTIPLY((JLONG)wsptr[1],  FIX_3_624509785);  /* sqrt(2) * ( c1+c3+c5+c7) */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3+2)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3+2)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 2) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp10 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 2) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -381,9 +387,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   int dcval;
   ISLOW_MULT_TYPE *quantptr;
@@ -393,9 +399,9 @@
   /* We hardly need an inverse DCT routine for this: just take the
    * average pixel value, which is one-eighth of the DC coefficient.
    */
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((JLONG) dcval, 3);
+  dcval = (int)DESCALE((JLONG)dcval, 3);
 
   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
 }
diff --git a/jinclude.h b/jinclude.h
index d461a1a..c1bcf7d 100644
--- a/jinclude.h
+++ b/jinclude.h
@@ -61,14 +61,18 @@
 #ifdef NEED_BSD_STRINGS
 
 #include <strings.h>
-#define MEMZERO(target,size)    bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest,src,size)  bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+#define MEMZERO(target, size) \
+  bzero((void *)(target), (size_t)(size))
+#define MEMCOPY(dest, src, size) \
+  bcopy((const void *)(src), (void *)(dest), (size_t)(size))
 
 #else /* not BSD, assume ANSI/SysV string lib */
 
 #include <string.h>
-#define MEMZERO(target,size)    memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest,src,size)  memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+#define MEMZERO(target, size) \
+  memset((void *)(target), 0, (size_t)(size))
+#define MEMCOPY(dest, src, size) \
+  memcpy((void *)(dest), (const void *)(src), (size_t)(size))
 
 #endif
 
@@ -78,7 +82,7 @@
  * CAUTION: argument order is different from underlying functions!
  */
 
-#define JFREAD(file,buf,sizeofbuf)  \
-  ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
-#define JFWRITE(file,buf,sizeofbuf)  \
-  ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+#define JFREAD(file, buf, sizeofbuf) \
+  ((size_t)fread((void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
+#define JFWRITE(file, buf, sizeofbuf) \
+  ((size_t)fwrite((const void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
diff --git a/jmemmgr.c b/jmemmgr.c
index 8dfb633..192a58c 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -39,13 +39,13 @@
 
 #ifndef NO_GETENV
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
-extern char *getenv (const char *name);
+extern char *getenv(const char *name);
 #endif
 #endif
 
 
 LOCAL(size_t)
-round_up_pow2 (size_t a, size_t b)
+round_up_pow2(size_t a, size_t b)
 /* a rounded up to the next multiple of b, i.e. ceil(a/b)*b */
 /* Assumes a >= 0, b > 0, and b is a power of 2 */
 {
@@ -89,7 +89,9 @@
 #ifndef WITH_SIMD
 #define ALIGN_SIZE  sizeof(double)
 #else
-#define ALIGN_SIZE  16 /* Most SIMD implementations require this */
+#define ALIGN_SIZE  32 /* Most of the SIMD instructions we support require
+                          16-byte (128-bit) alignment, but AVX2 requires
+                          32-byte alignment. */
 #endif
 #endif
 
@@ -104,7 +106,7 @@
 typedef struct small_pool_struct *small_pool_ptr;
 
 typedef struct small_pool_struct {
-  small_pool_ptr next;  /* next in list of pools */
+  small_pool_ptr next;          /* next in list of pools */
   size_t bytes_used;            /* how many bytes already used within pool */
   size_t bytes_left;            /* bytes still available in this pool */
 } small_pool_hdr;
@@ -112,7 +114,7 @@
 typedef struct large_pool_struct *large_pool_ptr;
 
 typedef struct large_pool_struct {
-  large_pool_ptr next;  /* next in list of pools */
+  large_pool_ptr next;          /* next in list of pools */
   size_t bytes_used;            /* how many bytes already used within pool */
   size_t bytes_left;            /* bytes still available in this pool */
 } large_pool_hdr;
@@ -191,9 +193,9 @@
 #ifdef MEM_STATS                /* optional extra stuff for statistics */
 
 LOCAL(void)
-print_mem_stats (j_common_ptr cinfo, int pool_id)
+print_mem_stats(j_common_ptr cinfo, int pool_id)
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr shdr_ptr;
   large_pool_ptr lhdr_ptr;
 
@@ -206,15 +208,13 @@
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
        lhdr_ptr = lhdr_ptr->next) {
-    fprintf(stderr, "  Large chunk used %ld\n",
-            (long) lhdr_ptr->bytes_used);
+    fprintf(stderr, "  Large chunk used %ld\n", (long)lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
        shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-            (long) shdr_ptr->bytes_used,
-            (long) shdr_ptr->bytes_left);
+            (long)shdr_ptr->bytes_used, (long)shdr_ptr->bytes_left);
   }
 }
 
@@ -222,7 +222,7 @@
 
 
 LOCAL(void)
-out_of_memory (j_common_ptr cinfo, int which)
+out_of_memory(j_common_ptr cinfo, int which)
 /* Report an out-of-memory error and stop execution */
 /* If we compiled MEM_STATS support, report alloc requests before dying */
 {
@@ -250,26 +250,24 @@
  * adjustment.
  */
 
-static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
-{
-        1600,                   /* first PERMANENT pool */
-        16000                   /* first IMAGE pool */
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] = {
+  1600,                         /* first PERMANENT pool */
+  16000                         /* first IMAGE pool */
 };
 
-static const size_t extra_pool_slop[JPOOL_NUMPOOLS] =
-{
-        0,                      /* additional PERMANENT pools */
-        5000                    /* additional IMAGE pools */
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = {
+  0,                            /* additional PERMANENT pools */
+  5000                          /* additional IMAGE pools */
 };
 
 #define MIN_SLOP  50            /* greater than 0 to avoid futile looping */
 
 
 METHODDEF(void *)
-alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 /* Allocate a "small" object */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
   char *data_ptr;
   size_t min_request, slop;
@@ -313,11 +311,11 @@
     else
       slop = extra_pool_slop[pool_id];
     /* Don't ask for more than MAX_ALLOC_CHUNK */
-    if (slop > (size_t) (MAX_ALLOC_CHUNK-min_request))
-      slop = (size_t) (MAX_ALLOC_CHUNK-min_request);
+    if (slop > (size_t)(MAX_ALLOC_CHUNK - min_request))
+      slop = (size_t)(MAX_ALLOC_CHUNK - min_request);
     /* Try to get space, if fail reduce slop and try again */
     for (;;) {
-      hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop);
+      hdr_ptr = (small_pool_ptr)jpeg_get_small(cinfo, min_request + slop);
       if (hdr_ptr != NULL)
         break;
       slop /= 2;
@@ -336,7 +334,7 @@
   }
 
   /* OK, allocate the object from the current pool */
-  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
   data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
@@ -344,7 +342,7 @@
   hdr_ptr->bytes_used += sizeofobject;
   hdr_ptr->bytes_left -= sizeofobject;
 
-  return (void *) data_ptr;
+  return (void *)data_ptr;
 }
 
 
@@ -362,10 +360,10 @@
  */
 
 METHODDEF(void *)
-alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_large(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 /* Allocate a "large" object */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   large_pool_ptr hdr_ptr;
   char *data_ptr;
 
@@ -390,9 +388,9 @@
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
-  hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-                                            sizeof(large_pool_hdr) +
-                                            ALIGN_SIZE - 1);
+  hdr_ptr = (large_pool_ptr)jpeg_get_large(cinfo, sizeofobject +
+                                           sizeof(large_pool_hdr) +
+                                           ALIGN_SIZE - 1);
   if (hdr_ptr == NULL)
     out_of_memory(cinfo, 4);    /* jpeg_get_large failed */
   mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
@@ -407,12 +405,12 @@
   hdr_ptr->bytes_left = 0;
   mem->large_list[pool_id] = hdr_ptr;
 
-  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
   data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
 
-  return (void *) data_ptr;
+  return (void *)data_ptr;
 }
 
 
@@ -433,11 +431,11 @@
  */
 
 METHODDEF(JSAMPARRAY)
-alloc_sarray (j_common_ptr cinfo, int pool_id,
-              JDIMENSION samplesperrow, JDIMENSION numrows)
+alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+             JDIMENSION numrows)
 /* Allocate a 2-D sample array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   JSAMPARRAY result;
   JSAMPROW workspace;
   JDIMENSION rowsperchunk, currow, i;
@@ -456,27 +454,27 @@
                                                            sizeof(JSAMPLE));
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
-          ((long) samplesperrow * sizeof(JSAMPLE));
+  ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+          ((long)samplesperrow * sizeof(JSAMPLE));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
-  if (ltemp < (long) numrows)
-    rowsperchunk = (JDIMENSION) ltemp;
+  if (ltemp < (long)numrows)
+    rowsperchunk = (JDIMENSION)ltemp;
   else
     rowsperchunk = numrows;
   mem->last_rowsperchunk = rowsperchunk;
 
   /* Get space for row pointers (small object) */
-  result = (JSAMPARRAY) alloc_small(cinfo, pool_id,
-                                    (size_t) (numrows * sizeof(JSAMPROW)));
+  result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+                                   (size_t)(numrows * sizeof(JSAMPROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-    workspace = (JSAMPROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
-                  * sizeof(JSAMPLE)));
+    workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
+      (size_t)((size_t)rowsperchunk * (size_t)samplesperrow *
+               sizeof(JSAMPLE)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += samplesperrow;
@@ -493,11 +491,11 @@
  */
 
 METHODDEF(JBLOCKARRAY)
-alloc_barray (j_common_ptr cinfo, int pool_id,
-              JDIMENSION blocksperrow, JDIMENSION numrows)
+alloc_barray(j_common_ptr cinfo, int pool_id, JDIMENSION blocksperrow,
+             JDIMENSION numrows)
 /* Allocate a 2-D coefficient-block array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   JBLOCKARRAY result;
   JBLOCKROW workspace;
   JDIMENSION rowsperchunk, currow, i;
@@ -508,27 +506,27 @@
     out_of_memory(cinfo, 6);    /* safety check */
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
-          ((long) blocksperrow * sizeof(JBLOCK));
+  ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+          ((long)blocksperrow * sizeof(JBLOCK));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
-  if (ltemp < (long) numrows)
-    rowsperchunk = (JDIMENSION) ltemp;
+  if (ltemp < (long)numrows)
+    rowsperchunk = (JDIMENSION)ltemp;
   else
     rowsperchunk = numrows;
   mem->last_rowsperchunk = rowsperchunk;
 
   /* Get space for row pointers (small object) */
-  result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
-                                     (size_t) (numrows * sizeof(JBLOCKROW)));
+  result = (JBLOCKARRAY)alloc_small(cinfo, pool_id,
+                                    (size_t)(numrows * sizeof(JBLOCKROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-    workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
-                  * sizeof(JBLOCK)));
+    workspace = (JBLOCKROW)alloc_large(cinfo, pool_id,
+        (size_t)((size_t)rowsperchunk * (size_t)blocksperrow *
+                  sizeof(JBLOCK)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += blocksperrow;
@@ -577,12 +575,12 @@
 
 
 METHODDEF(jvirt_sarray_ptr)
-request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-                     JDIMENSION samplesperrow, JDIMENSION numrows,
-                     JDIMENSION maxaccess)
+request_virt_sarray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION samplesperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
 /* Request a virtual 2-D sample array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   jvirt_sarray_ptr result;
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -590,8 +588,8 @@
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
-  result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id,
-                                          sizeof(struct jvirt_sarray_control));
+  result = (jvirt_sarray_ptr)alloc_small(cinfo, pool_id,
+                                         sizeof(struct jvirt_sarray_control));
 
   result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
@@ -607,12 +605,12 @@
 
 
 METHODDEF(jvirt_barray_ptr)
-request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-                     JDIMENSION blocksperrow, JDIMENSION numrows,
-                     JDIMENSION maxaccess)
+request_virt_barray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION blocksperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
 /* Request a virtual 2-D coefficient-block array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   jvirt_barray_ptr result;
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -620,8 +618,8 @@
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
-  result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id,
-                                          sizeof(struct jvirt_barray_control));
+  result = (jvirt_barray_ptr)alloc_small(cinfo, pool_id,
+                                         sizeof(struct jvirt_barray_control));
 
   result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
@@ -637,10 +635,10 @@
 
 
 METHODDEF(void)
-realize_virt_arrays (j_common_ptr cinfo)
+realize_virt_arrays(j_common_ptr cinfo)
 /* Allocate the in-memory buffers for any unrealized virtual arrays */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   size_t space_per_minheight, maximum_space, avail_mem;
   size_t minheights, max_minheights;
   jvirt_sarray_ptr sptr;
@@ -654,11 +652,11 @@
   maximum_space = 0;
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
-      size_t new_space = (long) sptr->rows_in_array *
-                         (long) sptr->samplesperrow * sizeof(JSAMPLE);
+      size_t new_space = (long)sptr->rows_in_array *
+                         (long)sptr->samplesperrow * sizeof(JSAMPLE);
 
-      space_per_minheight += (long) sptr->maxaccess *
-                             (long) sptr->samplesperrow * sizeof(JSAMPLE);
+      space_per_minheight += (long)sptr->maxaccess *
+                             (long)sptr->samplesperrow * sizeof(JSAMPLE);
       if (SIZE_MAX - maximum_space < new_space)
         out_of_memory(cinfo, 10);
       maximum_space += new_space;
@@ -666,11 +664,11 @@
   }
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
-      size_t new_space = (long) bptr->rows_in_array *
-                         (long) bptr->blocksperrow * sizeof(JBLOCK);
+      size_t new_space = (long)bptr->rows_in_array *
+                         (long)bptr->blocksperrow * sizeof(JBLOCK);
 
-      space_per_minheight += (long) bptr->maxaccess *
-                             (long) bptr->blocksperrow * sizeof(JBLOCK);
+      space_per_minheight += (long)bptr->maxaccess *
+                             (long)bptr->blocksperrow * sizeof(JBLOCK);
       if (SIZE_MAX - maximum_space < new_space)
         out_of_memory(cinfo, 11);
       maximum_space += new_space;
@@ -703,17 +701,17 @@
 
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
-      minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
+      minheights = ((long)sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
         /* This buffer fits in memory */
         sptr->rows_in_mem = sptr->rows_in_array;
       } else {
         /* It doesn't fit in memory, create backing store. */
-        sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
-        jpeg_open_backing_store(cinfo, & sptr->b_s_info,
-                                (long) sptr->rows_in_array *
-                                (long) sptr->samplesperrow *
-                                (long) sizeof(JSAMPLE));
+        sptr->rows_in_mem = (JDIMENSION)(max_minheights * sptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &sptr->b_s_info,
+                                (long)sptr->rows_in_array *
+                                (long)sptr->samplesperrow *
+                                (long)sizeof(JSAMPLE));
         sptr->b_s_open = TRUE;
       }
       sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
@@ -727,17 +725,17 @@
 
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
-      minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
+      minheights = ((long)bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
         /* This buffer fits in memory */
         bptr->rows_in_mem = bptr->rows_in_array;
       } else {
         /* It doesn't fit in memory, create backing store. */
-        bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
-        jpeg_open_backing_store(cinfo, & bptr->b_s_info,
-                                (long) bptr->rows_in_array *
-                                (long) bptr->blocksperrow *
-                                (long) sizeof(JBLOCK));
+        bptr->rows_in_mem = (JDIMENSION)(max_minheights * bptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &bptr->b_s_info,
+                                (long)bptr->rows_in_array *
+                                (long)bptr->blocksperrow *
+                                (long)sizeof(JBLOCK));
         bptr->b_s_open = TRUE;
       }
       bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
@@ -752,32 +750,32 @@
 
 
 LOCAL(void)
-do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
+do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
 /* Do backing store read or write of a virtual sample array */
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE);
+  bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
-  for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+  for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
     /* One chunk, but check for short chunk at end of buffer */
-    rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+    rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
     /* Transfer no more than is currently defined */
-    thisrow = (long) ptr->cur_start_row + i;
-    rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+    thisrow = (long)ptr->cur_start_row + i;
+    rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
-    rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+    rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
     if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
-      (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-                                            (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                            (void *)ptr->mem_buffer[i],
                                             file_offset, byte_count);
     else
-      (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-                                           (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                           (void *)ptr->mem_buffer[i],
                                            file_offset, byte_count);
     file_offset += byte_count;
   }
@@ -785,32 +783,32 @@
 
 
 LOCAL(void)
-do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
+do_barray_io(j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
 /* Do backing store read or write of a virtual coefficient-block array */
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK);
+  bytesperrow = (long)ptr->blocksperrow * sizeof(JBLOCK);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
-  for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+  for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
     /* One chunk, but check for short chunk at end of buffer */
-    rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+    rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
     /* Transfer no more than is currently defined */
-    thisrow = (long) ptr->cur_start_row + i;
-    rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+    thisrow = (long)ptr->cur_start_row + i;
+    rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
-    rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+    rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
     if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
-      (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-                                            (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                            (void *)ptr->mem_buffer[i],
                                             file_offset, byte_count);
     else
-      (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-                                           (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                           (void *)ptr->mem_buffer[i],
                                            file_offset, byte_count);
     file_offset += byte_count;
   }
@@ -818,9 +816,8 @@
 
 
 METHODDEF(JSAMPARRAY)
-access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
-                    JDIMENSION start_row, JDIMENSION num_rows,
-                    boolean writable)
+access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+                   JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
 /* Access the part of a virtual sample array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -835,8 +832,8 @@
 
   /* Make the desired part of the virtual array accessible */
   if (start_row < ptr->cur_start_row ||
-      end_row > ptr->cur_start_row+ptr->rows_in_mem) {
-    if (! ptr->b_s_open)
+      end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+    if (!ptr->b_s_open)
       ERREXIT(cinfo, JERR_VIRTUAL_BUG);
     /* Flush old buffer contents if necessary */
     if (ptr->dirty) {
@@ -856,10 +853,10 @@
       /* use long arithmetic here to avoid overflow & unsigned problems */
       long ltemp;
 
-      ltemp = (long) end_row - (long) ptr->rows_in_mem;
+      ltemp = (long)end_row - (long)ptr->rows_in_mem;
       if (ltemp < 0)
         ltemp = 0;              /* don't fall off front end of file */
-      ptr->cur_start_row = (JDIMENSION) ltemp;
+      ptr->cur_start_row = (JDIMENSION)ltemp;
     }
     /* Read in the selected part of the array.
      * During the initial write pass, we will do no actual read
@@ -882,15 +879,15 @@
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE);
+      size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
         undef_row++;
       }
     } else {
-      if (! writable)           /* reader looking at undefined data */
+      if (!writable)            /* reader looking at undefined data */
         ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
@@ -903,9 +900,8 @@
 
 
 METHODDEF(JBLOCKARRAY)
-access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
-                    JDIMENSION start_row, JDIMENSION num_rows,
-                    boolean writable)
+access_virt_barray(j_common_ptr cinfo, jvirt_barray_ptr ptr,
+                   JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
 /* Access the part of a virtual block array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -920,8 +916,8 @@
 
   /* Make the desired part of the virtual array accessible */
   if (start_row < ptr->cur_start_row ||
-      end_row > ptr->cur_start_row+ptr->rows_in_mem) {
-    if (! ptr->b_s_open)
+      end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+    if (!ptr->b_s_open)
       ERREXIT(cinfo, JERR_VIRTUAL_BUG);
     /* Flush old buffer contents if necessary */
     if (ptr->dirty) {
@@ -941,10 +937,10 @@
       /* use long arithmetic here to avoid overflow & unsigned problems */
       long ltemp;
 
-      ltemp = (long) end_row - (long) ptr->rows_in_mem;
+      ltemp = (long)end_row - (long)ptr->rows_in_mem;
       if (ltemp < 0)
         ltemp = 0;              /* don't fall off front end of file */
-      ptr->cur_start_row = (JDIMENSION) ltemp;
+      ptr->cur_start_row = (JDIMENSION)ltemp;
     }
     /* Read in the selected part of the array.
      * During the initial write pass, we will do no actual read
@@ -967,15 +963,15 @@
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK);
+      size_t bytesperrow = (size_t)ptr->blocksperrow * sizeof(JBLOCK);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
         undef_row++;
       }
     } else {
-      if (! writable)           /* reader looking at undefined data */
+      if (!writable)            /* reader looking at undefined data */
         ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
@@ -992,9 +988,9 @@
  */
 
 METHODDEF(void)
-free_pool (j_common_ptr cinfo, int pool_id)
+free_pool(j_common_ptr cinfo, int pool_id)
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr shdr_ptr;
   large_pool_ptr lhdr_ptr;
   size_t space_freed;
@@ -1015,14 +1011,14 @@
     for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
       if (sptr->b_s_open) {     /* there may be no backing store */
         sptr->b_s_open = FALSE; /* prevent recursive close if error */
-        (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
+        (*sptr->b_s_info.close_backing_store) (cinfo, &sptr->b_s_info);
       }
     }
     mem->virt_sarray_list = NULL;
     for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
       if (bptr->b_s_open) {     /* there may be no backing store */
         bptr->b_s_open = FALSE; /* prevent recursive close if error */
-        (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
+        (*bptr->b_s_info.close_backing_store) (cinfo, &bptr->b_s_info);
       }
     }
     mem->virt_barray_list = NULL;
@@ -1037,7 +1033,7 @@
     space_freed = lhdr_ptr->bytes_used +
                   lhdr_ptr->bytes_left +
                   sizeof(large_pool_hdr);
-    jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed);
+    jpeg_free_large(cinfo, (void *)lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     lhdr_ptr = next_lhdr_ptr;
   }
@@ -1048,10 +1044,9 @@
 
   while (shdr_ptr != NULL) {
     small_pool_ptr next_shdr_ptr = shdr_ptr->next;
-    space_freed = shdr_ptr->bytes_used +
-                  shdr_ptr->bytes_left +
+    space_freed = shdr_ptr->bytes_used + shdr_ptr->bytes_left +
                   sizeof(small_pool_hdr);
-    jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
+    jpeg_free_small(cinfo, (void *)shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     shdr_ptr = next_shdr_ptr;
   }
@@ -1064,7 +1059,7 @@
  */
 
 METHODDEF(void)
-self_destruct (j_common_ptr cinfo)
+self_destruct(j_common_ptr cinfo)
 {
   int pool;
 
@@ -1072,12 +1067,12 @@
    * Releasing pools in reverse order might help avoid fragmentation
    * with some (brain-damaged) malloc libraries.
    */
-  for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
     free_pool(cinfo, pool);
   }
 
   /* Release the memory manager control block too. */
-  jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr));
+  jpeg_free_small(cinfo, (void *)cinfo->mem, sizeof(my_memory_mgr));
   cinfo->mem = NULL;            /* ensures I will be called only once */
 
   jpeg_mem_term(cinfo);         /* system-dependent cleanup */
@@ -1090,7 +1085,7 @@
  */
 
 GLOBAL(void)
-jinit_memory_mgr (j_common_ptr cinfo)
+jinit_memory_mgr(j_common_ptr cinfo)
 {
   my_mem_ptr mem;
   long max_to_use;
@@ -1106,22 +1101,22 @@
    * in common if and only if X is a power of 2, ie has only one one-bit.
    * Some compilers may give an "unreachable code" warning here; ignore it.
    */
-  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE - 1)) != 0)
     ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
   /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
    * a multiple of ALIGN_SIZE.
    * Again, an "unreachable code" warning may be ignored here.
    * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
    */
-  test_mac = (size_t) MAX_ALLOC_CHUNK;
-  if ((long) test_mac != MAX_ALLOC_CHUNK ||
+  test_mac = (size_t)MAX_ALLOC_CHUNK;
+  if ((long)test_mac != MAX_ALLOC_CHUNK ||
       (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
     ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
 
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
 
   /* Attempt to allocate memory manager's control block */
-  mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr));
+  mem = (my_mem_ptr)jpeg_get_small(cinfo, sizeof(my_memory_mgr));
 
   if (mem == NULL) {
     jpeg_mem_term(cinfo);       /* system-dependent cleanup */
@@ -1147,7 +1142,7 @@
   /* Initialize working state */
   mem->pub.max_memory_to_use = max_to_use;
 
-  for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
     mem->small_list[pool] = NULL;
     mem->large_list[pool] = NULL;
   }
@@ -1157,7 +1152,7 @@
   mem->total_space_allocated = sizeof(my_memory_mgr);
 
   /* Declare ourselves open for business */
-  cinfo->mem = & mem->pub;
+  cinfo->mem = &mem->pub;
 
   /* Check for an environment variable JPEGMEM; if found, override the
    * default max_memory setting from jpeg_mem_init.  Note that the
@@ -1166,7 +1161,8 @@
    * this feature.
    */
 #ifndef NO_GETENV
-  { char *memenv;
+  {
+    char *memenv;
 
     if ((memenv = getenv("JPEGMEM")) != NULL) {
       char ch = 'x';
diff --git a/jmemnobs.c b/jmemnobs.c
index ac12afa..6191aaa 100644
--- a/jmemnobs.c
+++ b/jmemnobs.c
@@ -23,8 +23,8 @@
 #include "jmemsys.h"            /* import the system-dependent declarations */
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
+extern void *malloc(size_t size);
+extern void free(void *ptr);
 #endif
 
 
@@ -34,13 +34,13 @@
  */
 
 GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject)
 {
-  return (void *) malloc(sizeofobject);
+  return (void *)malloc(sizeofobject);
 }
 
 GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_small(j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -51,13 +51,13 @@
  */
 
 GLOBAL(void *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject)
 {
-  return (void *) malloc(sizeofobject);
+  return (void *)malloc(sizeofobject);
 }
 
 GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_large(j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -68,8 +68,8 @@
  */
 
 GLOBAL(size_t)
-jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
-                    size_t max_bytes_needed, size_t already_allocated)
+jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+                   size_t max_bytes_needed, size_t already_allocated)
 {
   if (cinfo->mem->max_memory_to_use) {
     if (cinfo->mem->max_memory_to_use > already_allocated)
@@ -90,8 +90,8 @@
  */
 
 GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-                         long total_bytes_needed)
+jpeg_open_backing_store(j_common_ptr cinfo, backing_store_ptr info,
+                        long total_bytes_needed)
 {
   ERREXIT(cinfo, JERR_NO_BACKING_STORE);
 }
@@ -103,13 +103,13 @@
  */
 
 GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
+jpeg_mem_init(j_common_ptr cinfo)
 {
   return 0;                     /* just set max_memory_to_use to 0 */
 }
 
 GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
+jpeg_mem_term(j_common_ptr cinfo)
 {
   /* no work */
 }
diff --git a/jmemsys.h b/jmemsys.h
index f7dfe87..9229550 100644
--- a/jmemsys.h
+++ b/jmemsys.h
@@ -31,9 +31,9 @@
  * size of the object being freed, just in case it's needed.
  */
 
-EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
-                              size_t sizeofobject);
+EXTERN(void *) jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small(j_common_ptr cinfo, void *object,
+                             size_t sizeofobject);
 
 /*
  * These two functions are used to allocate and release large chunks of
@@ -43,9 +43,9 @@
  * large chunks.
  */
 
-EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
-                              size_t sizeofobject);
+EXTERN(void *) jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large(j_common_ptr cinfo, void *object,
+                             size_t sizeofobject);
 
 /*
  * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
@@ -84,9 +84,9 @@
  * Conversely, zero may be returned to always use the minimum amount of memory.
  */
 
-EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
-                                   size_t max_bytes_needed,
-                                   size_t already_allocated);
+EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+                                  size_t max_bytes_needed,
+                                  size_t already_allocated);
 
 
 /*
@@ -157,9 +157,9 @@
  * just take an error exit.)
  */
 
-EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
-                                      backing_store_ptr info,
-                                      long total_bytes_needed);
+EXTERN(void) jpeg_open_backing_store(j_common_ptr cinfo,
+                                     backing_store_ptr info,
+                                     long total_bytes_needed);
 
 
 /*
@@ -174,5 +174,5 @@
  * all opened backing-store objects have been closed.
  */
 
-EXTERN(long) jpeg_mem_init (j_common_ptr cinfo);
-EXTERN(void) jpeg_mem_term (j_common_ptr cinfo);
+EXTERN(long) jpeg_mem_init(j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term(j_common_ptr cinfo);
diff --git a/jmorecfg.h b/jmorecfg.h
index 1d96786..4768e4f 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -49,15 +49,15 @@
 #ifdef HAVE_UNSIGNED_CHAR
 
 typedef unsigned char JSAMPLE;
-#define GETJSAMPLE(value)  ((int) (value))
+#define GETJSAMPLE(value)  ((int)(value))
 
 #else /* not HAVE_UNSIGNED_CHAR */
 
 typedef char JSAMPLE;
 #ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int) (value))
+#define GETJSAMPLE(value)  ((int)(value))
 #else
-#define GETJSAMPLE(value)  ((int) (value) & 0xFF)
+#define GETJSAMPLE(value)  ((int)(value) & 0xFF)
 #endif /* __CHAR_UNSIGNED__ */
 
 #endif /* HAVE_UNSIGNED_CHAR */
@@ -74,7 +74,7 @@
  */
 
 typedef short JSAMPLE;
-#define GETJSAMPLE(value)  ((int) (value))
+#define GETJSAMPLE(value)  ((int)(value))
 
 #define MAXJSAMPLE      4095
 #define CENTERJSAMPLE   2048
@@ -173,9 +173,9 @@
  */
 
 #ifndef XMD_H                   /* X11/xmd.h correctly defines INT32 */
-#ifndef _BASETSD_H_		/* Microsoft defines it in basetsd.h */
-#ifndef _BASETSD_H		/* MinGW is slightly different */
-#ifndef QGLOBAL_H		/* Qt defines it in qglobal.h */
+#ifndef _BASETSD_H_             /* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H              /* MinGW is slightly different */
+#ifndef QGLOBAL_H               /* Qt defines it in qglobal.h */
 typedef long INT32;
 #endif
 #endif
@@ -220,7 +220,7 @@
  * software out there that uses it.
  */
 
-#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#define JMETHOD(type, methodname, arglist)  type (*methodname) arglist
 
 
 /* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
diff --git a/jpegint.h b/jpegint.h
index 9979a91..867ce8d 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -274,9 +274,9 @@
 /* Miscellaneous useful macros */
 
 #undef MAX
-#define MAX(a,b)        ((a) > (b) ? (a) : (b))
+#define MAX(a, b)       ((a) > (b) ? (a) : (b))
 #undef MIN
-#define MIN(a,b)        ((a) < (b) ? (a) : (b))
+#define MIN(a, b)       ((a) < (b) ? (a) : (b))
 
 
 /* We assume that right shift corresponds to signed division by 2 with
@@ -291,64 +291,64 @@
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define SHIFT_TEMPS     JLONG shift_temp;
-#define RIGHT_SHIFT(x,shft)  \
-        ((shift_temp = (x)) < 0 ? \
-         (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
-         (shift_temp >> (shft)))
+#define RIGHT_SHIFT(x, shft) \
+  ((shift_temp = (x)) < 0 ? \
+   (shift_temp >> (shft)) | ((~((JLONG)0)) << (32 - (shft))) : \
+   (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft)     ((x) >> (shft))
+#define RIGHT_SHIFT(x, shft)    ((x) >> (shft))
 #endif
 
 
 /* Compression module initialization routines */
-EXTERN(void) jinit_compress_master (j_compress_ptr cinfo);
-EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo,
-                                     boolean transcode_only);
-EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_color_converter (j_compress_ptr cinfo);
-EXTERN(void) jinit_downsampler (j_compress_ptr cinfo);
-EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo);
-EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo);
+EXTERN(void) jinit_compress_master(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
+                                    boolean transcode_only);
+EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
 /* Decompression module initialization routines */
-EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo);
-EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo);
-EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo);
-EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo);
-EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo);
-EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo);
-EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo);
+EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
 /* Memory manager initialization */
-EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo);
+EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
 
 /* Utility routines in jutils.c */
-EXTERN(long) jdiv_round_up (long a, long b);
-EXTERN(long) jround_up (long a, long b);
-EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-                                JSAMPARRAY output_array, int dest_row,
-                                int num_rows, JDIMENSION num_cols);
-EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
-                              JDIMENSION num_blocks);
-EXTERN(void) jzero_far (void *target, size_t bytestozero);
+EXTERN(long) jdiv_round_up(long a, long b);
+EXTERN(long) jround_up(long a, long b);
+EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+                               JSAMPARRAY output_array, int dest_row,
+                               int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+                             JDIMENSION num_blocks);
+EXTERN(void) jzero_far(void *target, size_t bytestozero);
 /* Constant tables in jutils.c */
 #if 0                           /* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
diff --git a/jpeglib.h b/jpeglib.h
index 6c63f58..8ce1572 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -268,11 +268,11 @@
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
-  struct jpeg_error_mgr *err;   /* Error handler module */\
-  struct jpeg_memory_mgr *mem;  /* Memory manager module */\
-  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
-  void *client_data;            /* Available for use by application */\
-  boolean is_decompressor;      /* So common code can tell which is which */\
+  struct jpeg_error_mgr *err;   /* Error handler module */ \
+  struct jpeg_memory_mgr *mem;  /* Memory manager module */ \
+  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+  void *client_data;            /* Available for use by application */ \
+  boolean is_decompressor;      /* So common code can tell which is which */ \
   int global_state              /* For checking call sequence validity */
 
 /* Routines that are to be used by both halves of the library are declared
@@ -888,7 +888,7 @@
 
 
 /* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
 
 /* Initialization of JPEG compression objects.
  * jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -898,86 +898,91 @@
  * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
  */
 #define jpeg_create_compress(cinfo) \
-    jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
-                        (size_t) sizeof(struct jpeg_compress_struct))
+  jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                      (size_t)sizeof(struct jpeg_compress_struct))
 #define jpeg_create_decompress(cinfo) \
-    jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
-                          (size_t) sizeof(struct jpeg_decompress_struct))
-EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
-                                  size_t structsize);
-EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
-                                    size_t structsize);
+  jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+                                 size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                                   size_t structsize);
 /* Destruction of JPEG compression objects */
-EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
 
 /* Standard data source and destination managers: stdio streams. */
 /* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
-EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
-                            unsigned long *outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
-                           const unsigned char *inbuffer,
-                           unsigned long insize);
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                           unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+                          const unsigned char *inbuffer, unsigned long insize);
 #endif
 
 /* Default parameter setup for compression */
-EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
 /* Compression parameter setup aids */
-EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo,
-                                  J_COLOR_SPACE colorspace);
-EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo);
-EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality,
-                               boolean force_baseline);
-EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-                                      boolean force_baseline);
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+                                 J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                              boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                                     boolean force_baseline);
 #if JPEG_LIB_VERSION >= 70
-EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
-                                   boolean force_baseline);
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+                                  boolean force_baseline);
 #endif
-EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-                                   const unsigned int *basic_table,
-                                   int scale_factor, boolean force_baseline);
-EXTERN(int) jpeg_quality_scaling (int quality);
-EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo);
-EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress);
-EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo);
-EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                                  const unsigned int *basic_table,
+                                  int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
 
 /* Main entry points for compression */
-EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
-                                  boolean write_all_tables);
-EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
-                                         JSAMPARRAY scanlines,
-                                         JDIMENSION num_lines);
-EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+                                 boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+                                        JSAMPARRAY scanlines,
+                                        JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
 
 #if JPEG_LIB_VERSION >= 70
 /* Precalculate JPEG dimensions for current compression parameters. */
-EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo);
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
 #endif
 
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
-                                        JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                       JDIMENSION num_lines);
 
 /* Write a special marker.  See libjpeg.txt concerning safe usage. */
-EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
-                                const JOCTET *dataptr, unsigned int datalen);
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+                               const JOCTET *dataptr, unsigned int datalen);
 /* Same, but piecemeal. */
-EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
-                                  unsigned int datalen);
-EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val);
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                                 unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
 
 /* Alternate compression function: just write an abbreviated table file */
-EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+                                    const JOCTET *icc_data_ptr,
+                                    unsigned int icc_data_len);
+
 
 /* Decompression startup: read start of JPEG datastream to see what's there */
-EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
 /* Return value is one of: */
 #define JPEG_SUSPENDED          0 /* Suspended due to lack of input data */
 #define JPEG_HEADER_OK          1 /* Found valid image datastream */
@@ -989,27 +994,27 @@
  */
 
 /* Main entry points for decompression */
-EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
-EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
-                                        JSAMPARRAY scanlines,
-                                        JDIMENSION max_lines);
-EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
-                                        JDIMENSION num_lines);
-EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
-                                 JDIMENSION *width);
-EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+                                       JSAMPARRAY scanlines,
+                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+                                       JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
 
 /* Replaces jpeg_read_scanlines when reading raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
-                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                      JDIMENSION max_lines);
 
 /* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
-EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
-EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
 /* Return value is one of: */
 /* #define JPEG_SUSPENDED       0    Suspended due to lack of input data */
 #define JPEG_REACHED_SOS        1 /* Reached start of new scan */
@@ -1019,25 +1024,25 @@
 
 /* Precalculate output dimensions for current decompression parameters. */
 #if JPEG_LIB_VERSION >= 80
-EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
 #endif
-EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
 
 /* Control saving of COM and APPn markers into marker_list. */
-EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
-                                unsigned int length_limit);
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                               unsigned int length_limit);
 
 /* Install a special processing method for COM or APPn markers. */
-EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
-                                        int marker_code,
-                                        jpeg_marker_parser_method routine);
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+                                       int marker_code,
+                                       jpeg_marker_parser_method routine);
 
 /* Read or write raw DCT coefficients --- useful for lossless transcoding. */
-EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
-                                      jvirt_barray_ptr *coef_arrays);
-EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
-                                            j_compress_ptr dstinfo);
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+                                     jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                           j_compress_ptr dstinfo);
 
 /* If you choose to abort compression or decompression before completing
  * jpeg_finish_(de)compress, then you need to clean up to release memory,
@@ -1045,17 +1050,22 @@
  * if you're done with the JPEG object, but if you want to clean it up and
  * reuse it, call this:
  */
-EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
 
 /* Generic versions of jpeg_abort and jpeg_destroy that work on either
  * flavor of JPEG object.  These may be more convenient in some places.
  */
-EXTERN(void) jpeg_abort (j_common_ptr cinfo);
-EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
 
 /* Default restart-marker-resync procedure for use by data source modules */
-EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+                                      JOCTET **icc_data_ptr,
+                                      unsigned int *icc_data_len);
 
 
 /* These marker codes are exported since applications and data source modules
diff --git a/jpegtran.1 b/jpegtran.1
index 631455b..2efb264 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -217,6 +217,11 @@
 .PP
 Additional switches recognized by jpegtran are:
 .TP
+.BI \-icc " file"
+Embed ICC color management profile contained in the specified file.  Note that
+this will cause \fBjpegtran\fR to ignore any APP2 markers in the input file,
+even if \fB\-copy all\fR is specified.
+.TP
 .BI \-maxmemory " N"
 Set limit for amount of memory to use in processing large images.  Value is
 in thousands of bytes, or millions of bytes if "M" is attached to the
diff --git a/jpegtran.c b/jpegtran.c
index 6f8fd5b..058e844 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -40,13 +40,14 @@
 
 
 static const char *progname;    /* program name for error messages */
+static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
 
 LOCAL(void)
-usage (void)
+usage(void)
 /* complain about bad command line */
 {
   fprintf(stderr, "usage: %s [switches] ", progname);
@@ -83,6 +84,7 @@
 #ifdef C_ARITH_CODING_SUPPORTED
   fprintf(stderr, "  -arithmetic    Use arithmetic coding\n");
 #endif
+  fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
@@ -97,7 +99,7 @@
 
 
 LOCAL(void)
-select_transform (JXFORM_CODE transform)
+select_transform(JXFORM_CODE transform)
 /* Silly little routine to detect multiple transform options,
  * which we can't handle.
  */
@@ -120,8 +122,8 @@
 
 
 LOCAL(int)
-parse_switches (j_compress_ptr cinfo, int argc, char **argv,
-                int last_file_arg_seen, boolean for_real)
+parse_switches(j_compress_ptr cinfo, int argc, char **argv,
+               int last_file_arg_seen, boolean for_real)
 /* Parse optional switches.
  * Returns argv[] index of first file-name argument (== argc if none).
  * Any file names with indexes <= last_file_arg_seen are ignored;
@@ -138,6 +140,7 @@
 
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
+  icc_filename = NULL;
   outfilename = NULL;
   copyoption = JCOPYOPT_DEFAULT;
   transformoption.transform = JXFORM_NONE;
@@ -190,7 +193,7 @@
 #if TRANSFORMS_SUPPORTED
       if (++argn >= argc)       /* advance to next argument */
         usage();
-      if (! jtransform_parse_crop_spec(&transformoption, argv[argn])) {
+      if (!jtransform_parse_crop_spec(&transformoption, argv[argn])) {
         fprintf(stderr, "%s: bogus -crop argument '%s'\n",
                 progname, argv[argn]);
         exit(EXIT_FAILURE);
@@ -204,7 +207,7 @@
       /* On first -d, print version identification */
       static boolean printed_version = FALSE;
 
-      if (! printed_version) {
+      if (!printed_version) {
         fprintf(stderr, "%s version %s (build %s)\n",
                 PACKAGE_NAME, VERSION, BUILD);
         fprintf(stderr, "%s\n\n", JCOPYRIGHT);
@@ -230,7 +233,8 @@
       else
         usage();
 
-    } else if (keymatch(arg, "grayscale", 1) || keymatch(arg, "greyscale",1)) {
+    } else if (keymatch(arg, "grayscale", 1) ||
+               keymatch(arg, "greyscale", 1)) {
       /* Force to grayscale. */
 #if TRANSFORMS_SUPPORTED
       transformoption.force_grayscale = TRUE;
@@ -238,6 +242,12 @@
       select_transform(JXFORM_NONE);    /* force an error */
 #endif
 
+    } else if (keymatch(arg, "icc", 1)) {
+      /* Set ICC filename. */
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      icc_filename = argv[argn];
+
     } else if (keymatch(arg, "maxmemory", 3)) {
       /* Maximum memory in Kb (or Mb with 'm'). */
       long lval;
@@ -295,10 +305,10 @@
       if (lval < 0 || lval > 65535L)
         usage();
       if (ch == 'b' || ch == 'B') {
-        cinfo->restart_interval = (unsigned int) lval;
+        cinfo->restart_interval = (unsigned int)lval;
         cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
       } else {
-        cinfo->restart_in_rows = (int) lval;
+        cinfo->restart_in_rows = (int)lval;
         /* restart_interval will be computed during startup */
       }
 
@@ -356,7 +366,7 @@
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
     if (scansarg != NULL)       /* process -scans if it was present */
-      if (! read_scan_script(cinfo, scansarg))
+      if (!read_scan_script(cinfo, scansarg))
         usage();
 #endif
   }
@@ -370,7 +380,7 @@
  */
 
 int
-main (int argc, char **argv)
+main(int argc, char **argv)
 {
   struct jpeg_decompress_struct srcinfo;
   struct jpeg_compress_struct dstinfo;
@@ -385,6 +395,9 @@
    * single file pointer for sequential input and output operation.
    */
   FILE *fp;
+  FILE *icc_file;
+  JOCTET *icc_profile = NULL;
+  long icc_len = 0;
 
   /* On Mac, fetch a command line. */
 #ifdef USE_CCOMMAND
@@ -417,14 +430,14 @@
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
-    if (file_index != argc-2) {
+    if (file_index != argc - 2) {
       fprintf(stderr, "%s: must name one input and one output file\n",
               progname);
       usage();
     }
-    outfilename = argv[file_index+1];
+    outfilename = argv[file_index + 1];
   } else {
-    if (file_index != argc-1) {
+    if (file_index != argc - 1) {
       fprintf(stderr, "%s: must name one input and one output file\n",
               progname);
       usage();
@@ -432,7 +445,7 @@
   }
 #else
   /* Unix style: expect zero or one file name */
-  if (file_index < argc-1) {
+  if (file_index < argc - 1) {
     fprintf(stderr, "%s: only one input file\n", progname);
     usage();
   }
@@ -441,7 +454,8 @@
   /* Open the input file. */
   if (file_index < argc) {
     if ((fp = fopen(argv[file_index], READ_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s for reading\n", progname, argv[file_index]);
+      fprintf(stderr, "%s: can't open %s for reading\n", progname,
+              argv[file_index]);
       exit(EXIT_FAILURE);
     }
   } else {
@@ -449,8 +463,37 @@
     fp = read_stdin();
   }
 
+  if (icc_filename != NULL) {
+    if ((icc_file = fopen(icc_filename, READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s\n", progname, icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if (fseek(icc_file, 0, SEEK_END) < 0 ||
+        (icc_len = ftell(icc_file)) < 1 ||
+        fseek(icc_file, 0, SEEK_SET) < 0) {
+      fprintf(stderr, "%s: can't determine size of %s\n", progname,
+              icc_filename);
+      exit(EXIT_FAILURE);
+    }
+    if ((icc_profile = (JOCTET *)malloc(icc_len)) == NULL) {
+      fprintf(stderr, "%s: can't allocate memory for ICC profile\n", progname);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    if (fread(icc_profile, icc_len, 1, icc_file) < 1) {
+      fprintf(stderr, "%s: can't read ICC profile from %s\n", progname,
+              icc_filename);
+      free(icc_profile);
+      fclose(icc_file);
+      exit(EXIT_FAILURE);
+    }
+    fclose(icc_file);
+    if (copyoption == JCOPYOPT_ALL)
+      copyoption = JCOPYOPT_ALL_EXCEPT_ICC;
+  }
+
 #ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr) &dstinfo, &progress);
+  start_progress_monitor((j_common_ptr)&dstinfo, &progress);
 #endif
 
   /* Specify data source for decompression */
@@ -460,7 +503,7 @@
   jcopy_markers_setup(&srcinfo, copyoption);
 
   /* Read file header */
-  (void) jpeg_read_header(&srcinfo, TRUE);
+  (void)jpeg_read_header(&srcinfo, TRUE);
 
   /* Any space needed by a transform option must be requested before
    * jpeg_read_coefficients so that memory allocation will be done right.
@@ -494,7 +537,7 @@
   /* Close input file, if we opened it.
    * Note: we assume that jpeg_read_coefficients consumed all input
    * until JPEG_REACHED_EOI, and that jpeg_finish_decompress will
-   * only consume more while (! cinfo->inputctl->eoi_reached).
+   * only consume more while (!cinfo->inputctl->eoi_reached).
    * We cannot call jpeg_finish_decompress here since we still need the
    * virtual arrays allocated from the source object for processing.
    */
@@ -504,7 +547,8 @@
   /* Open the output file. */
   if (outfilename != NULL) {
     if ((fp = fopen(outfilename, WRITE_BINARY)) == NULL) {
-      fprintf(stderr, "%s: can't open %s for writing\n", progname, outfilename);
+      fprintf(stderr, "%s: can't open %s for writing\n", progname,
+              outfilename);
       exit(EXIT_FAILURE);
     }
   } else {
@@ -524,17 +568,19 @@
   /* Copy to the output file any extra markers that we want to preserve */
   jcopy_markers_execute(&srcinfo, &dstinfo, copyoption);
 
+  if (icc_profile != NULL)
+    jpeg_write_icc_profile(&dstinfo, icc_profile, (unsigned int)icc_len);
+
   /* Execute image transformation, if any */
 #if TRANSFORMS_SUPPORTED
-  jtransform_execute_transformation(&srcinfo, &dstinfo,
-                                    src_coef_arrays,
+  jtransform_execute_transformation(&srcinfo, &dstinfo, src_coef_arrays,
                                     &transformoption);
 #endif
 
   /* Finish compression and release memory */
   jpeg_finish_compress(&dstinfo);
   jpeg_destroy_compress(&dstinfo);
-  (void) jpeg_finish_decompress(&srcinfo);
+  (void)jpeg_finish_decompress(&srcinfo);
   jpeg_destroy_decompress(&srcinfo);
 
   /* Close output file, if we opened it */
@@ -542,10 +588,14 @@
     fclose(fp);
 
 #ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr) &dstinfo);
+  end_progress_monitor((j_common_ptr)&dstinfo);
 #endif
 
+  if (icc_profile != NULL)
+    free(icc_profile);
+
   /* All done. */
-  exit(jsrcerr.num_warnings + jdsterr.num_warnings ?EXIT_WARNING:EXIT_SUCCESS);
+  exit(jsrcerr.num_warnings + jdsterr.num_warnings ?
+       EXIT_WARNING : EXIT_SUCCESS);
   return 0;                     /* suppress no-return-value warnings */
 }
diff --git a/jquant1.c b/jquant1.c
index e781481..7502035 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -73,8 +73,8 @@
 
 #define ODITHER_SIZE  16        /* dimension of dither matrix */
 /* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
-#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE)       /* # cells in matrix */
-#define ODITHER_MASK  (ODITHER_SIZE-1) /* mask for wrapping around counters */
+#define ODITHER_CELLS (ODITHER_SIZE * ODITHER_SIZE)     /* # cells in matrix */
+#define ODITHER_MASK (ODITHER_SIZE - 1) /* mask for wrapping around counters */
 
 typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE];
 typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE];
@@ -183,7 +183,7 @@
 
 
 LOCAL(int)
-select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
+select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
 /* Determine allocation of desired colors to components, */
 /* and fill in Ncolors[] array to indicate choice. */
 /* Return value is total number of colors (product of Ncolors[] values). */
@@ -206,12 +206,12 @@
     temp = iroot;               /* set temp = iroot ** nc */
     for (i = 1; i < nc; i++)
       temp *= iroot;
-  } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */
+  } while (temp <= (long)max_colors); /* repeat till iroot exceeds root */
   iroot--;                      /* now iroot = floor(root) */
 
   /* Must have at least 2 color values per component */
   if (iroot < 2)
-    ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int) temp);
+    ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int)temp);
 
   /* Initialize to iroot color values for each component */
   total_colors = 1;
@@ -231,11 +231,11 @@
       j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i);
       /* calculate new total_colors if Ncolors[j] is incremented */
       temp = total_colors / Ncolors[j];
-      temp *= Ncolors[j]+1;     /* done in long arith to avoid oflo */
-      if (temp > (long) max_colors)
+      temp *= Ncolors[j] + 1;   /* done in long arith to avoid oflo */
+      if (temp > (long)max_colors)
         break;                  /* won't fit, done with this pass */
       Ncolors[j]++;             /* OK, apply the increment */
-      total_colors = (int) temp;
+      total_colors = (int)temp;
       changed = TRUE;
     }
   } while (changed);
@@ -245,7 +245,7 @@
 
 
 LOCAL(int)
-output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return j'th output value, where j will range from 0 to maxj */
 /* The output values must fall in 0..MAXJSAMPLE in increasing order */
 {
@@ -254,17 +254,17 @@
    * (Forcing the upper and lower values to the limits ensures that
    * dithering can't produce a color outside the selected gamut.)
    */
-  return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
+  return (int)(((JLONG)j * MAXJSAMPLE + maxj / 2) / maxj);
 }
 
 
 LOCAL(int)
-largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return largest input value that should map to j'th output value */
 /* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
 {
   /* Breakpoints are halfway between values returned by output_value */
-  return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+  return (int)(((JLONG)(2 * j + 1) * MAXJSAMPLE + maxj) / (2 * maxj));
 }
 
 
@@ -273,21 +273,21 @@
  */
 
 LOCAL(void)
-create_colormap (j_decompress_ptr cinfo)
+create_colormap(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPARRAY colormap;          /* Created colormap */
   int total_colors;             /* Number of distinct output colors */
-  int i,j,k, nci, blksize, blkdist, ptr, val;
+  int i, j, k, nci, blksize, blkdist, ptr, val;
 
   /* Select number of colors for each component */
   total_colors = select_ncolors(cinfo, cquantize->Ncolors);
 
   /* Report selected color counts */
   if (cinfo->out_color_components == 3)
-    TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS,
-             total_colors, cquantize->Ncolors[0],
-             cquantize->Ncolors[1], cquantize->Ncolors[2]);
+    TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, total_colors,
+             cquantize->Ncolors[0], cquantize->Ncolors[1],
+             cquantize->Ncolors[2]);
   else
     TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors);
 
@@ -296,8 +296,8 @@
   /* i.e. rightmost (highest-indexed) color changes most rapidly. */
 
   colormap = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)total_colors, (JDIMENSION)cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
   /* blkdist is distance between groups of identical entries for a component */
@@ -309,12 +309,12 @@
     blksize = blkdist / nci;
     for (j = 0; j < nci; j++) {
       /* Compute j'th output value (out of nci) for component */
-      val = output_value(cinfo, i, j, nci-1);
+      val = output_value(cinfo, i, j, nci - 1);
       /* Fill in all colormap entries that have this value of this component */
       for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
         /* fill in blksize entries beginning at ptr */
         for (k = 0; k < blksize; k++)
-          colormap[i][ptr+k] = (JSAMPLE) val;
+          colormap[i][ptr + k] = (JSAMPLE)val;
       }
     }
     blkdist = blksize;          /* blksize of this color is blkdist of next */
@@ -333,11 +333,11 @@
  */
 
 LOCAL(void)
-create_colorindex (j_decompress_ptr cinfo)
+create_colorindex(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPROW indexptr;
-  int i,j,k, nci, blksize, val, pad;
+  int i, j, k, nci, blksize, val, pad;
 
   /* For ordered dither, we pad the color index tables by MAXJSAMPLE in
    * each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE).
@@ -345,7 +345,7 @@
    * flag whether it was done in case user changes dithering mode.
    */
   if (cinfo->dither_mode == JDITHER_ORDERED) {
-    pad = MAXJSAMPLE*2;
+    pad = MAXJSAMPLE * 2;
     cquantize->is_padded = TRUE;
   } else {
     pad = 0;
@@ -353,9 +353,9 @@
   }
 
   cquantize->colorindex = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) (MAXJSAMPLE+1 + pad),
-     (JDIMENSION) cinfo->out_color_components);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(MAXJSAMPLE + 1 + pad),
+     (JDIMENSION)cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
   blksize = cquantize->sv_actual;
@@ -373,18 +373,18 @@
     /* and k = largest j that maps to current val */
     indexptr = cquantize->colorindex[i];
     val = 0;
-    k = largest_input_value(cinfo, i, 0, nci-1);
+    k = largest_input_value(cinfo, i, 0, nci - 1);
     for (j = 0; j <= MAXJSAMPLE; j++) {
       while (j > k)             /* advance val if past boundary */
-        k = largest_input_value(cinfo, i, ++val, nci-1);
+        k = largest_input_value(cinfo, i, ++val, nci - 1);
       /* premultiply so that no multiplication needed in main processing */
-      indexptr[j] = (JSAMPLE) (val * blksize);
+      indexptr[j] = (JSAMPLE)(val * blksize);
     }
     /* Pad at both ends if necessary */
     if (pad)
       for (j = 1; j <= MAXJSAMPLE; j++) {
         indexptr[-j] = indexptr[0];
-        indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
+        indexptr[MAXJSAMPLE + j] = indexptr[MAXJSAMPLE];
       }
   }
 }
@@ -396,29 +396,29 @@
  */
 
 LOCAL(ODITHER_MATRIX_PTR)
-make_odither_array (j_decompress_ptr cinfo, int ncolors)
+make_odither_array(j_decompress_ptr cinfo, int ncolors)
 {
   ODITHER_MATRIX_PTR odither;
-  int j,k;
-  JLONG num,den;
+  int j, k;
+  JLONG num, den;
 
   odither = (ODITHER_MATRIX_PTR)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(ODITHER_MATRIX));
   /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
    * Hence the dither value for the matrix cell with fill order f
    * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
    * On 16-bit-int machine, be careful to avoid overflow.
    */
-  den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
+  den = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
   for (j = 0; j < ODITHER_SIZE; j++) {
     for (k = 0; k < ODITHER_SIZE; k++) {
-      num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
-            * MAXJSAMPLE;
+      num = ((JLONG)(ODITHER_CELLS - 1 -
+                     2 * ((int)base_dither_matrix[j][k]))) * MAXJSAMPLE;
       /* Ensure round towards zero despite C's lack of consistency
        * about rounding negative values in integer division...
        */
-      odither[j][k] = (int) (num<0 ? -((-num)/den) : num/den);
+      odither[j][k] = (int)(num < 0 ? -((-num) / den) : num / den);
     }
   }
   return odither;
@@ -432,9 +432,9 @@
  */
 
 LOCAL(void)
-create_odither_tables (j_decompress_ptr cinfo)
+create_odither_tables(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   ODITHER_MATRIX_PTR odither;
   int i, j, nci;
 
@@ -459,11 +459,11 @@
  */
 
 METHODDEF(void)
-color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPARRAY output_buf, int num_rows)
+color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+               JSAMPARRAY output_buf, int num_rows)
 /* General case, no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPARRAY colorindex = cquantize->colorindex;
   register int pixcode, ci;
   register JSAMPROW ptrin, ptrout;
@@ -480,18 +480,18 @@
       for (ci = 0; ci < nc; ci++) {
         pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
       }
-      *ptrout++ = (JSAMPLE) pixcode;
+      *ptrout++ = (JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                 JSAMPARRAY output_buf, int num_rows)
+color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
   register JSAMPROW ptrin, ptrout;
   JSAMPROW colorindex0 = cquantize->colorindex[0];
@@ -508,18 +508,18 @@
       pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
       pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
       pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
-      *ptrout++ = (JSAMPLE) pixcode;
+      *ptrout++ = (JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                     JSAMPARRAY output_buf, int num_rows)
+quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                    JSAMPARRAY output_buf, int num_rows)
 /* General case, with ordered dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
   JSAMPROW colorindex_ci;
@@ -533,7 +533,7 @@
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -550,7 +550,8 @@
          * inputs.  The maximum dither is +- MAXJSAMPLE; this sets the
          * required amount of padding.
          */
-        *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
+        *output_ptr +=
+          colorindex_ci[GETJSAMPLE(*input_ptr) + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -564,11 +565,11 @@
 
 
 METHODDEF(void)
-quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                      JSAMPARRAY output_buf, int num_rows)
+quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, with ordered dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
@@ -593,13 +594,13 @@
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) +
-                                        dither0[col_index]]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) +
-                                        dither1[col_index]]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) +
-                                        dither2[col_index]]);
-      *output_ptr++ = (JSAMPLE) pixcode;
+      pixcode  =
+        GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + dither0[col_index]]);
+      pixcode +=
+        GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + dither1[col_index]]);
+      pixcode +=
+        GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + dither2[col_index]]);
+      *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
     row_index = (row_index + 1) & ODITHER_MASK;
@@ -609,11 +610,11 @@
 
 
 METHODDEF(void)
-quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                    JSAMPARRAY output_buf, int num_rows)
+quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                   JSAMPARRAY output_buf, int num_rows)
 /* General case, with Floyd-Steinberg dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register LOCFSERROR cur;      /* current error or pixel value */
   LOCFSERROR belowerr;          /* error for pixel below cur */
   LOCFSERROR bpreverr;          /* error for below/prev col */
@@ -637,17 +638,17 @@
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
       if (cquantize->on_odd_row) {
         /* work right to left in this row */
-        input_ptr += (width-1) * nc; /* so point to rightmost pixel */
-        output_ptr += width-1;
+        input_ptr += (width - 1) * nc; /* so point to rightmost pixel */
+        output_ptr += width - 1;
         dir = -1;
         dirnc = -nc;
-        errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
+        errorptr = cquantize->fserrors[ci] + (width + 1); /* => entry after last column */
       } else {
         /* work left to right in this row */
         dir = 1;
@@ -679,7 +680,7 @@
         cur = GETJSAMPLE(range_limit[cur]);
         /* Select output value, accumulate into output code for this pixel */
         pixcode = GETJSAMPLE(colorindex_ci[cur]);
-        *output_ptr += (JSAMPLE) pixcode;
+        *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
@@ -691,7 +692,7 @@
         bnexterr = cur;
         delta = cur * 2;
         cur += delta;           /* form error * 3 */
-        errorptr[0] = (FSERROR) (bpreverr + cur);
+        errorptr[0] = (FSERROR)(bpreverr + cur);
         cur += delta;           /* form error * 5 */
         bpreverr = belowerr + cur;
         belowerr = bnexterr;
@@ -708,7 +709,7 @@
        * final fserrors[] entry.  Note we need not unload belowerr because
        * it is for the dummy column before or after the actual array.
        */
-      errorptr[0] = (FSERROR) bpreverr; /* unload prev err into array */
+      errorptr[0] = (FSERROR)bpreverr; /* unload prev err into array */
     }
     cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE);
   }
@@ -720,16 +721,16 @@
  */
 
 LOCAL(void)
-alloc_fs_workspace (j_decompress_ptr cinfo)
+alloc_fs_workspace(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   size_t arraysize;
   int i;
 
-  arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+  arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
   for (i = 0; i < cinfo->out_color_components; i++) {
     cquantize->fserrors[i] = (FSERRPTR)
-      (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
   }
 }
 
@@ -739,9 +740,9 @@
  */
 
 METHODDEF(void)
-start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   size_t arraysize;
   int i;
 
@@ -767,7 +768,7 @@
      * we must recreate the color index table with padding.
      * This will cost extra space, but probably isn't very likely.
      */
-    if (! cquantize->is_padded)
+    if (!cquantize->is_padded)
       create_colorindex(cinfo);
     /* Create ordered-dither tables if we didn't already. */
     if (cquantize->odither[0] == NULL)
@@ -780,9 +781,9 @@
     if (cquantize->fserrors[0] == NULL)
       alloc_fs_workspace(cinfo);
     /* Initialize the propagated errors to zero. */
-    arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+    arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
     for (i = 0; i < cinfo->out_color_components; i++)
-      jzero_far((void *) cquantize->fserrors[i], arraysize);
+      jzero_far((void *)cquantize->fserrors[i], arraysize);
     break;
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -796,7 +797,7 @@
  */
 
 METHODDEF(void)
-finish_pass_1_quant (j_decompress_ptr cinfo)
+finish_pass_1_quant(j_decompress_ptr cinfo)
 {
   /* no work in 1-pass case */
 }
@@ -808,7 +809,7 @@
  */
 
 METHODDEF(void)
-new_color_map_1_quant (j_decompress_ptr cinfo)
+new_color_map_1_quant(j_decompress_ptr cinfo)
 {
   ERREXIT(cinfo, JERR_MODE_CHANGE);
 }
@@ -819,14 +820,14 @@
  */
 
 GLOBAL(void)
-jinit_1pass_quantizer (j_decompress_ptr cinfo)
+jinit_1pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
 
   cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
   cquantize->pub.start_pass = start_pass_1_quant;
   cquantize->pub.finish_pass = finish_pass_1_quant;
   cquantize->pub.new_color_map = new_color_map_1_quant;
@@ -837,8 +838,8 @@
   if (cinfo->out_color_components > MAX_Q_COMPS)
     ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
   /* Make sure colormap indexes can be represented by JSAMPLEs */
-  if (cinfo->desired_number_of_colors > (MAXJSAMPLE+1))
-    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE+1);
+  if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
 
   /* Create the colormap and color index table. */
   create_colormap(cinfo);
diff --git a/jquant2.c b/jquant2.c
index cfbd0f1..67ebde4 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -77,7 +77,7 @@
 #define G_SCALE 3               /* scale G distances by this much */
 #define B_SCALE 1               /* and B by this much */
 
-static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
+static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
 #define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
 #define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
 #define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
@@ -106,7 +106,7 @@
  * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
  */
 
-#define MAXNUMCOLORS  (MAXJSAMPLE+1) /* maximum size of colormap */
+#define MAXNUMCOLORS  (MAXJSAMPLE + 1) /* maximum size of colormap */
 
 /* These will do the right thing for either R,G,B or B,G,R color order,
  * but you may not like the results for other color orders.
@@ -116,19 +116,19 @@
 #define HIST_C2_BITS  5         /* bits of precision in B/R histogram */
 
 /* Number of elements along histogram axes. */
-#define HIST_C0_ELEMS  (1<<HIST_C0_BITS)
-#define HIST_C1_ELEMS  (1<<HIST_C1_BITS)
-#define HIST_C2_ELEMS  (1<<HIST_C2_BITS)
+#define HIST_C0_ELEMS  (1 << HIST_C0_BITS)
+#define HIST_C1_ELEMS  (1 << HIST_C1_BITS)
+#define HIST_C2_ELEMS  (1 << HIST_C2_BITS)
 
 /* These are the amounts to shift an input value to get a histogram index. */
-#define C0_SHIFT  (BITS_IN_JSAMPLE-HIST_C0_BITS)
-#define C1_SHIFT  (BITS_IN_JSAMPLE-HIST_C1_BITS)
-#define C2_SHIFT  (BITS_IN_JSAMPLE-HIST_C2_BITS)
+#define C0_SHIFT  (BITS_IN_JSAMPLE - HIST_C0_BITS)
+#define C1_SHIFT  (BITS_IN_JSAMPLE - HIST_C1_BITS)
+#define C2_SHIFT  (BITS_IN_JSAMPLE - HIST_C2_BITS)
 
 
 typedef UINT16 histcell;        /* histogram cell; prefer an unsigned type */
 
-typedef histcell *histptr; /* for pointers to histogram cells */
+typedef histcell *histptr;      /* for pointers to histogram cells */
 
 typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
 typedef hist1d *hist2d;         /* type for the 2nd-level pointers */
@@ -200,10 +200,10 @@
  */
 
 METHODDEF(void)
-prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                  JSAMPARRAY output_buf, int num_rows)
+prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                 JSAMPARRAY output_buf, int num_rows)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register JSAMPROW ptr;
   register histptr histp;
   register hist3d histogram = cquantize->histogram;
@@ -215,9 +215,9 @@
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                         [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                         [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
+                        [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
+                        [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -249,7 +249,7 @@
 
 
 LOCAL(boxptr)
-find_biggest_color_pop (boxptr boxlist, int numboxes)
+find_biggest_color_pop(boxptr boxlist, int numboxes)
 /* Find the splittable box with the largest color population */
 /* Returns NULL if no splittable boxes remain */
 {
@@ -269,7 +269,7 @@
 
 
 LOCAL(boxptr)
-find_biggest_volume (boxptr boxlist, int numboxes)
+find_biggest_volume(boxptr boxlist, int numboxes)
 /* Find the splittable box with the largest (scaled) volume */
 /* Returns NULL if no splittable boxes remain */
 {
@@ -289,16 +289,16 @@
 
 
 LOCAL(void)
-update_box (j_decompress_ptr cinfo, boxptr boxp)
+update_box(j_decompress_ptr cinfo, boxptr boxp)
 /* Shrink the min/max bounds of a box to enclose only nonzero elements, */
 /* and recompute its volume and population */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   histptr histp;
-  int c0,c1,c2;
-  int c0min,c0max,c1min,c1max,c2min,c2max;
-  JLONG dist0,dist1,dist2;
+  int c0, c1, c2;
+  int c0min, c0max, c1min, c1max, c2min, c2max;
+  JLONG dist0, dist1, dist2;
   long ccount;
 
   c0min = boxp->c0min;  c0max = boxp->c0max;
@@ -308,69 +308,69 @@
   if (c0max > c0min)
     for (c0 = c0min; c0 <= c0max; c0++)
       for (c1 = c1min; c1 <= c1max; c1++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c0min = c0min = c0;
             goto have_c0min;
           }
       }
- have_c0min:
+have_c0min:
   if (c0max > c0min)
     for (c0 = c0max; c0 >= c0min; c0--)
       for (c1 = c1min; c1 <= c1max; c1++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c0max = c0max = c0;
             goto have_c0max;
           }
       }
- have_c0max:
+have_c0max:
   if (c1max > c1min)
     for (c1 = c1min; c1 <= c1max; c1++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c1min = c1min = c1;
             goto have_c1min;
           }
       }
- have_c1min:
+have_c1min:
   if (c1max > c1min)
     for (c1 = c1max; c1 >= c1min; c1--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c1max = c1max = c1;
             goto have_c1max;
           }
       }
- have_c1max:
+have_c1max:
   if (c2max > c2min)
     for (c2 = c2min; c2 <= c2max; c2++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1min][c2];
+        histp = &histogram[c0][c1min][c2];
         for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
           if (*histp != 0) {
             boxp->c2min = c2min = c2;
             goto have_c2min;
           }
       }
- have_c2min:
+have_c2min:
   if (c2max > c2min)
     for (c2 = c2max; c2 >= c2min; c2--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1min][c2];
+        histp = &histogram[c0][c1min][c2];
         for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
           if (*histp != 0) {
             boxp->c2max = c2max = c2;
             goto have_c2max;
           }
       }
- have_c2max:
+have_c2max:
 
   /* Update box volume.
    * We use 2-norm rather than real volume here; this biases the method
@@ -383,13 +383,13 @@
   dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
   dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
   dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
-  boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2;
+  boxp->volume = dist0 * dist0 + dist1 * dist1 + dist2 * dist2;
 
   /* Now scan remaining volume of box and compute population */
   ccount = 0;
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
-      histp = & histogram[c0][c1][c2min];
+      histp = &histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++, histp++)
         if (*histp != 0) {
           ccount++;
@@ -400,19 +400,19 @@
 
 
 LOCAL(int)
-median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
-            int desired_colors)
+median_cut(j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
+           int desired_colors)
 /* Repeatedly select and split the largest box until we have enough boxes */
 {
-  int n,lb;
-  int c0,c1,c2,cmax;
-  register boxptr b1,b2;
+  int n, lb;
+  int c0, c1, c2, cmax;
+  register boxptr b1, b2;
 
   while (numboxes < desired_colors) {
     /* Select box to split.
      * Current algorithm: by population for first half, then by volume.
      */
-    if (numboxes*2 <= desired_colors) {
+    if (numboxes * 2 <= desired_colors) {
       b1 = find_biggest_color_pop(boxlist, numboxes);
     } else {
       b1 = find_biggest_volume(boxlist, numboxes);
@@ -421,8 +421,8 @@
       break;
     b2 = &boxlist[numboxes];    /* where new box will go */
     /* Copy the color bounds to the new box. */
-    b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
-    b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
+    b2->c0max = b1->c0max;  b2->c1max = b1->c1max;  b2->c2max = b1->c2max;
+    b2->c0min = b1->c0min;  b2->c1min = b1->c1min;  b2->c2min = b1->c2min;
     /* Choose which axis to split the box on.
      * Current algorithm: longest scaled axis.
      * See notes in update_box about scaling distances.
@@ -434,13 +434,12 @@
      * This code does the right thing for R,G,B or B,G,R color orders only.
      */
     if (rgb_red[cinfo->out_color_space] == 0) {
-      cmax = c1; n = 1;
-      if (c0 > cmax) { cmax = c0; n = 0; }
+      cmax = c1;  n = 1;
+      if (c0 > cmax) { cmax = c0;  n = 0; }
       if (c2 > cmax) { n = 2; }
-    }
-    else {
-      cmax = c1; n = 1;
-      if (c2 > cmax) { cmax = c2; n = 2; }
+    } else {
+      cmax = c1;  n = 1;
+      if (c2 > cmax) { cmax = c2;  n = 2; }
       if (c0 > cmax) { n = 0; }
     }
     /* Choose split point along selected axis, and update box bounds.
@@ -453,17 +452,17 @@
     case 0:
       lb = (b1->c0max + b1->c0min) / 2;
       b1->c0max = lb;
-      b2->c0min = lb+1;
+      b2->c0min = lb + 1;
       break;
     case 1:
       lb = (b1->c1max + b1->c1min) / 2;
       b1->c1max = lb;
-      b2->c1min = lb+1;
+      b2->c1min = lb + 1;
       break;
     case 2:
       lb = (b1->c2max + b1->c2min) / 2;
       b1->c2max = lb;
-      b2->c2min = lb+1;
+      b2->c2min = lb + 1;
       break;
     }
     /* Update stats for boxes */
@@ -476,16 +475,16 @@
 
 
 LOCAL(void)
-compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
+compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
 /* Compute representative color for a box, put it in colormap[icolor] */
 {
   /* Current algorithm: mean weighted by pixels (not colors) */
   /* Note it is important to get the rounding correct! */
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   histptr histp;
-  int c0,c1,c2;
-  int c0min,c0max,c1min,c1max,c2min,c2max;
+  int c0, c1, c2;
+  int c0min, c0max, c1min, c1max, c2min, c2max;
   long count;
   long total = 0;
   long c0total = 0;
@@ -498,25 +497,25 @@
 
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
-      histp = & histogram[c0][c1][c2min];
+      histp = &histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++) {
         if ((count = *histp++) != 0) {
           total += count;
-          c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
-          c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
-          c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
+          c0total += ((c0 << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
+          c1total += ((c1 << C1_SHIFT) + ((1 << C1_SHIFT) >> 1)) * count;
+          c2total += ((c2 << C2_SHIFT) + ((1 << C2_SHIFT) >> 1)) * count;
         }
       }
     }
 
-  cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total);
-  cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total);
-  cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total);
+  cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
+  cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
+  cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
 }
 
 
 LOCAL(void)
-select_colors (j_decompress_ptr cinfo, int desired_colors)
+select_colors(j_decompress_ptr cinfo, int desired_colors)
 /* Master routine for color selection */
 {
   boxptr boxlist;
@@ -524,8 +523,8 @@
   int i;
 
   /* Allocate workspace for box list */
-  boxlist = (boxptr) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
+  boxlist = (boxptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
   /* Initialize one box containing whole space */
   numboxes = 1;
   boxlist[0].c0min = 0;
@@ -535,12 +534,12 @@
   boxlist[0].c2min = 0;
   boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT;
   /* Shrink it to actually-used volume and set its statistics */
-  update_box(cinfo, & boxlist[0]);
+  update_box(cinfo, &boxlist[0]);
   /* Perform median-cut to produce final box list */
   numboxes = median_cut(cinfo, boxlist, numboxes, desired_colors);
   /* Compute the representative color for each box, fill colormap */
   for (i = 0; i < numboxes; i++)
-    compute_color(cinfo, & boxlist[i], i);
+    compute_color(cinfo, &boxlist[i], i);
   cinfo->actual_number_of_colors = numboxes;
   TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, numboxes);
 }
@@ -601,13 +600,13 @@
 
 
 /* log2(histogram cells in update box) for each axis; this can be adjusted */
-#define BOX_C0_LOG  (HIST_C0_BITS-3)
-#define BOX_C1_LOG  (HIST_C1_BITS-3)
-#define BOX_C2_LOG  (HIST_C2_BITS-3)
+#define BOX_C0_LOG  (HIST_C0_BITS - 3)
+#define BOX_C1_LOG  (HIST_C1_BITS - 3)
+#define BOX_C2_LOG  (HIST_C2_BITS - 3)
 
-#define BOX_C0_ELEMS  (1<<BOX_C0_LOG) /* # of hist cells in update box */
-#define BOX_C1_ELEMS  (1<<BOX_C1_LOG)
-#define BOX_C2_ELEMS  (1<<BOX_C2_LOG)
+#define BOX_C0_ELEMS  (1 << BOX_C0_LOG) /* # of hist cells in update box */
+#define BOX_C1_ELEMS  (1 << BOX_C1_LOG)
+#define BOX_C2_ELEMS  (1 << BOX_C2_LOG)
 
 #define BOX_C0_SHIFT  (C0_SHIFT + BOX_C0_LOG)
 #define BOX_C1_SHIFT  (C1_SHIFT + BOX_C1_LOG)
@@ -623,8 +622,8 @@
  */
 
 LOCAL(int)
-find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                    JSAMPLE colorlist[])
+find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+                   JSAMPLE colorlist[])
 /* Locate the colormap entries close enough to an update box to be candidates
  * for the nearest entry to some cell(s) in the update box.  The update box
  * is specified by the center coordinates of its first cell.  The number of
@@ -669,67 +668,67 @@
     x = GETJSAMPLE(cinfo->colormap[0][i]);
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
-      min_dist = tdist*tdist;
+      min_dist = tdist * tdist;
       tdist = (x - maxc0) * C0_SCALE;
-      max_dist = tdist*tdist;
+      max_dist = tdist * tdist;
     } else if (x > maxc0) {
       tdist = (x - maxc0) * C0_SCALE;
-      min_dist = tdist*tdist;
+      min_dist = tdist * tdist;
       tdist = (x - minc0) * C0_SCALE;
-      max_dist = tdist*tdist;
+      max_dist = tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       min_dist = 0;
       if (x <= centerc0) {
         tdist = (x - maxc0) * C0_SCALE;
-        max_dist = tdist*tdist;
+        max_dist = tdist * tdist;
       } else {
         tdist = (x - minc0) * C0_SCALE;
-        max_dist = tdist*tdist;
+        max_dist = tdist * tdist;
       }
     }
 
     x = GETJSAMPLE(cinfo->colormap[1][i]);
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - maxc1) * C1_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else if (x > maxc1) {
       tdist = (x - maxc1) * C1_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - minc1) * C1_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc1) {
         tdist = (x - maxc1) * C1_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       } else {
         tdist = (x - minc1) * C1_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       }
     }
 
     x = GETJSAMPLE(cinfo->colormap[2][i]);
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - maxc2) * C2_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else if (x > maxc2) {
       tdist = (x - maxc2) * C2_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - minc2) * C2_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc2) {
         tdist = (x - maxc2) * C2_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       } else {
         tdist = (x - minc2) * C2_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       }
     }
 
@@ -745,15 +744,15 @@
   ncolors = 0;
   for (i = 0; i < numcolors; i++) {
     if (mindist[i] <= minmaxdist)
-      colorlist[ncolors++] = (JSAMPLE) i;
+      colorlist[ncolors++] = (JSAMPLE)i;
   }
   return ncolors;
 }
 
 
 LOCAL(void)
-find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                  int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+                 int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
 /* Find the closest colormap entry for each cell in the update box,
  * given the list of candidate colors prepared by find_nearby_colors.
  * Return the indexes of the closest entries in the bestcolor[] array.
@@ -775,7 +774,7 @@
 
   /* Initialize best-distance for each cell of the update box */
   bptr = bestdist;
-  for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--)
+  for (i = BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS - 1; i >= 0; i--)
     *bptr++ = 0x7FFFFFFFL;
 
   /* For each color selected by find_nearby_colors,
@@ -792,11 +791,11 @@
     icolor = GETJSAMPLE(colorlist[i]);
     /* Compute (square of) distance from minc0/c1/c2 to this color */
     inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
-    dist0 = inc0*inc0;
+    dist0 = inc0 * inc0;
     inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
-    dist0 += inc1*inc1;
+    dist0 += inc1 * inc1;
     inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
-    dist0 += inc2*inc2;
+    dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
     inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1;
@@ -805,16 +804,16 @@
     bptr = bestdist;
     cptr = bestcolor;
     xx0 = inc0;
-    for (ic0 = BOX_C0_ELEMS-1; ic0 >= 0; ic0--) {
+    for (ic0 = BOX_C0_ELEMS - 1; ic0 >= 0; ic0--) {
       dist1 = dist0;
       xx1 = inc1;
-      for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) {
+      for (ic1 = BOX_C1_ELEMS - 1; ic1 >= 0; ic1--) {
         dist2 = dist1;
         xx2 = inc2;
-        for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
+        for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
           if (dist2 < *bptr) {
             *bptr = dist2;
-            *cptr = (JSAMPLE) icolor;
+            *cptr = (JSAMPLE)icolor;
           }
           dist2 += xx2;
           xx2 += 2 * STEP_C2 * STEP_C2;
@@ -832,12 +831,12 @@
 
 
 LOCAL(void)
-fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
+fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
 /* Fill the inverse-colormap entries in the update box that contains */
 /* histogram cell c0/c1/c2.  (Only that one cell MUST be filled, but */
 /* we can fill as many others as we wish.) */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   int minc0, minc1, minc2;      /* lower left corner of update box */
   int ic0, ic1, ic2;
@@ -878,9 +877,9 @@
   cptr = bestcolor;
   for (ic0 = 0; ic0 < BOX_C0_ELEMS; ic0++) {
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
-      cachep = & histogram[c0+ic0][c1+ic1][c2];
+      cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)(GETJSAMPLE(*cptr++) + 1);
       }
     }
   }
@@ -892,11 +891,11 @@
  */
 
 METHODDEF(void)
-pass2_no_dither (j_decompress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* This version performs no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   register JSAMPROW inptr, outptr;
   register histptr cachep;
@@ -913,24 +912,24 @@
       c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
       c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
       c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
-      cachep = & histogram[c0][c1][c2];
+      cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
       if (*cachep == 0)
-        fill_inverse_cmap(cinfo, c0,c1,c2);
+        fill_inverse_cmap(cinfo, c0, c1, c2);
       /* Now emit the colormap index for this cell */
-      *outptr++ = (JSAMPLE) (*cachep - 1);
+      *outptr++ = (JSAMPLE)(*cachep - 1);
     }
   }
 }
 
 
 METHODDEF(void)
-pass2_fs_dither (j_decompress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* This version performs Floyd-Steinberg dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
   LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
@@ -956,11 +955,11 @@
     outptr = output_buf[row];
     if (cquantize->on_odd_row) {
       /* work right to left in this row */
-      inptr += (width-1) * 3;   /* so point to rightmost pixel */
-      outptr += width-1;
+      inptr += (width - 1) * 3; /* so point to rightmost pixel */
+      outptr += width - 1;
       dir = -1;
       dir3 = -3;
-      errorptr = cquantize->fserrors + (width+1)*3; /* => entry after last column */
+      errorptr = cquantize->fserrors + (width + 1) * 3; /* => entry after last column */
       cquantize->on_odd_row = FALSE; /* flip for next time */
     } else {
       /* work left to right in this row */
@@ -984,9 +983,9 @@
        * for either sign of the error value.
        * Note: errorptr points to *previous* column's array entry.
        */
-      cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3+0] + 8, 4);
-      cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3+1] + 8, 4);
-      cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3+2] + 8, 4);
+      cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3 + 0] + 8, 4);
+      cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3 + 1] + 8, 4);
+      cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3 + 2] + 8, 4);
       /* Limit the error using transfer function set by init_error_limit.
        * See comments with init_error_limit for rationale.
        */
@@ -1004,14 +1003,17 @@
       cur1 = GETJSAMPLE(range_limit[cur1]);
       cur2 = GETJSAMPLE(range_limit[cur2]);
       /* Index into the cache with adjusted pixel value */
-      cachep = & histogram[cur0>>C0_SHIFT][cur1>>C1_SHIFT][cur2>>C2_SHIFT];
+      cachep =
+        &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
       /* If we have not seen this color before, find nearest colormap */
       /* entry and update the cache */
       if (*cachep == 0)
-        fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
+        fill_inverse_cmap(cinfo, cur0 >> C0_SHIFT, cur1 >> C1_SHIFT,
+                          cur2 >> C2_SHIFT);
       /* Now emit the colormap index for this cell */
-      { register int pixcode = *cachep - 1;
-        *outptr = (JSAMPLE) pixcode;
+      {
+        register int pixcode = *cachep - 1;
+        *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
         cur0 -= GETJSAMPLE(colormap0[pixcode]);
         cur1 -= GETJSAMPLE(colormap1[pixcode]);
@@ -1021,20 +1023,21 @@
        * Add these into the running sums, and simultaneously shift the
        * next-line error sums left by 1 column.
        */
-      { register LOCFSERROR bnexterr;
+      {
+        register LOCFSERROR bnexterr;
 
         bnexterr = cur0;        /* Process component 0 */
-        errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3);
+        errorptr[0] = (FSERROR)(bpreverr0 + cur0 * 3);
         bpreverr0 = belowerr0 + cur0 * 5;
         belowerr0 = bnexterr;
         cur0 *= 7;
         bnexterr = cur1;        /* Process component 1 */
-        errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3);
+        errorptr[1] = (FSERROR)(bpreverr1 + cur1 * 3);
         bpreverr1 = belowerr1 + cur1 * 5;
         belowerr1 = bnexterr;
         cur1 *= 7;
         bnexterr = cur2;        /* Process component 2 */
-        errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3);
+        errorptr[2] = (FSERROR)(bpreverr2 + cur2 * 3);
         bpreverr2 = belowerr2 + cur2 * 5;
         belowerr2 = bnexterr;
         cur2 *= 7;
@@ -1051,9 +1054,9 @@
      * final fserrors[] entry.  Note we need not unload belowerrN because
      * it is for the dummy column before or after the actual array.
      */
-    errorptr[0] = (FSERROR) bpreverr0; /* unload prev errs into array */
-    errorptr[1] = (FSERROR) bpreverr1;
-    errorptr[2] = (FSERROR) bpreverr2;
+    errorptr[0] = (FSERROR)bpreverr0; /* unload prev errs into array */
+    errorptr[1] = (FSERROR)bpreverr1;
+    errorptr[2] = (FSERROR)bpreverr2;
   }
 }
 
@@ -1076,31 +1079,31 @@
  */
 
 LOCAL(void)
-init_error_limit (j_decompress_ptr cinfo)
+init_error_limit(j_decompress_ptr cinfo)
 /* Allocate and fill in the error_limiter table */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   int *table;
   int in, out;
 
-  table = (int *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int));
+  table = (int *)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
   table += MAXJSAMPLE;          /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
   cquantize->error_limiter = table;
 
-#define STEPSIZE ((MAXJSAMPLE+1)/16)
+#define STEPSIZE ((MAXJSAMPLE + 1) / 16)
   /* Map errors 1:1 up to +- MAXJSAMPLE/16 */
   out = 0;
   for (in = 0; in < STEPSIZE; in++, out++) {
-    table[in] = out; table[-in] = -out;
+    table[in] = out;  table[-in] = -out;
   }
   /* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */
-  for (; in < STEPSIZE*3; in++, out += (in&1) ? 0 : 1) {
-    table[in] = out; table[-in] = -out;
+  for (; in < STEPSIZE * 3; in++, out += (in & 1) ? 0 : 1) {
+    table[in] = out;  table[-in] = -out;
   }
   /* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */
   for (; in <= MAXJSAMPLE; in++) {
-    table[in] = out; table[-in] = -out;
+    table[in] = out;  table[-in] = -out;
   }
 #undef STEPSIZE
 }
@@ -1111,9 +1114,9 @@
  */
 
 METHODDEF(void)
-finish_pass1 (j_decompress_ptr cinfo)
+finish_pass1(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
 
   /* Select the representative colors and fill in cinfo->colormap */
   cinfo->colormap = cquantize->sv_colormap;
@@ -1124,7 +1127,7 @@
 
 
 METHODDEF(void)
-finish_pass2 (j_decompress_ptr cinfo)
+finish_pass2(j_decompress_ptr cinfo)
 {
   /* no work */
 }
@@ -1135,9 +1138,9 @@
  */
 
 METHODDEF(void)
-start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   int i;
 
@@ -1167,14 +1170,14 @@
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
 
     if (cinfo->dither_mode == JDITHER_FS) {
-      size_t arraysize = (size_t) ((cinfo->output_width + 2) *
-                                   (3 * sizeof(FSERROR)));
+      size_t arraysize =
+        (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR)));
       /* Allocate Floyd-Steinberg workspace if we didn't already. */
       if (cquantize->fserrors == NULL)
-        cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+        cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
       /* Initialize the propagated errors to zero. */
-      jzero_far((void *) cquantize->fserrors, arraysize);
+      jzero_far((void *)cquantize->fserrors, arraysize);
       /* Make the error-limit table if we didn't already. */
       if (cquantize->error_limiter == NULL)
         init_error_limit(cinfo);
@@ -1185,8 +1188,8 @@
   /* Zero the histogram or inverse color map, if necessary */
   if (cquantize->needs_zeroed) {
     for (i = 0; i < HIST_C0_ELEMS; i++) {
-      jzero_far((void *) histogram[i],
-                HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+      jzero_far((void *)histogram[i],
+                HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
     }
     cquantize->needs_zeroed = FALSE;
   }
@@ -1198,9 +1201,9 @@
  */
 
 METHODDEF(void)
-new_color_map_2_quant (j_decompress_ptr cinfo)
+new_color_map_2_quant(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
 
   /* Reset the inverse color map */
   cquantize->needs_zeroed = TRUE;
@@ -1212,15 +1215,15 @@
  */
 
 GLOBAL(void)
-jinit_2pass_quantizer (j_decompress_ptr cinfo)
+jinit_2pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
   int i;
 
   cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
   cquantize->pub.start_pass = start_pass_2_quant;
   cquantize->pub.new_color_map = new_color_map_2_quant;
   cquantize->fserrors = NULL;   /* flag optional arrays not allocated */
@@ -1231,12 +1234,12 @@
     ERREXIT(cinfo, JERR_NOTIMPL);
 
   /* Allocate the histogram/inverse colormap storage */
-  cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
+  cquantize->histogram = (hist3d)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
   for (i = 0; i < HIST_C0_ELEMS; i++) {
-    cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+    cquantize->histogram[i] = (hist2d)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
   }
   cquantize->needs_zeroed = TRUE; /* histogram is garbage now */
 
@@ -1254,7 +1257,7 @@
     if (desired > MAXNUMCOLORS)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
     cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)desired, (JDIMENSION)3);
     cquantize->desired = desired;
   } else
     cquantize->sv_colormap = NULL;
@@ -1271,9 +1274,9 @@
    * dither_mode changes.
    */
   if (cinfo->dither_mode == JDITHER_FS) {
-    cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
+    cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
     /* Might as well create the error-limiting table too. */
     init_error_limit(cinfo);
   }
diff --git a/jsimd.h b/jsimd.h
index 3aa0779..1b9af30 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -13,81 +13,93 @@
 
 #include "jchuff.h"             /* Declarations shared with jcphuff.c */
 
-EXTERN(int) jsimd_can_rgb_ycc (void);
-EXTERN(int) jsimd_can_rgb_gray (void);
-EXTERN(int) jsimd_can_ycc_rgb (void);
-EXTERN(int) jsimd_can_ycc_rgb565 (void);
-EXTERN(int) jsimd_c_can_null_convert (void);
+EXTERN(int) jsimd_can_rgb_ycc(void);
+EXTERN(int) jsimd_can_rgb_gray(void);
+EXTERN(int) jsimd_can_ycc_rgb(void);
+EXTERN(int) jsimd_can_ycc_rgb565(void);
+EXTERN(int) jsimd_c_can_null_convert(void);
 
-EXTERN(void) jsimd_rgb_ycc_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_rgb_gray_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_ycc_rgb_convert
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_rgb565_convert
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_c_null_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf, JDIMENSION input_row,
+                                   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo,
+                                      JSAMPIMAGE input_buf,
+                                      JDIMENSION input_row,
+                                      JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                  JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                  int num_rows);
 
-EXTERN(int) jsimd_can_h2v2_downsample (void);
-EXTERN(int) jsimd_can_h2v1_downsample (void);
+EXTERN(int) jsimd_can_h2v2_downsample(void);
+EXTERN(int) jsimd_can_h2v1_downsample(void);
 
-EXTERN(void) jsimd_h2v2_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample(j_compress_ptr cinfo,
+                                   jpeg_component_info *compptr,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data);
 
-EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
+EXTERN(int) jsimd_can_h2v2_smooth_downsample(void);
 
-EXTERN(void) jsimd_h2v2_smooth_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                                          jpeg_component_info *compptr,
+                                          JSAMPARRAY input_data,
+                                          JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v1_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-        JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample(j_compress_ptr cinfo,
+                                   jpeg_component_info *compptr,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data);
 
-EXTERN(int) jsimd_can_h2v2_upsample (void);
-EXTERN(int) jsimd_can_h2v1_upsample (void);
-EXTERN(int) jsimd_can_int_upsample (void);
+EXTERN(int) jsimd_can_h2v2_upsample(void);
+EXTERN(int) jsimd_can_h2v1_upsample(void);
+EXTERN(int) jsimd_can_int_upsample(void);
 
-EXTERN(void) jsimd_h2v2_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_int_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo,
+                                 jpeg_component_info *compptr,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo,
+                                 jpeg_component_info *compptr,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
+                                jpeg_component_info *compptr,
+                                JSAMPARRAY input_data,
+                                JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
-EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
+EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
 
-EXTERN(void) jsimd_h2v2_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
-EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
+EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
 
-EXTERN(void) jsimd_h2v2_merged_upsample
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-EXTERN(void) jsimd_h2v1_merged_upsample
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf);
 
-EXTERN(int) jsimd_can_huff_encode_one_block (void);
+EXTERN(int) jsimd_can_huff_encode_one_block(void);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
+                                             JCOEFPTR block, int last_dc_val,
+                                             c_derived_tbl *dctbl,
+                                             c_derived_tbl *actbl);
diff --git a/jsimd_none.c b/jsimd_none.c
index f29030c..58acbe6 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -20,385 +20,372 @@
 #include "jsimddct.h"
 
 GLOBAL(int)
-jsimd_can_rgb_ycc (void)
+jsimd_can_rgb_ycc(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_rgb_gray (void)
+jsimd_can_rgb_gray(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_ycc_rgb (void)
+jsimd_can_ycc_rgb(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
+jsimd_can_ycc_rgb565(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_c_can_null_convert (void)
+jsimd_c_can_null_convert(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
-                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                      JDIMENSION output_row, int num_rows)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
+jsimd_can_h2v2_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
+jsimd_can_h2v1_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
+jsimd_can_h2v2_smooth_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
-                              jpeg_component_info *compptr,
-                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
+jsimd_can_h2v2_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
+jsimd_can_h2v1_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_int_upsample (void)
+jsimd_can_int_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                      JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
+jsimd_can_h2v2_fancy_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
+jsimd_can_h2v1_fancy_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
+jsimd_can_h2v2_merged_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
+jsimd_can_h2v1_merged_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
 }
 
 GLOBAL(int)
-jsimd_can_convsamp (void)
+jsimd_can_convsamp(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_convsamp_float (void)
+jsimd_can_convsamp_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
 {
 }
 
 GLOBAL(int)
-jsimd_can_fdct_islow (void)
+jsimd_can_fdct_islow(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_fdct_ifast (void)
+jsimd_can_fdct_ifast(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_fdct_float (void)
+jsimd_can_fdct_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
+jsimd_fdct_islow(DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
+jsimd_fdct_ifast(DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
+jsimd_fdct_float(FAST_FLOAT *data)
 {
 }
 
 GLOBAL(int)
-jsimd_can_quantize (void)
+jsimd_can_quantize(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_quantize_float (void)
+jsimd_can_quantize_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
 {
 }
 
 GLOBAL(int)
-jsimd_can_idct_2x2 (void)
+jsimd_can_idct_2x2(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_4x4 (void)
+jsimd_can_idct_4x4(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_6x6 (void)
+jsimd_can_idct_6x6(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_12x12 (void)
+jsimd_can_idct_12x12(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(int)
-jsimd_can_idct_islow (void)
+jsimd_can_idct_islow(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_ifast (void)
+jsimd_can_idct_ifast(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_float (void)
+jsimd_can_idct_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
+jsimd_can_huff_encode_one_block(void)
 {
   return 0;
 }
 
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
 {
   return NULL;
 }
diff --git a/jsimddct.h b/jsimddct.h
index b19ab48..55ee8cf 100644
--- a/jsimddct.h
+++ b/jsimddct.h
@@ -9,66 +9,62 @@
  *
  */
 
-EXTERN(int) jsimd_can_convsamp (void);
-EXTERN(int) jsimd_can_convsamp_float (void);
+EXTERN(int) jsimd_can_convsamp(void);
+EXTERN(int) jsimd_can_convsamp_float(void);
 
-EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                             DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
-                                   JDIMENSION start_col,
-                                   FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                                  FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_fdct_islow (void);
-EXTERN(int) jsimd_can_fdct_ifast (void);
-EXTERN(int) jsimd_can_fdct_float (void);
+EXTERN(int) jsimd_can_fdct_islow(void);
+EXTERN(int) jsimd_can_fdct_ifast(void);
+EXTERN(int) jsimd_can_fdct_float(void);
 
-EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
-EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_islow(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float(FAST_FLOAT *data);
 
-EXTERN(int) jsimd_can_quantize (void);
-EXTERN(int) jsimd_can_quantize_float (void);
+EXTERN(int) jsimd_can_quantize(void);
+EXTERN(int) jsimd_can_quantize_float(void);
 
-EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                             DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                                   FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                                  FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_idct_2x2 (void);
-EXTERN(int) jsimd_can_idct_4x4 (void);
-EXTERN(int) jsimd_can_idct_6x6 (void);
-EXTERN(int) jsimd_can_idct_12x12 (void);
+EXTERN(int) jsimd_can_idct_2x2(void);
+EXTERN(int) jsimd_can_idct_4x4(void);
+EXTERN(int) jsimd_can_idct_6x6(void);
+EXTERN(int) jsimd_can_idct_12x12(void);
 
-EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_2x2(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
 
-EXTERN(int) jsimd_can_idct_islow (void);
-EXTERN(int) jsimd_can_idct_ifast (void);
-EXTERN(int) jsimd_can_idct_float (void);
+EXTERN(int) jsimd_can_idct_islow(void);
+EXTERN(int) jsimd_can_idct_ifast(void);
+EXTERN(int) jsimd_can_idct_float(void);
 
-EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
-EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
-EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_islow(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
diff --git a/jstdhuff.c b/jstdhuff.c
index e202e8e..036d649 100644
--- a/jstdhuff.c
+++ b/jstdhuff.c
@@ -17,8 +17,8 @@
  */
 
 LOCAL(void)
-add_huff_table (j_common_ptr cinfo,
-                JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
+               const UINT8 *val)
 /* Define a Huffman table */
 {
   int nsymbols, len;
@@ -50,71 +50,79 @@
 
 
 LOCAL(void)
-std_huff_tables (j_common_ptr cinfo)
+std_huff_tables(j_common_ptr cinfo)
 /* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
 /* IMPORTANT: these are only valid for 8-bit data precision! */
 {
   JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
 
-  static const UINT8 bits_dc_luminance[17] =
-    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_luminance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+  static const UINT8 bits_dc_luminance[17] = {
+    /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_luminance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
 
-  static const UINT8 bits_dc_chrominance[17] =
-    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_chrominance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+  static const UINT8 bits_dc_chrominance[17] = {
+    /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_chrominance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
 
-  static const UINT8 bits_ac_luminance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
-  static const UINT8 val_ac_luminance[] =
-    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
-      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
-      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
-      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
-      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
-      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
+  static const UINT8 bits_ac_luminance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+  };
+  static const UINT8 val_ac_luminance[] = {
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
 
-  static const UINT8 bits_ac_chrominance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
-  static const UINT8 val_ac_chrominance[] =
-    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
-      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
-      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
-      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
-      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
-      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
-      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
-      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
-      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
+  static const UINT8 bits_ac_chrominance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+  };
+  static const UINT8 val_ac_chrominance[] = {
+    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
 
   if (cinfo->is_decompressor) {
     dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
diff --git a/jutils.c b/jutils.c
index f9d3502..5c5bb17 100644
--- a/jutils.c
+++ b/jutils.c
@@ -53,7 +53,7 @@
  * fake entries.
  */
 
-const int jpeg_natural_order[DCTSIZE2+16] = {
+const int jpeg_natural_order[DCTSIZE2 + 16] = {
   0,  1,  8, 16,  9,  2,  3, 10,
  17, 24, 32, 25, 18, 11,  4,  5,
  12, 19, 26, 33, 40, 48, 41, 34,
@@ -72,7 +72,7 @@
  */
 
 GLOBAL(long)
-jdiv_round_up (long a, long b)
+jdiv_round_up(long a, long b)
 /* Compute a/b rounded up to next integer, ie, ceil(a/b) */
 /* Assumes a >= 0, b > 0 */
 {
@@ -81,7 +81,7 @@
 
 
 GLOBAL(long)
-jround_up (long a, long b)
+jround_up(long a, long b)
 /* Compute a rounded up to next multiple of b, ie, ceil(a/b)*b */
 /* Assumes a >= 0, b > 0 */
 {
@@ -91,9 +91,9 @@
 
 
 GLOBAL(void)
-jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-                   JSAMPARRAY output_array, int dest_row,
-                   int num_rows, JDIMENSION num_cols)
+jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+                  JSAMPARRAY output_array, int dest_row, int num_rows,
+                  JDIMENSION num_cols)
 /* Copy some rows of samples from one place to another.
  * num_rows rows are copied from input_array[source_row++]
  * to output_array[dest_row++]; these areas may overlap for duplication.
@@ -101,7 +101,7 @@
  */
 {
   register JSAMPROW inptr, outptr;
-  register size_t count = (size_t) (num_cols * sizeof(JSAMPLE));
+  register size_t count = (size_t)(num_cols * sizeof(JSAMPLE));
   register int row;
 
   input_array += source_row;
@@ -116,8 +116,8 @@
 
 
 GLOBAL(void)
-jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
-                 JDIMENSION num_blocks)
+jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+                JDIMENSION num_blocks)
 /* Copy a row of coefficient blocks from one place to another. */
 {
   MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
@@ -125,7 +125,7 @@
 
 
 GLOBAL(void)
-jzero_far (void *target, size_t bytestozero)
+jzero_far(void *target, size_t bytestozero)
 /* Zero out a chunk of memory. */
 /* This might be sample-array data, block-array data, or alloc_large data. */
 {
diff --git a/jversion.h b/jversion.h
index 7e44eaa..8612a8e 100644
--- a/jversion.h
+++ b/jversion.h
@@ -38,12 +38,13 @@
 #define JCOPYRIGHT      "Copyright (C) 2009-2017 D. R. Commander\n" \
                         "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
                         "Copyright (C) 2015-2016 Matthieu Darbois\n" \
+                        "Copyright (C) 2015 Intel Corporation\n" \
                         "Copyright (C) 2015 Google, Inc.\n" \
                         "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
                         "Copyright (C) 2013 Linaro Limited\n" \
                         "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
                         "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
                         "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-                        "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \
+                        "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT "Copyright (C) 1991-2017 The libjpeg-turbo Project and many others"
diff --git a/libjpeg.txt b/libjpeg.txt
index 5181afc..7e2341c 100644
--- a/libjpeg.txt
+++ b/libjpeg.txt
@@ -11,7 +11,7 @@
 This file describes how to use the IJG JPEG library within an application
 program.  Read it if you want to write a program that uses the library.
 
-The file example.c provides heavily commented skeleton code for calling the
+The file example.txt provides heavily commented skeleton code for calling the
 JPEG library.  Also see jpeglib.h (the include file to be used by application
 programs) for full details about data structures and function parameter lists.
 The library source code, of course, is the ultimate reference.
@@ -47,6 +47,7 @@
         Buffered-image mode
         Abbreviated datastreams and multiple images
         Special markers
+        ICC profiles
         Raw (downsampled) image data
         Really raw data: DCT coefficients
         Progress monitoring
@@ -401,7 +402,7 @@
 "while (cinfo.next_scanline < cinfo.image_height)".
 
 Code for this step depends heavily on the way that you store the source data.
-example.c shows the following code for the case of a full-size 2-D source
+example.txt shows the following code for the case of a full-size 2-D source
 array containing 3-byte RGB pixels:
 
         JSAMPROW row_pointer[1];        /* pointer to a single row */
@@ -410,7 +411,7 @@
         row_stride = image_width * 3;   /* JSAMPLEs per row in image_buffer */
 
         while (cinfo.next_scanline < cinfo.image_height) {
-            row_pointer[0] = & image_buffer[cinfo.next_scanline * row_stride];
+            row_pointer[0] = &image_buffer[cinfo.next_scanline * row_stride];
             jpeg_write_scanlines(&cinfo, row_pointer, 1);
         }
 
@@ -960,8 +961,8 @@
 Compression parameters (cinfo fields) include:
 
 boolean arith_code
-	If TRUE, use arithmetic coding.
-	If FALSE, use Huffman coding.
+        If TRUE, use arithmetic coding.
+        If FALSE, use Huffman coding.
 
 J_DCT_METHOD dct_method
         Selects the algorithm used for the DCT step.  Choices are:
@@ -1436,7 +1437,7 @@
 routines will cause a message to be printed on stderr, followed by exit().
 You can supply your own error handling routines to override this behavior
 and to control the treatment of nonfatal warnings and trace/debug messages.
-The file example.c illustrates the most common case, which is to have the
+The file example.txt illustrates the most common case, which is to have the
 application regain control after an error rather than exiting.
 
 The JPEG library never writes any message directly; it always goes through
@@ -1453,7 +1454,7 @@
 only replacing some of the routines depending on the behavior you need.
 This is accomplished by calling jpeg_std_error() as usual, but then overriding
 some of the method pointers in the jpeg_error_mgr struct, as illustrated by
-example.c.
+example.txt.
 
 All of the error handling routines will receive a pointer to the JPEG object
 (a j_common_ptr which points to either a jpeg_compress_struct or a
@@ -1464,7 +1465,7 @@
 handler.  The most convenient way to do this is to embed either the JPEG
 object or the jpeg_error_mgr struct in a larger structure that contains
 additional fields; then casting the passed pointer provides access to the
-additional fields.  Again, see example.c for one way to do it.  (Beginning
+additional fields.  Again, see example.txt for one way to do it.  (Beginning
 with IJG version 6b, there is also a void pointer "client_data" in each
 JPEG object, which the application can also use to find related data.
 The library does not touch client_data at all.)
@@ -1973,7 +1974,7 @@
 The simplest approach to displaying progressive images is to do one display
 pass for each scan appearing in the input file.  In this case the outer loop
 condition is typically
-        while (! jpeg_input_complete(&cinfo))
+        while (!jpeg_input_complete(&cinfo))
 and the start-output call should read
         jpeg_start_output(&cinfo, cinfo.input_scan_number);
 The second parameter to jpeg_start_output() indicates which scan of the input
@@ -2094,7 +2095,7 @@
             jpeg_start_output(&cinfo, cinfo.input_scan_number);
             ...
             jpeg_finish_output()
-        } while (! final_pass);
+        } while (!final_pass);
 rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
 arrangement makes it simple to use higher-quality decoding parameters
 for the final pass.  But if you don't want to use special parameters for
@@ -2633,6 +2634,44 @@
 Also, see jpegtran.c for an example of using jpeg_save_markers.
 
 
+ICC profiles
+------------
+
+Two functions are provided for writing and reading International Color
+Consortium (ICC) device profiles embedded in JFIF JPEG image files:
+
+        void jpeg_write_icc_profile(j_compress_ptr cinfo,
+                                    const JOCTET *icc_data_ptr,
+                                    unsigned int icc_data_len);
+        boolean jpeg_read_icc_profile(j_decompress_ptr cinfo,
+                                      JOCTET **icc_data_ptr,
+                                      unsigned int *icc_data_len);
+
+The ICC has defined a standard for including such data in JPEG "APP2" markers.
+The aforementioned functions do not know anything about the internal structure
+of the ICC profile data; they just know how to embed the profile data into a
+JPEG file while writing it, or to extract the profile data from a JPEG file
+while reading it.
+
+jpeg_write_icc_profile() must be called after calling jpeg_start_compress() and
+before the first call to jpeg_write_scanlines() or jpeg_write_raw_data().  This
+ordering ensures that the APP2 marker(s) will appear after the SOI and JFIF or
+Adobe markers, but before all other data.
+
+jpeg_read_icc_profile() returns TRUE if an ICC profile was found and FALSE
+otherwise.  If an ICC profile was found, then the function will allocate a
+memory region containing the profile and will return a pointer to that memory
+region in *icc_data_ptr, as well as the length of the region in *icc_data_len.
+This memory region is allocated by the library using malloc() and must be freed
+by the caller using free() when the memory region is no longer needed.  Callers
+wishing to use jpeg_read_icc_profile() must call
+
+        jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
+
+prior to calling jpeg_read_header().  jpeg_read_icc_profile() can be called at
+any point between jpeg_read_header() and jpeg_finish_decompress().
+
+
 Raw (downsampled) image data
 ----------------------------
 
@@ -2929,7 +2968,7 @@
 jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
 memory manager yourself to allocate structures that will automatically be
 freed at these times.  Typical code for this is
-  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, size);
+  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, size);
 Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
 Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
 There are also alloc_sarray and alloc_barray routines that automatically
diff --git a/md5/Makefile.am b/md5/Makefile.am
deleted file mode 100644
index b36f019..0000000
--- a/md5/Makefile.am
+++ /dev/null
@@ -1,4 +0,0 @@
-noinst_PROGRAMS = md5cmp
-
-md5cmp_SOURCES = md5cmp.c md5.c md5hl.c md5.h
-md5cmp_CFLAGS = -I$(srcdir)
diff --git a/md5/md5.c b/md5/md5.c
index 4b5ba5e..2c8631a 100644
--- a/md5/md5.c
+++ b/md5/md5.c
@@ -1,8 +1,8 @@
 /*
  * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
  *
- * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
- * rights reserved.
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991.
+ *                       All rights reserved.
  *
  * License to copy and use this software is granted provided that it
  * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
@@ -61,14 +61,14 @@
  * a multiple of 4.
  */
 
-static void
-Encode (unsigned char *output, unsigned int *input, unsigned int len)
+static void Encode(unsigned char *output, unsigned int *input,
+                   unsigned int len)
 {
-	unsigned int i;
-	unsigned int *op = (unsigned int *)output;
+  unsigned int i;
+  unsigned int *op = (unsigned int *)output;
 
-	for (i = 0; i < len / 4; i++)
-		op[i] = htole32(input[i]);
+  for (i = 0; i < len / 4; i++)
+    op[i] = htole32(input[i]);
 }
 
 /*
@@ -76,14 +76,14 @@
  * a multiple of 4.
  */
 
-static void
-Decode (unsigned int *output, const unsigned char *input, unsigned int len)
+static void Decode(unsigned int *output, const unsigned char *input,
+                   unsigned int len)
 {
-	unsigned int i;
-	const unsigned int *ip = (const unsigned int *)input;
+  unsigned int i;
+  const unsigned int *ip = (const unsigned int *)input;
 
-	for (i = 0; i < len / 4; i++)
-		output[i] = le32toh(ip[i]);
+  for (i = 0; i < len / 4; i++)
+    output[i] = le32toh(ip[i]);
 }
 #endif
 
@@ -100,47 +100,44 @@
 #define I(x, y, z) ((y) ^ ((x) | (~z)))
 
 /* ROTATE_LEFT rotates x left n bits. */
-#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
 
 /*
  * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
  * Rotation is separate from addition to prevent recomputation.
  */
 #define FF(a, b, c, d, x, s, ac) { \
-	(a) += F ((b), (c), (d)) + (x) + (unsigned int)(ac); \
-	(a) = ROTATE_LEFT ((a), (s)); \
-	(a) += (b); \
-	}
+  (a) += F((b), (c), (d)) + (x) + (unsigned int)(ac); \
+  (a) = ROTATE_LEFT((a), (s)); \
+  (a) += (b); \
+}
 #define GG(a, b, c, d, x, s, ac) { \
-	(a) += G ((b), (c), (d)) + (x) + (unsigned int)(ac); \
-	(a) = ROTATE_LEFT ((a), (s)); \
-	(a) += (b); \
-	}
+  (a) += G((b), (c), (d)) + (x) + (unsigned int)(ac); \
+  (a) = ROTATE_LEFT((a), (s)); \
+  (a) += (b); \
+}
 #define HH(a, b, c, d, x, s, ac) { \
-	(a) += H ((b), (c), (d)) + (x) + (unsigned int)(ac); \
-	(a) = ROTATE_LEFT ((a), (s)); \
-	(a) += (b); \
-	}
+  (a) += H((b), (c), (d)) + (x) + (unsigned int)(ac); \
+  (a) = ROTATE_LEFT((a), (s)); \
+  (a) += (b); \
+}
 #define II(a, b, c, d, x, s, ac) { \
-	(a) += I ((b), (c), (d)) + (x) + (unsigned int)(ac); \
-	(a) = ROTATE_LEFT ((a), (s)); \
-	(a) += (b); \
-	}
+  (a) += I((b), (c), (d)) + (x) + (unsigned int)(ac); \
+  (a) = ROTATE_LEFT((a), (s)); \
+  (a) += (b); \
+}
 
 /* MD5 initialization. Begins an MD5 operation, writing a new context. */
 
-void
-MD5Init (context)
-	MD5_CTX *context;
+void MD5Init(MD5_CTX *context)
 {
+  context->count[0] = context->count[1] = 0;
 
-	context->count[0] = context->count[1] = 0;
-
-	/* Load magic initialization constants.  */
-	context->state[0] = 0x67452301;
-	context->state[1] = 0xefcdab89;
-	context->state[2] = 0x98badcfe;
-	context->state[3] = 0x10325476;
+  /* Load magic initialization constants.  */
+  context->state[0] = 0x67452301;
+  context->state[1] = 0xefcdab89;
+  context->state[2] = 0x98badcfe;
+  context->state[3] = 0x10325476;
 }
 
 /*
@@ -149,66 +146,57 @@
  * context.
  */
 
-void
-MD5Update (context, in, inputLen)
-	MD5_CTX *context;
-	const void *in;
-	unsigned int inputLen;
+void MD5Update(MD5_CTX *context, const void *in, unsigned int inputLen)
 {
-	unsigned int i, idx, partLen;
-	const unsigned char *input = in;
+  unsigned int i, idx, partLen;
+  const unsigned char *input = in;
 
-	/* Compute number of bytes mod 64 */
-	idx = (unsigned int)((context->count[0] >> 3) & 0x3F);
+  /* Compute number of bytes mod 64 */
+  idx = (unsigned int)((context->count[0] >> 3) & 0x3F);
 
-	/* Update number of bits */
-	if ((context->count[0] += ((unsigned int)inputLen << 3))
-	    < ((unsigned int)inputLen << 3))
-		context->count[1]++;
-	context->count[1] += ((unsigned int)inputLen >> 29);
+  /* Update number of bits */
+  if ((context->count[0] += ((unsigned int)inputLen << 3)) <
+      ((unsigned int)inputLen << 3))
+    context->count[1]++;
+  context->count[1] += ((unsigned int)inputLen >> 29);
 
-	partLen = 64 - idx;
+  partLen = 64 - idx;
 
-	/* Transform as many times as possible. */
-	if (inputLen >= partLen) {
-		memcpy((void *)&context->buffer[idx], (const void *)input,
-		    partLen);
-		MD5Transform (context->state, context->buffer);
+  /* Transform as many times as possible. */
+  if (inputLen >= partLen) {
+    memcpy((void *)&context->buffer[idx], (const void *)input, partLen);
+    MD5Transform(context->state, context->buffer);
 
-		for (i = partLen; i + 63 < inputLen; i += 64)
-			MD5Transform (context->state, &input[i]);
+    for (i = partLen; i + 63 < inputLen; i += 64)
+      MD5Transform(context->state, &input[i]);
 
-		idx = 0;
-	}
-	else
-		i = 0;
+    idx = 0;
+  } else
+    i = 0;
 
-	/* Buffer remaining input */
-	memcpy ((void *)&context->buffer[idx], (const void *)&input[i],
-	    inputLen-i);
+  /* Buffer remaining input */
+  memcpy((void *)&context->buffer[idx], (const void *)&input[i], inputLen - i);
 }
 
 /*
  * MD5 padding. Adds padding followed by original length.
  */
 
-void
-MD5Pad (context)
-	MD5_CTX *context;
+void MD5Pad(MD5_CTX *context)
 {
-	unsigned char bits[8];
-	unsigned int idx, padLen;
+  unsigned char bits[8];
+  unsigned int idx, padLen;
 
-	/* Save number of bits */
-	Encode (bits, context->count, 8);
+  /* Save number of bits */
+  Encode(bits, context->count, 8);
 
-	/* Pad out to 56 mod 64. */
-	idx = (unsigned int)((context->count[0] >> 3) & 0x3f);
-	padLen = (idx < 56) ? (56 - idx) : (120 - idx);
-	MD5Update (context, PADDING, padLen);
+  /* Pad out to 56 mod 64. */
+  idx = (unsigned int)((context->count[0] >> 3) & 0x3f);
+  padLen = (idx < 56) ? (56 - idx) : (120 - idx);
+  MD5Update(context, PADDING, padLen);
 
-	/* Append length (before padding) */
-	MD5Update (context, bits, 8);
+  /* Append length (before padding) */
+  MD5Update(context, bits, 8);
 }
 
 /*
@@ -216,125 +204,119 @@
  * the message digest and zeroizing the context.
  */
 
-void
-MD5Final (digest, context)
-	unsigned char digest[16];
-	MD5_CTX *context;
+void MD5Final(unsigned char digest[16], MD5_CTX *context)
 {
-	/* Do padding. */
-	MD5Pad (context);
+  /* Do padding. */
+  MD5Pad(context);
 
-	/* Store state in digest */
-	Encode (digest, context->state, 16);
+  /* Store state in digest */
+  Encode(digest, context->state, 16);
 
-	/* Zeroize sensitive information. */
-	memset ((void *)context, 0, sizeof (*context));
+  /* Zeroize sensitive information. */
+  memset((void *)context, 0, sizeof(*context));
 }
 
 /* MD5 basic transformation. Transforms state based on block. */
 
-static void
-MD5Transform (state, block)
-	unsigned int state[4];
-	const unsigned char block[64];
+static void MD5Transform(unsigned int state[4], const unsigned char block[64])
 {
-	unsigned int a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+  unsigned int a = state[0], b = state[1], c = state[2], d = state[3], x[16];
 
-	Decode (x, block, 64);
+  Decode(x, block, 64);
 
-	/* Round 1 */
+  /* Round 1 */
 #define S11 7
 #define S12 12
 #define S13 17
 #define S14 22
-	FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
-	FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
-	FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
-	FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
-	FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
-	FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
-	FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
-	FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
-	FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
-	FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
-	FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
-	FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
-	FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
-	FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
-	FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
-	FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+  FF(a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+  FF(d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+  FF(c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+  FF(b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+  FF(a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+  FF(d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+  FF(c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+  FF(b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+  FF(a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+  FF(d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+  FF(c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+  FF(b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+  FF(a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+  FF(d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+  FF(c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+  FF(b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
 
-	/* Round 2 */
+  /* Round 2 */
 #define S21 5
 #define S22 9
 #define S23 14
 #define S24 20
-	GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
-	GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
-	GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
-	GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
-	GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
-	GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
-	GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
-	GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
-	GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
-	GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
-	GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
-	GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
-	GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
-	GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
-	GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
-	GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+  GG(a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+  GG(d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+  GG(c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+  GG(b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+  GG(a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+  GG(d, a, b, c, x[10], S22,  0x2441453); /* 22 */
+  GG(c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+  GG(b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+  GG(a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+  GG(d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+  GG(c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+  GG(b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+  GG(a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+  GG(d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+  GG(c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+  GG(b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
 
-	/* Round 3 */
+  /* Round 3 */
 #define S31 4
 #define S32 11
 #define S33 16
 #define S34 23
-	HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
-	HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
-	HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
-	HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
-	HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
-	HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
-	HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
-	HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
-	HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
-	HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
-	HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
-	HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
-	HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
-	HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
-	HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
-	HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+  HH(a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+  HH(d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+  HH(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+  HH(b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+  HH(a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+  HH(d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+  HH(c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+  HH(b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+  HH(a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+  HH(d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+  HH(c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+  HH(b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
+  HH(a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+  HH(d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+  HH(c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+  HH(b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
 
-	/* Round 4 */
+  /* Round 4 */
 #define S41 6
 #define S42 10
 #define S43 15
 #define S44 21
-	II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
-	II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
-	II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
-	II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
-	II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
-	II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
-	II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
-	II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
-	II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
-	II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
-	II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
-	II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
-	II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
-	II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
-	II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
-	II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+  II(a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+  II(d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+  II(c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+  II(b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+  II(a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+  II(d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+  II(c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+  II(b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+  II(a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+  II(d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+  II(c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+  II(b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+  II(a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+  II(d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+  II(c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+  II(b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
 
-	state[0] += a;
-	state[1] += b;
-	state[2] += c;
-	state[3] += d;
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
 
-	/* Zeroize sensitive information. */
-	memset ((void *)x, 0, sizeof (x));
+  /* Zeroize sensitive information. */
+  memset((void *)x, 0, sizeof(x));
 }
diff --git a/md5/md5.h b/md5/md5.h
index 551e252..9115e9e 100644
--- a/md5/md5.h
+++ b/md5/md5.h
@@ -3,8 +3,8 @@
  */
 
 /*-
- Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
-rights reserved.
+ Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991.
+                       All rights reserved.
 
 License to copy and use this software is granted provided that it
 is identified as the "RSA Data Security, Inc. MD5 Message-Digest
@@ -28,22 +28,24 @@
 #ifndef _SYS_MD5_H_
 #define _SYS_MD5_H_
 
-#define MD5_BLOCK_LENGTH		64
-#define MD5_DIGEST_LENGTH		16
-#define MD5_DIGEST_STRING_LENGTH	(MD5_DIGEST_LENGTH * 2 + 1)
+#include <sys/types.h>
+
+#define MD5_BLOCK_LENGTH                64
+#define MD5_DIGEST_LENGTH               16
+#define MD5_DIGEST_STRING_LENGTH        (MD5_DIGEST_LENGTH * 2 + 1)
 
 /* MD5 context. */
 typedef struct MD5Context {
-  unsigned int state[4];	/* state (ABCD) */
-  unsigned int count[2];	/* number of bits, modulo 2^64 (lsb first) */
-  unsigned char buffer[64];	/* input buffer */
+  unsigned int state[4];        /* state (ABCD) */
+  unsigned int count[2];        /* number of bits, modulo 2^64 (lsb first) */
+  unsigned char buffer[64];     /* input buffer */
 } MD5_CTX;
 
-void   MD5Init (MD5_CTX *);
-void   MD5Update (MD5_CTX *, const void *, unsigned int);
-void   MD5Final (unsigned char [16], MD5_CTX *);
-char * MD5End(MD5_CTX *, char *);
-char * MD5File(const char *, char *);
-char * MD5FileChunk(const char *, char *, off_t, off_t);
-char * MD5Data(const void *, unsigned int, char *);
+void  MD5Init(MD5_CTX *);
+void  MD5Update(MD5_CTX *, const void *, unsigned int);
+void  MD5Final(unsigned char [16], MD5_CTX *);
+char *MD5End(MD5_CTX *, char *);
+char *MD5File(const char *, char *);
+char *MD5FileChunk(const char *, char *, off_t, off_t);
+char *MD5Data(const void *, unsigned int, char *);
 #endif /* _SYS_MD5_H_ */
diff --git a/md5/md5cmp.c b/md5/md5cmp.c
index dfd60bd..42b94ce 100644
--- a/md5/md5cmp.c
+++ b/md5/md5cmp.c
@@ -28,33 +28,32 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <sys/types.h>
 #include "./md5.h"
 #include "../tjutil.h"
 
 int main(int argc, char *argv[])
 {
-	char *md5sum = NULL, buf[65];
+  char *md5sum = NULL, buf[65];
 
-	if (argc < 3) {
-		fprintf(stderr, "USAGE: %s <correct MD5 sum> <file>\n", argv[0]);
-		return -1;
-	}
+  if (argc < 3) {
+    fprintf(stderr, "USAGE: %s <correct MD5 sum> <file>\n", argv[0]);
+    return -1;
+  }
 
-	if (strlen(argv[1]) != 32)
-		fprintf(stderr, "WARNING: MD5 hash size is wrong.\n");
+  if (strlen(argv[1]) != 32)
+    fprintf(stderr, "WARNING: MD5 hash size is wrong.\n");
 
-	md5sum = MD5File(argv[2], buf);
-	if (!md5sum) {
-		perror("Could not obtain MD5 sum");
-		return -1;
-	}
+  md5sum = MD5File(argv[2], buf);
+  if (!md5sum) {
+    perror("Could not obtain MD5 sum");
+    return -1;
+  }
 
-	if (!strcasecmp(md5sum, argv[1])) {
-		fprintf(stderr, "%s: OK\n", argv[2]);
-		return 0;
-	} else {
-		fprintf(stderr, "%s: FAILED.  Checksum is %s\n", argv[2], md5sum);
-		return -1;
-	}
+  if (!strcasecmp(md5sum, argv[1])) {
+    fprintf(stderr, "%s: OK\n", argv[2]);
+    return 0;
+  } else {
+    fprintf(stderr, "%s: FAILED.  Checksum is %s\n", argv[2], md5sum);
+    return -1;
+  }
 }
diff --git a/md5/md5hl.c b/md5/md5hl.c
index 983ea76..189a66e 100644
--- a/md5/md5hl.c
+++ b/md5/md5hl.c
@@ -1,4 +1,5 @@
-/* mdXhl.c * ----------------------------------------------------------------------------
+/* mdXhl.c
+ * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
@@ -32,83 +33,79 @@
 
 #include "./md5.h"
 
-char *
-MD5End(MD5_CTX *ctx, char *buf)
+char *MD5End(MD5_CTX *ctx, char *buf)
 {
-	int i;
-	unsigned char digest[LENGTH];
-	static const char hex[]="0123456789abcdef";
+  int i;
+  unsigned char digest[LENGTH];
+  static const char hex[] = "0123456789abcdef";
 
-	if (!buf)
-		buf = malloc(2*LENGTH + 1);
-	if (!buf)
-		return 0;
-	MD5Final(digest, ctx);
-	for (i = 0; i < LENGTH; i++) {
-		buf[i+i] = hex[digest[i] >> 4];
-		buf[i+i+1] = hex[digest[i] & 0x0f];
-	}
-	buf[i+i] = '\0';
-	return buf;
+  if (!buf)
+    buf = malloc(2 * LENGTH + 1);
+  if (!buf)
+    return 0;
+  MD5Final(digest, ctx);
+  for (i = 0; i < LENGTH; i++) {
+    buf[i + i] = hex[digest[i] >> 4];
+    buf[i + i + 1] = hex[digest[i] & 0x0f];
+  }
+  buf[i + i] = '\0';
+  return buf;
 }
 
-char *
-MD5File(const char *filename, char *buf)
+char *MD5File(const char *filename, char *buf)
 {
-	return (MD5FileChunk(filename, buf, 0, 0));
+  return (MD5FileChunk(filename, buf, 0, 0));
 }
 
-char *
-MD5FileChunk(const char *filename, char *buf, off_t ofs, off_t len)
+char *MD5FileChunk(const char *filename, char *buf, off_t ofs, off_t len)
 {
-	unsigned char buffer[BUFSIZ];
-	MD5_CTX ctx;
-	struct stat stbuf;
-	int f, i, e;
-	off_t n;
+  unsigned char buffer[BUFSIZ];
+  MD5_CTX ctx;
+  struct stat stbuf;
+  int f, i, e;
+  off_t n;
 
-	MD5Init(&ctx);
+  MD5Init(&ctx);
 #if _WIN32
-	f = _open(filename, O_RDONLY|O_BINARY);
+  f = _open(filename, O_RDONLY | O_BINARY);
 #else
-	f = open(filename, O_RDONLY);
+  f = open(filename, O_RDONLY);
 #endif
-	if (f < 0)
-		return 0;
-	if (fstat(f, &stbuf) < 0)
-		return 0;
-	if (ofs > stbuf.st_size)
-		ofs = stbuf.st_size;
-	if ((len == 0) || (len > stbuf.st_size - ofs))
-		len = stbuf.st_size - ofs;
-	if (lseek(f, ofs, SEEK_SET) < 0)
-		return 0;
-	n = len;
-	i = 0;
-	while (n > 0) {
-		if (n > sizeof(buffer))
-			i = read(f, buffer, sizeof(buffer));
-		else
-			i = read(f, buffer, n);
-		if (i < 0)
-			break;
-		MD5Update(&ctx, buffer, i);
-		n -= i;
-	}
-	e = errno;
-	close(f);
-	errno = e;
-	if (i < 0)
-		return 0;
-	return (MD5End(&ctx, buf));
+  if (f < 0)
+    return 0;
+  if (fstat(f, &stbuf) < 0)
+    return 0;
+  if (ofs > stbuf.st_size)
+    ofs = stbuf.st_size;
+  if ((len == 0) || (len > stbuf.st_size - ofs))
+    len = stbuf.st_size - ofs;
+  if (lseek(f, ofs, SEEK_SET) < 0)
+    return 0;
+  n = len;
+  i = 0;
+  while (n > 0) {
+    if (n > sizeof(buffer))
+      i = read(f, buffer, sizeof(buffer));
+    else
+      i = read(f, buffer, n);
+    if (i < 0)
+      break;
+    MD5Update(&ctx, buffer, i);
+    n -= i;
+  }
+  e = errno;
+  close(f);
+  errno = e;
+  if (i < 0)
+    return 0;
+  return (MD5End(&ctx, buf));
 }
 
-char *
-MD5Data (const void *data, unsigned int len, char *buf)
+char *MD5Data(const void *data, unsigned int len, char *buf)
 {
-	MD5_CTX ctx;
+  MD5_CTX ctx;
 
-	MD5Init(&ctx);
-	MD5Update(&ctx,data,len);
-	return (MD5End(&ctx, buf));
+  MD5Init(&ctx);
+  MD5Update(&ctx, data, len);
+  return (MD5End(&ctx, buf));
 }
diff --git a/rdbmp.c b/rdbmp.c
index eaa7086..fcabbb1 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -6,7 +6,7 @@
  * Modified 2009-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Modified 2011 by Siarhei Siamashka.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -26,6 +26,7 @@
  * This code contributed by James Arthur Boucher.
  */
 
+#include "cmyk.h"
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef BMP_SUPPORTED
@@ -35,19 +36,24 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else /* !HAVE_UNSIGNED_CHAR */
 #ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x) & 0xFF)
+#define UCH(x)  ((int)(x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
+#define ReadOK(file, buffer, len) \
+  (JFREAD(file, buffer, len) == ((size_t)(len)))
+
+static const int alpha_index[JPEG_NUMCS] = {  /* alpha offset, -1 = none */
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};  /* indexed by cinfo->in_color_space; read-only lookup table */
 
 
 /* Private version of data source object */
@@ -66,11 +72,22 @@
   JDIMENSION row_width;         /* Physical width of scanlines in file */
 
   int bits_per_pixel;           /* remembers 8- or 24-bit format */
+
+  boolean use_inversion_array;  /* TRUE = preload the whole image, which is
+                                   stored in bottom-up order, and feed it to
+                                   the calling program in top-down order
+
+                                   FALSE = the calling program will maintain
+                                   its own image buffer and read the rows in
+                                   bottom-up order */
+
+  U_CHAR *iobuffer;             /* I/O buffer (used to buffer a single row from
+                                   disk if use_inversion_array == FALSE) */
 } bmp_source_struct;
 
 
 LOCAL(int)
-read_byte (bmp_source_ptr sinfo)
+read_byte(bmp_source_ptr sinfo)
 /* Read next byte from BMP file */
 {
   register FILE *infile = sinfo->pub.input_file;
@@ -83,33 +100,45 @@
 
 
 LOCAL(void)
-read_colormap (bmp_source_ptr sinfo, int cmaplen, int mapentrysize)
+read_colormap(bmp_source_ptr sinfo, int cmaplen, int mapentrysize)
 /* Read the colormap from a BMP file */
 {
-  int i;
+  int i, gray = 1;              /* stays 1 only if every entry has R==G==B */
 
   switch (mapentrysize) {
   case 3:
     /* BGR format (occurs in OS/2 files) */
     for (i = 0; i < cmaplen; i++) {
-      sinfo->colormap[2][i] = (JSAMPLE) read_byte(sinfo);
-      sinfo->colormap[1][i] = (JSAMPLE) read_byte(sinfo);
-      sinfo->colormap[0][i] = (JSAMPLE) read_byte(sinfo);
+      sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);  /* blue */
+      sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);  /* green */
+      sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);  /* red */
+      if (sinfo->colormap[2][i] != sinfo->colormap[1][i] ||
+          sinfo->colormap[1][i] != sinfo->colormap[0][i])
+        gray = 0;               /* this entry is not a shade of gray */
     }
     break;
   case 4:
     /* BGR0 format (occurs in MS Windows files) */
     for (i = 0; i < cmaplen; i++) {
-      sinfo->colormap[2][i] = (JSAMPLE) read_byte(sinfo);
-      sinfo->colormap[1][i] = (JSAMPLE) read_byte(sinfo);
-      sinfo->colormap[0][i] = (JSAMPLE) read_byte(sinfo);
-      (void) read_byte(sinfo);
+      sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);  /* blue */
+      sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);  /* green */
+      sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);  /* red */
+      (void)read_byte(sinfo);   /* fourth byte of the entry is discarded */
+      if (sinfo->colormap[2][i] != sinfo->colormap[1][i] ||
+          sinfo->colormap[1][i] != sinfo->colormap[0][i])
+        gray = 0;               /* this entry is not a shade of gray */
     }
     break;
   default:
     ERREXIT(sinfo->cinfo, JERR_BMP_BADCMAP);
     break;
   }
+
+  if (sinfo->cinfo->in_color_space == JCS_UNKNOWN && gray)
+    sinfo->cinfo->in_color_space = JCS_GRAYSCALE;  /* all-gray palette */
+
+  if (sinfo->cinfo->in_color_space == JCS_GRAYSCALE && !gray)
+    ERREXIT(sinfo->cinfo, JERR_BAD_IN_COLORSPACE); /* palette has color */
 }
 
 
@@ -121,30 +150,68 @@
  */
 
 METHODDEF(JDIMENSION)
-get_8bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 8-bit colormap indexes */
 {
-  bmp_source_ptr source = (bmp_source_ptr) sinfo;
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
   register JSAMPARRAY colormap = source->colormap;
   JSAMPARRAY image_ptr;
   register int t;
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
 
-  /* Fetch next row from virtual array */
-  source->source_row--;
-  image_ptr = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->whole_image,
-     source->source_row, (JDIMENSION) 1, FALSE);
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
 
   /* Expand the colormap indexes to real data */
-  inptr = image_ptr[0];
   outptr = source->pub.buffer[0];
-  for (col = cinfo->image_width; col > 0; col--) {
-    t = GETJSAMPLE(*inptr++);
-    *outptr++ = colormap[0][t]; /* can omit GETJSAMPLE() safely */
-    *outptr++ = colormap[1][t];
-    *outptr++ = colormap[2][t];
+  if (cinfo->in_color_space == JCS_GRAYSCALE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      t = GETJSAMPLE(*inptr++);
+      *outptr++ = colormap[0][t];
+    }
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      t = GETJSAMPLE(*inptr++);
+      rgb_to_cmyk(colormap[0][t], colormap[1][t], colormap[2][t], outptr,
+                  outptr + 1, outptr + 2, outptr + 3);
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        t = GETJSAMPLE(*inptr++);
+        outptr[rindex] = colormap[0][t];
+        outptr[gindex] = colormap[1][t];
+        outptr[bindex] = colormap[2][t];
+        outptr[aindex] = 0xFF;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        t = GETJSAMPLE(*inptr++);
+        outptr[rindex] = colormap[0][t];
+        outptr[gindex] = colormap[1][t];
+        outptr[bindex] = colormap[2][t];
+        outptr += ps;
+      }
+    }
   }
 
   return 1;
@@ -152,30 +219,63 @@
 
 
 METHODDEF(JDIMENSION)
-get_24bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 24-bit pixels */
 {
-  bmp_source_ptr source = (bmp_source_ptr) sinfo;
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
   JSAMPARRAY image_ptr;
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
 
-  /* Fetch next row from virtual array */
-  source->source_row--;
-  image_ptr = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->whole_image,
-     source->source_row, (JDIMENSION) 1, FALSE);
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
 
   /* Transfer data.  Note source values are in BGR order
    * (even though Microsoft's own documents say the opposite).
    */
-  inptr = image_ptr[0];
   outptr = source->pub.buffer[0];
-  for (col = cinfo->image_width; col > 0; col--) {
-    outptr[2] = *inptr++;       /* can omit GETJSAMPLE() safely */
-    outptr[1] = *inptr++;
-    outptr[0] = *inptr++;
-    outptr += 3;
+  if (cinfo->in_color_space == JCS_EXT_BGR) {
+    MEMCOPY(outptr, inptr, source->row_width);
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      /* can omit GETJSAMPLE() safely */
+      JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
+      rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr[aindex] = 0xFF;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr += ps;
+      }
+    }
   }
 
   return 1;
@@ -183,30 +283,66 @@
 
 
 METHODDEF(JDIMENSION)
-get_32bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_32bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 32-bit pixels */
 {
-  bmp_source_ptr source = (bmp_source_ptr) sinfo;
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
   JSAMPARRAY image_ptr;
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
 
-  /* Fetch next row from virtual array */
-  source->source_row--;
-  image_ptr = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->whole_image,
-     source->source_row, (JDIMENSION) 1, FALSE);
+  if (source->use_inversion_array) {
+    /* Fetch next row from virtual array */
+    source->source_row--;
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->whole_image,
+       source->source_row, (JDIMENSION)1, FALSE);
+    inptr = image_ptr[0];
+  } else {
+    if (!ReadOK(source->pub.input_file, source->iobuffer, source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    inptr = source->iobuffer;
+  }
+
   /* Transfer data.  Note source values are in BGR order
    * (even though Microsoft's own documents say the opposite).
    */
-  inptr = image_ptr[0];
   outptr = source->pub.buffer[0];
-  for (col = cinfo->image_width; col > 0; col--) {
-    outptr[2] = *inptr++;       /* can omit GETJSAMPLE() safely */
-    outptr[1] = *inptr++;
-    outptr[0] = *inptr++;
-    inptr++;                    /* skip the 4th byte (Alpha channel) */
-    outptr += 3;
+  if (cinfo->in_color_space == JCS_EXT_BGRX ||
+      cinfo->in_color_space == JCS_EXT_BGRA) {
+    MEMCOPY(outptr, inptr, source->row_width);
+  } else if (cinfo->in_color_space == JCS_CMYK) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      /* can omit GETJSAMPLE() safely */
+      JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
+      rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
+      inptr++;                          /* skip the 4th byte (Alpha channel) */
+      outptr += 4;
+    }
+  } else {
+    register int rindex = rgb_red[cinfo->in_color_space];
+    register int gindex = rgb_green[cinfo->in_color_space];
+    register int bindex = rgb_blue[cinfo->in_color_space];
+    register int aindex = alpha_index[cinfo->in_color_space];
+    register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+    if (aindex >= 0) {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        outptr[aindex] = *inptr++;
+        outptr += ps;
+      }
+    } else {
+      for (col = cinfo->image_width; col > 0; col--) {
+        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[gindex] = *inptr++;
+        outptr[rindex] = *inptr++;
+        inptr++;                        /* skip the 4th byte (Alpha channel) */
+        outptr += ps;
+      }
+    }
   }
 
   return 1;
@@ -220,25 +356,24 @@
  */
 
 METHODDEF(JDIMENSION)
-preload_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+preload_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  bmp_source_ptr source = (bmp_source_ptr) sinfo;
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
   register FILE *infile = source->pub.input_file;
   register JSAMPROW out_ptr;
   JSAMPARRAY image_ptr;
   JDIMENSION row;
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 
   /* Read the data into a virtual array in input-file row order. */
   for (row = 0; row < cinfo->image_height; row++) {
     if (progress != NULL) {
-      progress->pub.pass_counter = (long) row;
-      progress->pub.pass_limit = (long) cinfo->image_height;
-      (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
     }
     image_ptr = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr) cinfo, source->whole_image,
-       row, (JDIMENSION) 1, TRUE);
+      ((j_common_ptr)cinfo, source->whole_image, row, (JDIMENSION)1, TRUE);
     out_ptr = image_ptr[0];
     if (fread(out_ptr, 1, source->row_width, infile) != source->row_width) {
       if (feof(infile))
@@ -276,55 +411,59 @@
  */
 
 METHODDEF(void)
-start_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  bmp_source_ptr source = (bmp_source_ptr) sinfo;
+  bmp_source_ptr source = (bmp_source_ptr)sinfo;
   U_CHAR bmpfileheader[14];
   U_CHAR bmpinfoheader[64];
-#define GET_2B(array,offset)  ((unsigned short) UCH(array[offset]) + \
-                               (((unsigned short) UCH(array[offset+1])) << 8))
-#define GET_4B(array,offset)  ((unsigned int) UCH(array[offset]) + \
-                               (((unsigned int) UCH(array[offset+1])) << 8) + \
-                               (((unsigned int) UCH(array[offset+2])) << 16) + \
-                               (((unsigned int) UCH(array[offset+3])) << 24))
+
+#define GET_2B(array, offset) \
+  ((unsigned short)UCH(array[offset]) + \
+   (((unsigned short)UCH(array[offset + 1])) << 8))
+#define GET_4B(array, offset) \
+  ((unsigned int)UCH(array[offset]) + \
+   (((unsigned int)UCH(array[offset + 1])) << 8) + \
+   (((unsigned int)UCH(array[offset + 2])) << 16) + \
+   (((unsigned int)UCH(array[offset + 3])) << 24))
+
   unsigned int bfOffBits;
   unsigned int headerSize;
   int biWidth;
   int biHeight;
   unsigned short biPlanes;
   unsigned int biCompression;
-  int biXPelsPerMeter,biYPelsPerMeter;
+  int biXPelsPerMeter, biYPelsPerMeter;
   unsigned int biClrUsed = 0;
   int mapentrysize = 0;         /* 0 indicates no colormap */
   int bPad;
-  JDIMENSION row_width;
+  JDIMENSION row_width = 0;
 
   /* Read and verify the bitmap file header */
-  if (! ReadOK(source->pub.input_file, bmpfileheader, 14))
+  if (!ReadOK(source->pub.input_file, bmpfileheader, 14))
     ERREXIT(cinfo, JERR_INPUT_EOF);
-  if (GET_2B(bmpfileheader,0) != 0x4D42) /* 'BM' */
+  if (GET_2B(bmpfileheader, 0) != 0x4D42) /* 'BM' */
     ERREXIT(cinfo, JERR_BMP_NOT);
-  bfOffBits = GET_4B(bmpfileheader,10);
+  bfOffBits = GET_4B(bmpfileheader, 10);
   /* We ignore the remaining fileheader fields */
 
   /* The infoheader might be 12 bytes (OS/2 1.x), 40 bytes (Windows),
    * or 64 bytes (OS/2 2.x).  Check the first 4 bytes to find out which.
    */
-  if (! ReadOK(source->pub.input_file, bmpinfoheader, 4))
+  if (!ReadOK(source->pub.input_file, bmpinfoheader, 4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
-  headerSize = GET_4B(bmpinfoheader,0);
+  headerSize = GET_4B(bmpinfoheader, 0);
   if (headerSize < 12 || headerSize > 64)
     ERREXIT(cinfo, JERR_BMP_BADHEADER);
-  if (! ReadOK(source->pub.input_file, bmpinfoheader+4, headerSize-4))
+  if (!ReadOK(source->pub.input_file, bmpinfoheader + 4, headerSize - 4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
 
   switch (headerSize) {
   case 12:
     /* Decode OS/2 1.x header (Microsoft calls this a BITMAPCOREHEADER) */
-    biWidth = (int) GET_2B(bmpinfoheader,4);
-    biHeight = (int) GET_2B(bmpinfoheader,6);
-    biPlanes = GET_2B(bmpinfoheader,8);
-    source->bits_per_pixel = (int) GET_2B(bmpinfoheader,10);
+    biWidth = (int)GET_2B(bmpinfoheader, 4);
+    biHeight = (int)GET_2B(bmpinfoheader, 6);
+    biPlanes = GET_2B(bmpinfoheader, 8);
+    source->bits_per_pixel = (int)GET_2B(bmpinfoheader, 10);
 
     switch (source->bits_per_pixel) {
     case 8:                     /* colormapped image */
@@ -343,14 +482,14 @@
   case 64:
     /* Decode Windows 3.x header (Microsoft calls this a BITMAPINFOHEADER) */
     /* or OS/2 2.x header, which has additional fields that we ignore */
-    biWidth = (int) GET_4B(bmpinfoheader,4);
-    biHeight = (int) GET_4B(bmpinfoheader,8);
-    biPlanes = GET_2B(bmpinfoheader,12);
-    source->bits_per_pixel = (int) GET_2B(bmpinfoheader,14);
-    biCompression = GET_4B(bmpinfoheader,16);
-    biXPelsPerMeter = (int) GET_4B(bmpinfoheader,24);
-    biYPelsPerMeter = (int) GET_4B(bmpinfoheader,28);
-    biClrUsed = GET_4B(bmpinfoheader,32);
+    biWidth = (int)GET_4B(bmpinfoheader, 4);
+    biHeight = (int)GET_4B(bmpinfoheader, 8);
+    biPlanes = GET_2B(bmpinfoheader, 12);
+    source->bits_per_pixel = (int)GET_2B(bmpinfoheader, 14);
+    biCompression = GET_4B(bmpinfoheader, 16);
+    biXPelsPerMeter = (int)GET_4B(bmpinfoheader, 24);
+    biYPelsPerMeter = (int)GET_4B(bmpinfoheader, 28);
+    biClrUsed = GET_4B(bmpinfoheader, 32);
     /* biSizeImage, biClrImportant fields are ignored */
 
     switch (source->bits_per_pixel) {
@@ -373,8 +512,8 @@
 
     if (biXPelsPerMeter > 0 && biYPelsPerMeter > 0) {
       /* Set JFIF density parameters from the BMP data */
-      cinfo->X_density = (UINT16) (biXPelsPerMeter/100); /* 100 cm per meter */
-      cinfo->Y_density = (UINT16) (biYPelsPerMeter/100);
+      cinfo->X_density = (UINT16)(biXPelsPerMeter / 100); /* 100 cm per meter */
+      cinfo->Y_density = (UINT16)(biYPelsPerMeter / 100);
       cinfo->density_unit = 2;  /* dots/cm */
     }
     break;
@@ -399,10 +538,9 @@
       ERREXIT(cinfo, JERR_BMP_BADCMAP);
     /* Allocate space to store the colormap */
     source->colormap = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) biClrUsed, (JDIMENSION) 3);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)biClrUsed, (JDIMENSION)3);
     /* and read it from the file */
-    read_colormap(source, (int) biClrUsed, mapentrysize);
+    read_colormap(source, (int)biClrUsed, mapentrysize);
     /* account for size of colormap */
     bPad -= biClrUsed * mapentrysize;
   }
@@ -411,40 +549,89 @@
   if (bPad < 0)                 /* incorrect bfOffBits value? */
     ERREXIT(cinfo, JERR_BMP_BADHEADER);
   while (--bPad >= 0) {
-    (void) read_byte(source);
+    (void)read_byte(source);
   }
 
   /* Compute row width in file, including padding to 4-byte boundary */
-  if (source->bits_per_pixel == 24)
-    row_width = (JDIMENSION) (biWidth * 3);
-  else if (source->bits_per_pixel == 32)
-    row_width = (JDIMENSION) (biWidth * 4);
-  else
-    row_width = (JDIMENSION) biWidth;
+  switch (source->bits_per_pixel) {
+  case 8:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_GRAYSCALE)
+      cinfo->input_components = 1;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    row_width = (JDIMENSION)biWidth;
+    break;
+  case 24:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_BGR;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    row_width = (JDIMENSION)(biWidth * 3);
+    break;
+  case 32:
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_BGRA;
+    if (IsExtRGB(cinfo->in_color_space))
+      cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+    else if (cinfo->in_color_space == JCS_CMYK)
+      cinfo->input_components = 4;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
+    row_width = (JDIMENSION)(biWidth * 4);
+    break;
+  default:
+    ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+  }
   while ((row_width & 3) != 0) row_width++;
   source->row_width = row_width;
 
-  /* Allocate space for inversion array, prepare for preload pass */
-  source->whole_image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-     row_width, (JDIMENSION) biHeight, (JDIMENSION) 1);
-  source->pub.get_pixel_rows = preload_image;
-  if (cinfo->progress != NULL) {
-    cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
-    progress->total_extra_passes++; /* count file input as separate pass */
+  if (source->use_inversion_array) {
+    /* Allocate space for inversion array, prepare for preload pass */
+    source->whole_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       row_width, (JDIMENSION)biHeight, (JDIMENSION)1);
+    source->pub.get_pixel_rows = preload_image;
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+  } else {
+    source->iobuffer = (U_CHAR *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, row_width);
+    switch (source->bits_per_pixel) {
+    case 8:
+      source->pub.get_pixel_rows = get_8bit_row;
+      break;
+    case 24:
+      source->pub.get_pixel_rows = get_24bit_row;
+      break;
+    case 32:
+      source->pub.get_pixel_rows = get_32bit_row;
+      break;
+    default:
+      ERREXIT(cinfo, JERR_BMP_BADDEPTH);
+    }
   }
 
   /* Allocate one-row buffer for returned data */
   source->pub.buffer = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) (biWidth * 3), (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(biWidth * cinfo->input_components), (JDIMENSION)1);
   source->pub.buffer_height = 1;
 
-  cinfo->in_color_space = JCS_RGB;
-  cinfo->input_components = 3;
   cinfo->data_precision = 8;
-  cinfo->image_width = (JDIMENSION) biWidth;
-  cinfo->image_height = (JDIMENSION) biHeight;
+  cinfo->image_width = (JDIMENSION)biWidth;
+  cinfo->image_height = (JDIMENSION)biHeight;
 }
 
 
@@ -453,7 +640,7 @@
  */
 
 METHODDEF(void)
-finish_input_bmp (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+finish_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
   /* no work */
 }
@@ -464,20 +651,22 @@
  */
 
 GLOBAL(cjpeg_source_ptr)
-jinit_read_bmp (j_compress_ptr cinfo)
+jinit_read_bmp(j_compress_ptr cinfo, boolean use_inversion_array)
 {
   bmp_source_ptr source;
 
   /* Create module interface object */
   source = (bmp_source_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(bmp_source_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(bmp_source_struct));
   source->cinfo = cinfo;        /* make back link for subroutines */
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_bmp;
   source->pub.finish_input = finish_input_bmp;
 
-  return (cjpeg_source_ptr) source;
+  source->use_inversion_array = use_inversion_array; /* read by start_input */
+
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* BMP_SUPPORTED */
diff --git a/rdcolmap.c b/rdcolmap.c
index ed8ca3b..cbbef59 100644
--- a/rdcolmap.c
+++ b/rdcolmap.c
@@ -44,7 +44,7 @@
  */
 
 LOCAL(void)
-add_map_entry (j_decompress_ptr cinfo, int R, int G, int B)
+add_map_entry(j_decompress_ptr cinfo, int R, int G, int B)
 {
   JSAMPROW colormap0 = cinfo->colormap[0];
   JSAMPROW colormap1 = cinfo->colormap[1];
@@ -61,13 +61,13 @@
   }
 
   /* Check for map overflow. */
-  if (ncolors >= (MAXJSAMPLE+1))
-    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, (MAXJSAMPLE+1));
+  if (ncolors >= (MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, (MAXJSAMPLE + 1));
 
   /* OK, add color to map. */
-  colormap0[ncolors] = (JSAMPLE) R;
-  colormap1[ncolors] = (JSAMPLE) G;
-  colormap2[ncolors] = (JSAMPLE) B;
+  colormap0[ncolors] = (JSAMPLE)R;
+  colormap1[ncolors] = (JSAMPLE)G;
+  colormap2[ncolors] = (JSAMPLE)B;
   cinfo->actual_number_of_colors++;
 }
 
@@ -77,7 +77,7 @@
  */
 
 LOCAL(void)
-read_gif_map (j_decompress_ptr cinfo, FILE *infile)
+read_gif_map(j_decompress_ptr cinfo, FILE *infile)
 {
   int header[13];
   int i, colormaplen;
@@ -108,9 +108,9 @@
     if (R == EOF || G == EOF || B == EOF)
       ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
     add_map_entry(cinfo,
-                  R << (BITS_IN_JSAMPLE-8),
-                  G << (BITS_IN_JSAMPLE-8),
-                  B << (BITS_IN_JSAMPLE-8));
+                  R << (BITS_IN_JSAMPLE - 8),
+                  G << (BITS_IN_JSAMPLE - 8),
+                  B << (BITS_IN_JSAMPLE - 8));
   }
 }
 
@@ -119,7 +119,7 @@
 
 
 LOCAL(int)
-pbm_getc (FILE *infile)
+pbm_getc(FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -136,7 +136,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_decompress_ptr cinfo, FILE *infile)
+read_pbm_integer(j_decompress_ptr cinfo, FILE *infile)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -169,7 +169,7 @@
  */
 
 LOCAL(void)
-read_ppm_map (j_decompress_ptr cinfo, FILE *infile)
+read_ppm_map(j_decompress_ptr cinfo, FILE *infile)
 {
   int c;
   unsigned int w, h, maxval, row, col;
@@ -187,7 +187,7 @@
     ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
 
   /* For now, we don't support rescaling from an unusual maxval. */
-  if (maxval != (unsigned int) MAXJSAMPLE)
+  if (maxval != (unsigned int)MAXJSAMPLE)
     ERREXIT(cinfo, JERR_BAD_CMAP_FILE);
 
   switch (c) {
@@ -229,12 +229,12 @@
  */
 
 GLOBAL(void)
-read_color_map (j_decompress_ptr cinfo, FILE *infile)
+read_color_map(j_decompress_ptr cinfo, FILE *infile)
 {
   /* Allocate space for a color map of maximum supported size. */
   cinfo->colormap = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) (MAXJSAMPLE+1), (JDIMENSION) 3);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(MAXJSAMPLE + 1), (JDIMENSION)3);
   cinfo->actual_number_of_colors = 0; /* initialize map to empty */
 
   /* Read first byte to determine file format */
diff --git a/rdgif.c b/rdgif.c
index ce689f7..ff9258d 100644
--- a/rdgif.c
+++ b/rdgif.c
@@ -29,7 +29,7 @@
  */
 
 GLOBAL(cjpeg_source_ptr)
-jinit_read_gif (j_compress_ptr cinfo)
+jinit_read_gif(j_compress_ptr cinfo)
 {
   fprintf(stderr, "GIF input is unsupported for legal reasons.  Sorry.\n");
   exit(EXIT_FAILURE);
diff --git a/rdjpgcom.c b/rdjpgcom.c
index b3076dd..c435b90 100644
--- a/rdjpgcom.c
+++ b/rdjpgcom.c
@@ -69,7 +69,7 @@
 
 /* Read one byte, testing for EOF */
 static int
-read_1_byte (void)
+read_1_byte(void)
 {
   int c;
 
@@ -82,7 +82,7 @@
 /* Read 2 bytes, convert to unsigned int */
 /* All 2-byte quantities in JPEG markers are MSB first */
 static unsigned int
-read_2_bytes (void)
+read_2_bytes(void)
 {
   int c1, c2;
 
@@ -92,7 +92,7 @@
   c2 = NEXTBYTE();
   if (c2 == EOF)
     ERREXIT("Premature EOF in JPEG file");
-  return (((unsigned int) c1) << 8) + ((unsigned int) c2);
+  return (((unsigned int)c1) << 8) + ((unsigned int)c2);
 }
 
 
@@ -134,7 +134,7 @@
  */
 
 static int
-next_marker (void)
+next_marker(void)
 {
   int c;
   int discarded_bytes = 0;
@@ -169,7 +169,7 @@
  */
 
 static int
-first_marker (void)
+first_marker(void)
 {
   int c1, c2;
 
@@ -191,7 +191,7 @@
  */
 
 static void
-skip_variable (void)
+skip_variable(void)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
   unsigned int length;
@@ -204,7 +204,7 @@
   length -= 2;
   /* Skip over the remaining bytes */
   while (length > 0) {
-    (void) read_1_byte();
+    (void)read_1_byte();
     length--;
   }
 }
@@ -217,7 +217,7 @@
  */
 
 static void
-process_COM (int raw)
+process_COM(int raw)
 {
   unsigned int length;
   int ch;
@@ -274,7 +274,7 @@
  */
 
 static void
-process_SOFn (int marker)
+process_SOFn(int marker)
 {
   unsigned int length;
   unsigned int image_height, image_width;
@@ -310,13 +310,13 @@
          image_width, image_height, num_components, data_precision);
   printf("JPEG process: %s\n", process);
 
-  if (length != (unsigned int) (8 + num_components * 3))
+  if (length != (unsigned int)(8 + num_components * 3))
     ERREXIT("Bogus SOF marker length");
 
   for (ci = 0; ci < num_components; ci++) {
-    (void) read_1_byte();       /* Component ID code */
-    (void) read_1_byte();       /* H, V sampling factors */
-    (void) read_1_byte();       /* Quantization table number */
+    (void)read_1_byte();        /* Component ID code */
+    (void)read_1_byte();        /* H, V sampling factors */
+    (void)read_1_byte();        /* Quantization table number */
   }
 }
 
@@ -332,7 +332,7 @@
  */
 
 static int
-scan_JPEG_header (int verbose, int raw)
+scan_JPEG_header(int verbose, int raw)
 {
   int marker;
 
@@ -401,7 +401,7 @@
 
 
 static void
-usage (void)
+usage(void)
 /* complain about bad command line */
 {
   fprintf(stderr, "rdjpgcom displays any textual comments in a JPEG file.\n");
@@ -417,7 +417,7 @@
 
 
 static int
-keymatch (char *arg, const char *keyword, int minchars)
+keymatch(char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -446,7 +446,7 @@
  */
 
 int
-main (int argc, char **argv)
+main(int argc, char **argv)
 {
   int argn;
   char *arg;
@@ -477,7 +477,7 @@
 
   /* Open the input file. */
   /* Unix style: expect zero or one file name */
-  if (argn < argc-1) {
+  if (argn < argc - 1) {
     fprintf(stderr, "%s: only one input file\n", progname);
     usage();
   }
@@ -502,7 +502,7 @@
   }
 
   /* Scan the JPEG headers. */
-  (void) scan_JPEG_header(verbose, raw);
+  (void)scan_JPEG_header(verbose, raw);
 
   /* All done. */
   exit(EXIT_SUCCESS);
diff --git a/rdppm.c b/rdppm.c
index 33ff749..f3bb79e 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, 2016, D. R. Commander.
+ * Copyright (C) 2015-2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -22,6 +22,7 @@
  * the file is indeed PPM format).
  */
 
+#include "cmyk.h"
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef PPM_SUPPORTED
@@ -44,19 +45,24 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else /* !HAVE_UNSIGNED_CHAR */
 #ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x) & 0xFF)
+#define UCH(x)  ((int)(x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
+#define ReadOK(file, buffer, len) \
+  (JFREAD(file, buffer, len) == ((size_t)(len)))
+
+static int alpha_index[JPEG_NUMCS] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
 
 
 /* Private version of data source object */
@@ -76,7 +82,7 @@
 
 
 LOCAL(int)
-pbm_getc (FILE *infile)
+pbm_getc(FILE *infile)
 /* Read next char, skipping over any comments */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -93,7 +99,7 @@
 
 
 LOCAL(unsigned int)
-read_pbm_integer (j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
+read_pbm_integer(j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
 /* Read an unsigned decimal integer from the PPM file */
 /* Swallows one trailing character after the integer */
 /* Note that on a 16-bit-int machine, only values up to 64k can be read. */
@@ -137,10 +143,10 @@
 
 
 METHODDEF(JDIMENSION)
-get_text_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_text_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading text-format PGM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
@@ -155,11 +161,129 @@
 }
 
 
+#define GRAY_RGB_READ_LOOP(read_op, alpha_set_op) { \
+  for (col = cinfo->image_width; col > 0; col--) { \
+    ptr[rindex] = ptr[gindex] = ptr[bindex] = read_op; \
+    alpha_set_op \
+    ptr += ps; \
+  } \
+}
+
 METHODDEF(JDIMENSION)
-get_text_rgb_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_text_gray_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PGM files with any maxval and
+   converting to extended RGB */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register JSAMPROW ptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  ptr = source->pub.buffer[0];
+  if (maxval == MAXJSAMPLE) {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(read_pbm_integer(cinfo, infile, maxval),
+                         ptr[aindex] = 0xFF;)
+    else
+      GRAY_RGB_READ_LOOP(read_pbm_integer(cinfo, infile, maxval),)
+  } else {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],
+                         ptr[aindex] = 0xFF;)
+    else
+      GRAY_RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],)
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_text_gray_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PGM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register JSAMPROW ptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  ptr = source->pub.buffer[0];
+  if (maxval == MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE gray = read_pbm_integer(cinfo, infile, maxval);
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE gray = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+#define RGB_READ_LOOP(read_op, alpha_set_op) { \
+  for (col = cinfo->image_width; col > 0; col--) { \
+    ptr[rindex] = read_op; \
+    ptr[gindex] = read_op; \
+    ptr[bindex] = read_op; \
+    alpha_set_op \
+    ptr += ps; \
+  } \
+}
+
+METHODDEF(JDIMENSION)
+get_text_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading text-format PPM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  FILE *infile = source->pub.input_file;
+  register JSAMPROW ptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  ptr = source->pub.buffer[0];
+  if (maxval == MAXJSAMPLE) {
+    if (aindex >= 0)
+      RGB_READ_LOOP(read_pbm_integer(cinfo, infile, maxval),
+                    ptr[aindex] = 0xFF;)
+    else
+      RGB_READ_LOOP(read_pbm_integer(cinfo, infile, maxval),)
+  } else {
+    if (aindex >= 0)
+      RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],
+                    ptr[aindex] = 0xFF;)
+    else
+      RGB_READ_LOOP(rescale[read_pbm_integer(cinfo, infile, maxval)],)
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_text_rgb_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading text-format PPM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   FILE *infile = source->pub.input_file;
   register JSAMPROW ptr;
   register JSAMPLE *rescale = source->rescale;
@@ -167,26 +291,38 @@
   unsigned int maxval = source->maxval;
 
   ptr = source->pub.buffer[0];
-  for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
-    *ptr++ = rescale[read_pbm_integer(cinfo, infile, maxval)];
+  if (maxval == MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE r = read_pbm_integer(cinfo, infile, maxval);
+      JSAMPLE g = read_pbm_integer(cinfo, infile, maxval);
+      JSAMPLE b = read_pbm_integer(cinfo, infile, maxval);
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE r = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      JSAMPLE g = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      JSAMPLE b = rescale[read_pbm_integer(cinfo, infile, maxval)];
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
   }
   return 1;
 }
 
 
 METHODDEF(JDIMENSION)
-get_scaled_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_scaled_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading raw-byte-format PGM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   register JSAMPROW ptr;
   register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
 
-  if (! ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   ptr = source->pub.buffer[0];
   bufferptr = source->iobuffer;
@@ -198,55 +334,173 @@
 
 
 METHODDEF(JDIMENSION)
-get_scaled_rgb_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_gray_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PGM files with any maxval
+   and converting to extended RGB */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub.buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == MAXJSAMPLE) {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(*bufferptr++, ptr[aindex] = 0xFF;)
+    else
+      GRAY_RGB_READ_LOOP(*bufferptr++,)
+  } else {
+    if (aindex >= 0)
+      GRAY_RGB_READ_LOOP(rescale[UCH(*bufferptr++)], ptr[aindex] = 0xFF;)
+    else
+      GRAY_RGB_READ_LOOP(rescale[UCH(*bufferptr++)],)
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_gray_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PGM files with any maxval
+   and converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub.buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE gray = *bufferptr++;
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE gray = rescale[UCH(*bufferptr++)];
+      rgb_to_cmyk(gray, gray, gray, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading raw-byte-format PPM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   register JSAMPROW ptr;
   register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
+  unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
 
-  if (! ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   ptr = source->pub.buffer[0];
   bufferptr = source->iobuffer;
-  for (col = cinfo->image_width; col > 0; col--) {
-    *ptr++ = rescale[UCH(*bufferptr++)];
-    *ptr++ = rescale[UCH(*bufferptr++)];
-    *ptr++ = rescale[UCH(*bufferptr++)];
+  if (maxval == MAXJSAMPLE) {
+    if (aindex >= 0)
+      RGB_READ_LOOP(*bufferptr++, ptr[aindex] = 0xFF;)
+    else
+      RGB_READ_LOOP(*bufferptr++,)
+  } else {
+    if (aindex >= 0)
+      RGB_READ_LOOP(rescale[UCH(*bufferptr++)], ptr[aindex] = 0xFF;)
+    else
+      RGB_READ_LOOP(rescale[UCH(*bufferptr++)],)
   }
   return 1;
 }
 
 
 METHODDEF(JDIMENSION)
-get_raw_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_rgb_cmyk_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+/* This version is for reading raw-byte-format PPM files with any maxval and
+   converting to CMYK */
+{
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
+  register JSAMPROW ptr;
+  register U_CHAR *bufferptr;
+  register JSAMPLE *rescale = source->rescale;
+  JDIMENSION col;
+  unsigned int maxval = source->maxval;
+
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  ptr = source->pub.buffer[0];
+  bufferptr = source->iobuffer;
+  if (maxval == MAXJSAMPLE) {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE r = *bufferptr++;
+      JSAMPLE g = *bufferptr++;
+      JSAMPLE b = *bufferptr++;
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  } else {
+    for (col = cinfo->image_width; col > 0; col--) {
+      JSAMPLE r = rescale[UCH(*bufferptr++)];
+      JSAMPLE g = rescale[UCH(*bufferptr++)];
+      JSAMPLE b = rescale[UCH(*bufferptr++)];
+      rgb_to_cmyk(r, g, b, ptr, ptr + 1, ptr + 2, ptr + 3);
+      ptr += 4;
+    }
+  }
+  return 1;
+}
+
+
+METHODDEF(JDIMENSION)
+get_raw_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading raw-byte-format files with maxval = MAXJSAMPLE.
  * In this case we just read right into the JSAMPLE buffer!
  * Note that same code works for PPM and PGM files.
  */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
 
-  if (! ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   return 1;
 }
 
 
 METHODDEF(JDIMENSION)
-get_word_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_word_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading raw-word-format PGM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   register JSAMPROW ptr;
   register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
   unsigned int maxval = source->maxval;
 
-  if (! ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   ptr = source->pub.buffer[0];
   bufferptr = source->iobuffer;
@@ -263,17 +517,17 @@
 
 
 METHODDEF(JDIMENSION)
-get_word_rgb_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_word_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading raw-word-format PPM files with any maxval */
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   register JSAMPROW ptr;
   register U_CHAR *bufferptr;
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
   unsigned int maxval = source->maxval;
 
-  if (! ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
+  if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   ptr = source->pub.buffer[0];
   bufferptr = source->iobuffer;
@@ -304,9 +558,9 @@
  */
 
 METHODDEF(void)
-start_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  ppm_source_ptr source = (ppm_source_ptr) sinfo;
+  ppm_source_ptr source = (ppm_source_ptr)sinfo;
   int c;
   unsigned int w, h, maxval;
   boolean need_iobuffer, use_raw_buffer, need_rescale;
@@ -337,8 +591,8 @@
     ERREXIT(cinfo, JERR_PPM_NOT);
 
   cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
-  cinfo->image_width = (JDIMENSION) w;
-  cinfo->image_height = (JDIMENSION) h;
+  cinfo->image_width = (JDIMENSION)w;
+  cinfo->image_height = (JDIMENSION)h;
   source->maxval = maxval;
 
   /* initialize flags to most common settings */
@@ -348,58 +602,99 @@
 
   switch (c) {
   case '2':                     /* it's a text-format PGM file */
-    cinfo->input_components = 1;
-    cinfo->in_color_space = JCS_GRAYSCALE;
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_GRAYSCALE;
     TRACEMS2(cinfo, 1, JTRC_PGM_TEXT, w, h);
-    source->pub.get_pixel_rows = get_text_gray_row;
+    if (cinfo->in_color_space == JCS_GRAYSCALE)
+      source->pub.get_pixel_rows = get_text_gray_row;
+    else if (IsExtRGB(cinfo->in_color_space))
+      source->pub.get_pixel_rows = get_text_gray_rgb_row;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      source->pub.get_pixel_rows = get_text_gray_cmyk_row;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     need_iobuffer = FALSE;
     break;
 
   case '3':                     /* it's a text-format PPM file */
-    cinfo->input_components = 3;
-    cinfo->in_color_space = JCS_RGB;
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
     TRACEMS2(cinfo, 1, JTRC_PPM_TEXT, w, h);
-    source->pub.get_pixel_rows = get_text_rgb_row;
+    if (IsExtRGB(cinfo->in_color_space))
+      source->pub.get_pixel_rows = get_text_rgb_row;
+    else if (cinfo->in_color_space == JCS_CMYK)
+      source->pub.get_pixel_rows = get_text_rgb_cmyk_row;
+    else
+      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     need_iobuffer = FALSE;
     break;
 
   case '5':                     /* it's a raw-format PGM file */
-    cinfo->input_components = 1;
-    cinfo->in_color_space = JCS_GRAYSCALE;
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_GRAYSCALE;
     TRACEMS2(cinfo, 1, JTRC_PGM, w, h);
     if (maxval > 255) {
       source->pub.get_pixel_rows = get_word_gray_row;
-    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR)) {
+    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) &&
+               cinfo->in_color_space == JCS_GRAYSCALE) {
       source->pub.get_pixel_rows = get_raw_row;
       use_raw_buffer = TRUE;
       need_rescale = FALSE;
     } else {
-      source->pub.get_pixel_rows = get_scaled_gray_row;
+      if (cinfo->in_color_space == JCS_GRAYSCALE)
+        source->pub.get_pixel_rows = get_scaled_gray_row;
+      else if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_gray_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_gray_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     }
     break;
 
   case '6':                     /* it's a raw-format PPM file */
-    cinfo->input_components = 3;
-    cinfo->in_color_space = JCS_RGB;
+    if (cinfo->in_color_space == JCS_UNKNOWN)
+      cinfo->in_color_space = JCS_EXT_RGB;
     TRACEMS2(cinfo, 1, JTRC_PPM, w, h);
     if (maxval > 255) {
       source->pub.get_pixel_rows = get_word_rgb_row;
-    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR)) {
+    } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) &&
+               (cinfo->in_color_space == JCS_EXT_RGB
+#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
+                || cinfo->in_color_space == JCS_RGB
+#endif
+               )) {
       source->pub.get_pixel_rows = get_raw_row;
       use_raw_buffer = TRUE;
       need_rescale = FALSE;
     } else {
-      source->pub.get_pixel_rows = get_scaled_rgb_row;
+      if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_rgb_row;
+      else if (cinfo->in_color_space == JCS_CMYK)
+        source->pub.get_pixel_rows = get_rgb_cmyk_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     }
     break;
   }
 
+  if (IsExtRGB(cinfo->in_color_space))
+    cinfo->input_components = rgb_pixelsize[cinfo->in_color_space];
+  else if (cinfo->in_color_space == JCS_GRAYSCALE)
+    cinfo->input_components = 1;
+  else if (cinfo->in_color_space == JCS_CMYK)
+    cinfo->input_components = 4;
+
   /* Allocate space for I/O buffer: 1 or 3 bytes or words/pixel. */
   if (need_iobuffer) {
-    source->buffer_width = (size_t) w * cinfo->input_components *
-      ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
+    if (c == '6')
+      source->buffer_width = (size_t)w * 3 *
+        ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
+    else
+      source->buffer_width = (size_t)w *
+        ((maxval <= 255) ? sizeof(U_CHAR) : (2 * sizeof(U_CHAR)));
     source->iobuffer = (U_CHAR *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   source->buffer_width);
   }
 
@@ -407,14 +702,14 @@
   if (use_raw_buffer) {
     /* For unscaled raw-input case, we can just map it onto the I/O buffer. */
     /* Synthesize a JSAMPARRAY pointer structure */
-    source->pixrow = (JSAMPROW) source->iobuffer;
-    source->pub.buffer = & source->pixrow;
+    source->pixrow = (JSAMPROW)source->iobuffer;
+    source->pub.buffer = &source->pixrow;
     source->pub.buffer_height = 1;
   } else {
     /* Need to translate anyway, so make a separate sample buffer. */
     source->pub.buffer = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) w * cinfo->input_components, (JDIMENSION) 1);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)w * cinfo->input_components, (JDIMENSION)1);
     source->pub.buffer_height = 1;
   }
 
@@ -424,13 +719,13 @@
 
     /* On 16-bit-int machines we have to be careful of maxval = 65535 */
     source->rescale = (JSAMPLE *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  (size_t) (((long) maxval + 1L) *
-                                            sizeof(JSAMPLE)));
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  (size_t)(((long)maxval + 1L) *
+                                           sizeof(JSAMPLE)));
     half_maxval = maxval / 2;
-    for (val = 0; val <= (long) maxval; val++) {
+    for (val = 0; val <= (long)maxval; val++) {
       /* The multiplication here must be done in 32 bits to avoid overflow */
-      source->rescale[val] = (JSAMPLE) ((val * MAXJSAMPLE + half_maxval) /
+      source->rescale[val] = (JSAMPLE)((val * MAXJSAMPLE + half_maxval) /
                                         maxval);
     }
   }
@@ -442,7 +737,7 @@
  */
 
 METHODDEF(void)
-finish_input_ppm (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+finish_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
   /* no work */
 }
@@ -453,19 +748,19 @@
  */
 
 GLOBAL(cjpeg_source_ptr)
-jinit_read_ppm (j_compress_ptr cinfo)
+jinit_read_ppm(j_compress_ptr cinfo)
 {
   ppm_source_ptr source;
 
   /* Create module interface object */
   source = (ppm_source_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(ppm_source_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(ppm_source_struct));
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_ppm;
   source->pub.finish_input = finish_input_ppm;
 
-  return (cjpeg_source_ptr) source;
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* PPM_SUPPORTED */
diff --git a/rdrle.c b/rdrle.c
index 226c528..b694514 100644
--- a/rdrle.c
+++ b/rdrle.c
@@ -81,12 +81,12 @@
  */
 
 METHODDEF(void)
-start_input_rle (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+start_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  rle_source_ptr source = (rle_source_ptr) sinfo;
+  rle_source_ptr source = (rle_source_ptr)sinfo;
   JDIMENSION width, height;
 #ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 #endif
 
   /* Use RLE library routine to get the header info */
@@ -118,7 +118,7 @@
   width  = source->header.xmax - source->header.xmin + 1;
   height = source->header.ymax - source->header.ymin + 1;
   source->header.xmin = 0;              /* realign horizontally */
-  source->header.xmax = width-1;
+  source->header.xmax = width - 1;
 
   cinfo->image_width      = width;
   cinfo->image_height     = height;
@@ -158,16 +158,16 @@
    * (GRAYSCALE scanlines don't need converting)
    */
   if (source->visual != GRAYSCALE) {
-    source->rle_row = (rle_pixel**) (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) width, (JDIMENSION) cinfo->input_components);
+    source->rle_row = (rle_pixel **)(*cinfo->mem->alloc_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)width, (JDIMENSION)cinfo->input_components);
   }
 
   /* request a virtual array to hold the image */
   source->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION) (width * source->header.ncolors),
-     (JDIMENSION) height, (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+     (JDIMENSION)(width * source->header.ncolors),
+     (JDIMENSION)height, (JDIMENSION)1);
 
 #ifdef PROGRESS_REPORT
   if (progress != NULL) {
@@ -187,13 +187,13 @@
  */
 
 METHODDEF(JDIMENSION)
-get_rle_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_rle_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  rle_source_ptr source = (rle_source_ptr) sinfo;
+  rle_source_ptr source = (rle_source_ptr)sinfo;
 
   source->row--;
   source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->image, source->row, (JDIMENSION) 1, FALSE);
+    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
 
   return 1;
 }
@@ -205,9 +205,9 @@
  */
 
 METHODDEF(JDIMENSION)
-get_pseudocolor_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_pseudocolor_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  rle_source_ptr source = (rle_source_ptr) sinfo;
+  rle_source_ptr source = (rle_source_ptr)sinfo;
   JSAMPROW src_row, dest_row;
   JDIMENSION col;
   rle_map *colormap;
@@ -217,13 +217,13 @@
   dest_row = source->pub.buffer[0];
   source->row--;
   src_row = *(*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->image, source->row, (JDIMENSION) 1, FALSE);
+    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
 
   for (col = cinfo->image_width; col > 0; col--) {
     val = GETJSAMPLE(*src_row++);
-    *dest_row++ = (JSAMPLE) (colormap[val      ] >> 8);
-    *dest_row++ = (JSAMPLE) (colormap[val + 256] >> 8);
-    *dest_row++ = (JSAMPLE) (colormap[val + 512] >> 8);
+    *dest_row++ = (JSAMPLE)(colormap[val      ] >> 8);
+    *dest_row++ = (JSAMPLE)(colormap[val + 256] >> 8);
+    *dest_row++ = (JSAMPLE)(colormap[val + 512] >> 8);
   }
 
   return 1;
@@ -241,16 +241,16 @@
  */
 
 METHODDEF(JDIMENSION)
-load_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+load_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  rle_source_ptr source = (rle_source_ptr) sinfo;
+  rle_source_ptr source = (rle_source_ptr)sinfo;
   JDIMENSION row, col;
-  JSAMPROW  scanline, red_ptr, green_ptr, blue_ptr;
+  JSAMPROW scanline, red_ptr, green_ptr, blue_ptr;
   rle_pixel **rle_row;
   rle_map *colormap;
   char channel;
 #ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 #endif
 
   colormap = source->header.cmap;
@@ -265,7 +265,7 @@
   if (progress != NULL) {
     progress->pub.pass_limit = cinfo->image_height;
     progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
   }
 #endif
 
@@ -274,13 +274,13 @@
   case GRAYSCALE:
   case PSEUDOCOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
-      rle_row = (rle_pixel **) (*cinfo->mem->access_virt_sarray)
-         ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
+      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
+        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
       rle_getrow(&source->header, rle_row);
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
         progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
       }
 #endif
     }
@@ -290,7 +290,7 @@
   case TRUECOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
       scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
+        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
       rle_row = source->rle_row;
       rle_getrow(&source->header, rle_row);
 
@@ -304,7 +304,7 @@
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
         progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
       }
 #endif
     }
@@ -313,7 +313,7 @@
   case DIRECTCOLOR:
     for (row = 0; row < cinfo->image_height; row++) {
       scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, source->image, row, (JDIMENSION) 1, TRUE);
+        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
       rle_getrow(&source->header, rle_row);
 
       red_ptr   = rle_row[0];
@@ -329,7 +329,7 @@
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
         progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
       }
 #endif
     }
@@ -359,7 +359,7 @@
  */
 
 METHODDEF(void)
-finish_input_rle (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+finish_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
   /* no work */
 }
@@ -370,20 +370,20 @@
  */
 
 GLOBAL(cjpeg_source_ptr)
-jinit_read_rle (j_compress_ptr cinfo)
+jinit_read_rle(j_compress_ptr cinfo)
 {
   rle_source_ptr source;
 
   /* Create module interface object */
   source = (rle_source_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(rle_source_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(rle_source_struct));
   /* Fill in method ptrs */
   source->pub.start_input = start_input_rle;
   source->pub.finish_input = finish_input_rle;
   source->pub.get_pixel_rows = load_image;
 
-  return (cjpeg_source_ptr) source;
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* RLE_SUPPORTED */
diff --git a/rdswitch.c b/rdswitch.c
index 7d870c3..610346b 100644
--- a/rdswitch.c
+++ b/rdswitch.c
@@ -22,7 +22,7 @@
 
 
 LOCAL(int)
-text_getc (FILE *file)
+text_getc(FILE *file)
 /* Read next char, skipping over any comments (# to end of line) */
 /* A comment/newline sequence is returned as a newline */
 {
@@ -39,7 +39,7 @@
 
 
 LOCAL(boolean)
-read_text_integer (FILE *file, long *result, int *termchar)
+read_text_integer(FILE *file, long *result, int *termchar)
 /* Read an unsigned decimal integer from a file, store it in result */
 /* Reads one trailing character after the integer; returns it in termchar */
 {
@@ -55,14 +55,14 @@
     }
   } while (isspace(ch));
 
-  if (! isdigit(ch)) {
+  if (!isdigit(ch)) {
     *termchar = ch;
     return FALSE;
   }
 
   val = ch - '0';
   while ((ch = text_getc(file)) != EOF) {
-    if (! isdigit(ch))
+    if (!isdigit(ch))
       break;
     val *= 10;
     val += ch - '0';
@@ -74,12 +74,11 @@
 
 
 #if JPEG_LIB_VERSION < 70
-static int q_scale_factor[NUM_QUANT_TBLS] = {100, 100, 100, 100};
+static int q_scale_factor[NUM_QUANT_TBLS] = { 100, 100, 100, 100 };
 #endif
 
 GLOBAL(boolean)
-read_quant_tables (j_compress_ptr cinfo, char *filename,
-                   boolean force_baseline)
+read_quant_tables(j_compress_ptr cinfo, char *filename, boolean force_baseline)
 /* Read a set of quantization tables from the specified file.
  * The file is plain ASCII text: decimal numbers with whitespace between.
  * Comments preceded by '#' may be included in the file.
@@ -107,14 +106,14 @@
       fclose(fp);
       return FALSE;
     }
-    table[0] = (unsigned int) val;
+    table[0] = (unsigned int)val;
     for (i = 1; i < DCTSIZE2; i++) {
-      if (! read_text_integer(fp, &val, &termchar)) {
+      if (!read_text_integer(fp, &val, &termchar)) {
         fprintf(stderr, "Invalid table data in file %s\n", filename);
         fclose(fp);
         return FALSE;
       }
-      table[i] = (unsigned int) val;
+      table[i] = (unsigned int)val;
     }
 #if JPEG_LIB_VERSION >= 70
     jpeg_add_quant_table(cinfo, tblno, table, cinfo->q_scale_factor[tblno],
@@ -140,14 +139,14 @@
 #ifdef C_MULTISCAN_FILES_SUPPORTED
 
 LOCAL(boolean)
-read_scan_integer (FILE *file, long *result, int *termchar)
+read_scan_integer(FILE *file, long *result, int *termchar)
 /* Variant of read_text_integer that always looks for a non-space termchar;
  * this simplifies parsing of punctuation in scan scripts.
  */
 {
   register int ch;
 
-  if (! read_text_integer(file, result, termchar))
+  if (!read_text_integer(file, result, termchar))
     return FALSE;
   ch = *termchar;
   while (ch != EOF && isspace(ch))
@@ -169,7 +168,7 @@
 
 
 GLOBAL(boolean)
-read_scan_script (j_compress_ptr cinfo, char *filename)
+read_scan_script(j_compress_ptr cinfo, char *filename)
 /* Read a scan script from the specified text file.
  * Each entry in the file defines one scan to be emitted.
  * Entries are separated by semicolons ';'.
@@ -206,7 +205,7 @@
       fclose(fp);
       return FALSE;
     }
-    scanptr->component_index[0] = (int) val;
+    scanptr->component_index[0] = (int)val;
     ncomps = 1;
     while (termchar == ' ') {
       if (ncomps >= MAX_COMPS_IN_SCAN) {
@@ -215,29 +214,29 @@
         fclose(fp);
         return FALSE;
       }
-      if (! read_scan_integer(fp, &val, &termchar))
+      if (!read_scan_integer(fp, &val, &termchar))
         goto bogus;
-      scanptr->component_index[ncomps] = (int) val;
+      scanptr->component_index[ncomps] = (int)val;
       ncomps++;
     }
     scanptr->comps_in_scan = ncomps;
     if (termchar == ':') {
-      if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
         goto bogus;
-      scanptr->Ss = (int) val;
-      if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+      scanptr->Ss = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
         goto bogus;
-      scanptr->Se = (int) val;
-      if (! read_scan_integer(fp, &val, &termchar) || termchar != ' ')
+      scanptr->Se = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar) || termchar != ' ')
         goto bogus;
-      scanptr->Ah = (int) val;
-      if (! read_scan_integer(fp, &val, &termchar))
+      scanptr->Ah = (int)val;
+      if (!read_scan_integer(fp, &val, &termchar))
         goto bogus;
-      scanptr->Al = (int) val;
+      scanptr->Al = (int)val;
     } else {
       /* set non-progressive parameters */
       scanptr->Ss = 0;
-      scanptr->Se = DCTSIZE2-1;
+      scanptr->Se = DCTSIZE2 - 1;
       scanptr->Ah = 0;
       scanptr->Al = 0;
     }
@@ -262,7 +261,7 @@
      * but if you want to compress multiple images you'd want JPOOL_PERMANENT.
      */
     scanptr = (jpeg_scan_info *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   scanno * sizeof(jpeg_scan_info));
     MEMCOPY(scanptr, scans, scanno * sizeof(jpeg_scan_info));
     cinfo->scan_info = scanptr;
@@ -304,18 +303,18 @@
 
 
 LOCAL(void)
-jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
 {
-  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
-                       q_scale_factor[0], force_baseline);
-  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
-                       q_scale_factor[1], force_baseline);
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl, q_scale_factor[0],
+                       force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl, q_scale_factor[1],
+                       force_baseline);
 }
 #endif
 
 
 GLOBAL(boolean)
-set_quality_ratings (j_compress_ptr cinfo, char *arg, boolean force_baseline)
+set_quality_ratings(j_compress_ptr cinfo, char *arg, boolean force_baseline)
 /* Process a quality-ratings parameter string, of the form
  *     N[,N,...]
  * If there are more q-table slots than parameters, the last value is replicated.
@@ -355,7 +354,7 @@
 
 
 GLOBAL(boolean)
-set_quant_slots (j_compress_ptr cinfo, char *arg)
+set_quant_slots(j_compress_ptr cinfo, char *arg)
 /* Process a quantization-table-selectors parameter string, of the form
  *     N[,N,...]
  * If there are more components than parameters, the last value is replicated.
@@ -374,7 +373,7 @@
         return FALSE;
       if (val < 0 || val >= NUM_QUANT_TBLS) {
         fprintf(stderr, "JPEG quantization tables are numbered 0..%d\n",
-                NUM_QUANT_TBLS-1);
+                NUM_QUANT_TBLS - 1);
         return FALSE;
       }
       cinfo->comp_info[ci].quant_tbl_no = val;
@@ -390,7 +389,7 @@
 
 
 GLOBAL(boolean)
-set_sample_factors (j_compress_ptr cinfo, char *arg)
+set_sample_factors(j_compress_ptr cinfo, char *arg)
 /* Process a sample-factors parameter string, of the form
  *     HxV[,HxV,...]
  * If there are more components than parameters, "1x1" is assumed for the rest.
diff --git a/rdtarga.c b/rdtarga.c
index b9bbd07..ecb4219 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -29,19 +29,20 @@
 
 #ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else /* !HAVE_UNSIGNED_CHAR */
 #ifdef __CHAR_UNSIGNED__
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x))
+#define UCH(x)  ((int)(x))
 #else
 typedef char U_CHAR;
-#define UCH(x)  ((int) (x) & 0xFF)
+#define UCH(x)  ((int)(x) & 0xFF)
 #endif
 #endif /* HAVE_UNSIGNED_CHAR */
 
 
-#define ReadOK(file,buffer,len) (JFREAD(file,buffer,len) == ((size_t) (len)))
+#define ReadOK(file, buffer, len) \
+  (JFREAD(file, buffer, len) == ((size_t)(len)))
 
 
 /* Private version of data source object */
@@ -87,7 +88,7 @@
 
 
 LOCAL(int)
-read_byte (tga_source_ptr sinfo)
+read_byte(tga_source_ptr sinfo)
 /* Read next byte from Targa file */
 {
   register FILE *infile = sinfo->pub.input_file;
@@ -100,7 +101,7 @@
 
 
 LOCAL(void)
-read_colormap (tga_source_ptr sinfo, int cmaplen, int mapentrysize)
+read_colormap(tga_source_ptr sinfo, int cmaplen, int mapentrysize)
 /* Read the colormap from a Targa file */
 {
   int i;
@@ -110,9 +111,9 @@
     ERREXIT(sinfo->cinfo, JERR_TGA_BADCMAP);
 
   for (i = 0; i < cmaplen; i++) {
-    sinfo->colormap[2][i] = (JSAMPLE) read_byte(sinfo);
-    sinfo->colormap[1][i] = (JSAMPLE) read_byte(sinfo);
-    sinfo->colormap[0][i] = (JSAMPLE) read_byte(sinfo);
+    sinfo->colormap[2][i] = (JSAMPLE)read_byte(sinfo);
+    sinfo->colormap[1][i] = (JSAMPLE)read_byte(sinfo);
+    sinfo->colormap[0][i] = (JSAMPLE)read_byte(sinfo);
   }
 }
 
@@ -122,20 +123,20 @@
  */
 
 METHODDEF(void)
-read_non_rle_pixel (tga_source_ptr sinfo)
+read_non_rle_pixel(tga_source_ptr sinfo)
 /* Read one Targa pixel from the input file; no RLE expansion */
 {
   register FILE *infile = sinfo->pub.input_file;
   register int i;
 
   for (i = 0; i < sinfo->pixel_size; i++) {
-    sinfo->tga_pixel[i] = (U_CHAR) getc(infile);
+    sinfo->tga_pixel[i] = (U_CHAR)getc(infile);
   }
 }
 
 
 METHODDEF(void)
-read_rle_pixel (tga_source_ptr sinfo)
+read_rle_pixel(tga_source_ptr sinfo)
 /* Read one Targa pixel from the input file, expanding RLE data as needed */
 {
   register FILE *infile = sinfo->pub.input_file;
@@ -160,7 +161,7 @@
 
   /* Read next pixel */
   for (i = 0; i < sinfo->pixel_size; i++) {
-    sinfo->tga_pixel[i] = (U_CHAR) getc(infile);
+    sinfo->tga_pixel[i] = (U_CHAR)getc(infile);
   }
 }
 
@@ -173,26 +174,26 @@
 
 
 METHODDEF(JDIMENSION)
-get_8bit_gray_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_8bit_gray_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 8-bit grayscale pixels */
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
-    *ptr++ = (JSAMPLE) UCH(source->tga_pixel[0]);
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[0]);
   }
   return 1;
 }
 
 METHODDEF(JDIMENSION)
-get_8bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 8-bit colormap indexes */
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   register int t;
   register JSAMPROW ptr;
   register JDIMENSION col;
@@ -210,10 +211,10 @@
 }
 
 METHODDEF(JDIMENSION)
-get_16bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_16bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 16-bit pixels */
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   register int t;
   register JSAMPROW ptr;
   register JDIMENSION col;
@@ -227,30 +228,30 @@
      * The format of the 16-bit (LSB first) input word is
      *     xRRRRRGGGGGBBBBB
      */
-    ptr[2] = (JSAMPLE) c5to8bits[t & 0x1F];
+    ptr[2] = (JSAMPLE)c5to8bits[t & 0x1F];
     t >>= 5;
-    ptr[1] = (JSAMPLE) c5to8bits[t & 0x1F];
+    ptr[1] = (JSAMPLE)c5to8bits[t & 0x1F];
     t >>= 5;
-    ptr[0] = (JSAMPLE) c5to8bits[t & 0x1F];
+    ptr[0] = (JSAMPLE)c5to8bits[t & 0x1F];
     ptr += 3;
   }
   return 1;
 }
 
 METHODDEF(JDIMENSION)
-get_24bit_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 /* This version is for reading 24-bit pixels */
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
   ptr = source->pub.buffer[0];
   for (col = cinfo->image_width; col > 0; col--) {
     (*source->read_pixel) (source); /* Load next pixel into tga_pixel */
-    *ptr++ = (JSAMPLE) UCH(source->tga_pixel[2]); /* change BGR to RGB order */
-    *ptr++ = (JSAMPLE) UCH(source->tga_pixel[1]);
-    *ptr++ = (JSAMPLE) UCH(source->tga_pixel[0]);
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[2]); /* change BGR to RGB order */
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[1]);
+    *ptr++ = (JSAMPLE)UCH(source->tga_pixel[0]);
   }
   return 1;
 }
@@ -272,9 +273,9 @@
  */
 
 METHODDEF(JDIMENSION)
-get_memory_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+get_memory_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   JDIMENSION source_row;
 
   /* Compute row of source that maps to current_row of normal order */
@@ -284,8 +285,8 @@
 
   /* Fetch that row from virtual array */
   source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, source->whole_image,
-     source_row, (JDIMENSION) 1, FALSE);
+    ((j_common_ptr)cinfo, source->whole_image,
+     source_row, (JDIMENSION)1, FALSE);
 
   source->current_row++;
   return 1;
@@ -299,21 +300,21 @@
  */
 
 METHODDEF(JDIMENSION)
-preload_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+preload_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   JDIMENSION row;
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 
   /* Read the data into a virtual array in input-file row order. */
   for (row = 0; row < cinfo->image_height; row++) {
     if (progress != NULL) {
-      progress->pub.pass_counter = (long) row;
-      progress->pub.pass_limit = (long) cinfo->image_height;
-      (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
     }
     source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr) cinfo, source->whole_image, row, (JDIMENSION) 1, TRUE);
+      ((j_common_ptr)cinfo, source->whole_image, row, (JDIMENSION)1, TRUE);
     (*source->get_pixel_rows) (cinfo, sinfo);
   }
   if (progress != NULL)
@@ -332,18 +333,18 @@
  */
 
 METHODDEF(void)
-start_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
-  tga_source_ptr source = (tga_source_ptr) sinfo;
+  tga_source_ptr source = (tga_source_ptr)sinfo;
   U_CHAR targaheader[18];
   int idlen, cmaptype, subtype, flags, interlace_type, components;
   unsigned int width, height, maplen;
   boolean is_bottom_up;
 
-#define GET_2B(offset)  ((unsigned int) UCH(targaheader[offset]) + \
-                         (((unsigned int) UCH(targaheader[offset+1])) << 8))
+#define GET_2B(offset)  ((unsigned int)UCH(targaheader[offset]) + \
+                         (((unsigned int)UCH(targaheader[offset + 1])) << 8))
 
-  if (! ReadOK(source->pub.input_file, targaheader, 18))
+  if (!ReadOK(source->pub.input_file, targaheader, 18))
     ERREXIT(cinfo, JERR_INPUT_EOF);
 
   /* Pretend "15-bit" pixels are 16-bit --- we ignore attribute bit anyway */
@@ -425,10 +426,10 @@
   if (is_bottom_up) {
     /* Create a virtual array to buffer the upside-down image. */
     source->whole_image = (*cinfo->mem->request_virt_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-       (JDIMENSION) width * components, (JDIMENSION) height, (JDIMENSION) 1);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION)width * components, (JDIMENSION)height, (JDIMENSION)1);
     if (cinfo->progress != NULL) {
-      cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
       progress->total_extra_passes++; /* count file input as separate pass */
     }
     /* source->pub.buffer will point to the virtual array. */
@@ -438,23 +439,23 @@
     /* Don't need a virtual array, but do need a one-row input buffer. */
     source->whole_image = NULL;
     source->pub.buffer = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) width * components, (JDIMENSION) 1);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)width * components, (JDIMENSION)1);
     source->pub.buffer_height = 1;
     source->pub.get_pixel_rows = source->get_pixel_rows;
   }
 
   while (idlen--)               /* Throw away ID field */
-    (void) read_byte(source);
+    (void)read_byte(source);
 
   if (maplen > 0) {
     if (maplen > 256 || GET_2B(3) != 0)
       ERREXIT(cinfo, JERR_TGA_BADCMAP);
     /* Allocate space to store the colormap */
     source->colormap = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE, (JDIMENSION) maplen, (JDIMENSION) 3);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)maplen, (JDIMENSION)3);
     /* and read it from the file */
-    read_colormap(source, (int) maplen, UCH(targaheader[7]));
+    read_colormap(source, (int)maplen, UCH(targaheader[7]));
   } else {
     if (cmaptype)               /* but you promised a cmap! */
       ERREXIT(cinfo, JERR_TGA_BADPARMS);
@@ -473,7 +474,7 @@
  */
 
 METHODDEF(void)
-finish_input_tga (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+finish_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
   /* no work */
 }
@@ -484,20 +485,20 @@
  */
 
 GLOBAL(cjpeg_source_ptr)
-jinit_read_targa (j_compress_ptr cinfo)
+jinit_read_targa(j_compress_ptr cinfo)
 {
   tga_source_ptr source;
 
   /* Create module interface object */
   source = (tga_source_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(tga_source_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(tga_source_struct));
   source->cinfo = cinfo;        /* make back link for subroutines */
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_tga;
   source->pub.finish_input = finish_input_tga;
 
-  return (cjpeg_source_ptr) source;
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* TARGA_SUPPORTED */
diff --git a/release/Distribution.xml b/release/Distribution.xml.in
similarity index 63%
rename from release/Distribution.xml
rename to release/Distribution.xml.in
index ee73ab0..e1f79ee 100644
--- a/release/Distribution.xml
+++ b/release/Distribution.xml.in
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <installer-gui-script minSpecVersion="1">
-	<title>libjpeg-turbo</title>
+	<title>@CMAKE_PROJECT_NAME@</title>
 	<welcome file="Welcome.rtf" />
 	<readme file="ReadMe.txt" />
 	<license file="License.rtf" />
@@ -12,13 +12,13 @@
 	<options customize="never" />
 	<choices-outline>
 		<line choice="default">
-			<line choice="com.libjpeg-turbo.libjpeg-turbo"/>
+			<line choice="@PKGID@"/>
 		</line>
 	</choices-outline>
 	<choice id="default"/>
-	<choice id="com.libjpeg-turbo.libjpeg-turbo" visible="false">
-		<pkg-ref id="com.libjpeg-turbo.libjpeg-turbo"/>
+	<choice id="@PKGID@" visible="false">
+		<pkg-ref id="@PKGID@"/>
 	</choice>
 	<pkg-ref auth="root"
-		id="com.libjpeg-turbo.libjpeg-turbo">libjpeg-turbo.pkg</pkg-ref>
+		id="@PKGID@">@PKGNAME@.pkg</pkg-ref>
 </installer-gui-script>
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index 7fb8d0f..a3bafd9 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/deb-control.tmpl b/release/deb-control.in
similarity index 81%
rename from release/deb-control.tmpl
rename to release/deb-control.in
index 681721d..8c54dd9 100644
--- a/release/deb-control.tmpl
+++ b/release/deb-control.in
@@ -1,16 +1,16 @@
 Package: {__PKGNAME}
-Version: {__VERSION}-{__BUILD}
+Version: @VERSION@-@BUILD@
 Section: misc
 Priority: optional
 Architecture: {__ARCH}
 Essential: no
-Maintainer: The libjpeg-turbo Project <information@libjpeg-turbo.org>
-Homepage: http://www.libjpeg-turbo.org
+Maintainer: @PKGVENDOR@ <@PKGEMAIL@>
+Homepage: @PKGURL@
 Installed-Size: {__SIZE}
 Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
  libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
- NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
- x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+ AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
+ on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
  generally 2-6x as fast as libjpeg, all else being equal.  On other types of
  systems, libjpeg-turbo can still outperform libjpeg by a significant amount,
  by virtue of its highly-optimized Huffman coding routines.  In many cases, the
diff --git a/release/libjpeg-turbo.nsi.in b/release/installer.nsi.in
similarity index 61%
rename from release/libjpeg-turbo.nsi.in
rename to release/installer.nsi.in
index f458b81..ec03f5e 100755
--- a/release/libjpeg-turbo.nsi.in
+++ b/release/installer.nsi.in
@@ -1,6 +1,6 @@
 !include x64.nsh
 Name "@CMAKE_PROJECT_NAME@ SDK for @INST_PLATFORM@"
-OutFile "@CMAKE_BINARY_DIR@\${BUILDDIR}@INST_NAME@.exe"
+OutFile "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}@INST_NAME@.exe"
 InstallDir @INST_DIR@
 
 SetCompressor bzip2
@@ -35,58 +35,60 @@
 	notexists:
 	SetOutPath $SYSDIR
 !ifdef GCC
-	File "@CMAKE_BINARY_DIR@\libturbojpeg.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libturbojpeg.dll"
 !else
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}turbojpeg.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg.dll"
 !endif
 	SetOutPath $INSTDIR\bin
 !ifdef GCC
-	File "@CMAKE_BINARY_DIR@\libturbojpeg.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libturbojpeg.dll"
 !else
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}turbojpeg.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg.dll"
 !endif
 !ifdef GCC
-	File "/oname=libjpeg-@DLL_VERSION@.dll" "@CMAKE_BINARY_DIR@\sharedlib\libjpeg-*.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libjpeg-@SO_MAJOR_VERSION@.dll"
 !else
-	File "@CMAKE_BINARY_DIR@\sharedlib\${BUILDDIR}jpeg@DLL_VERSION@.dll"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpeg@SO_MAJOR_VERSION@.dll"
 !endif
-	File "@CMAKE_BINARY_DIR@\sharedlib\${BUILDDIR}cjpeg.exe"
-	File "@CMAKE_BINARY_DIR@\sharedlib\${BUILDDIR}djpeg.exe"
-	File "@CMAKE_BINARY_DIR@\sharedlib\${BUILDDIR}jpegtran.exe"
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}tjbench.exe"
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}rdjpgcom.exe"
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}wrjpgcom.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}cjpeg.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}djpeg.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpegtran.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}tjbench.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}rdjpgcom.exe"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}wrjpgcom.exe"
 	SetOutPath $INSTDIR\lib
 !ifdef GCC
-	File "@CMAKE_BINARY_DIR@\libturbojpeg.dll.a"
-	File "@CMAKE_BINARY_DIR@\libturbojpeg.a"
-	File "@CMAKE_BINARY_DIR@\sharedlib\libjpeg.dll.a"
-	File "@CMAKE_BINARY_DIR@\libjpeg.a"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libturbojpeg.dll.a"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libturbojpeg.a"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libjpeg.dll.a"
+	File "@CMAKE_CURRENT_BINARY_DIR@\libjpeg.a"
 !else
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}turbojpeg.lib"
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}turbojpeg-static.lib"
-	File "@CMAKE_BINARY_DIR@\sharedlib\${BUILDDIR}jpeg.lib"
-	File "@CMAKE_BINARY_DIR@\${BUILDDIR}jpeg-static.lib"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg.lib"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}turbojpeg-static.lib"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpeg.lib"
+	File "@CMAKE_CURRENT_BINARY_DIR@\${BUILDDIR}jpeg-static.lib"
 !endif
 !ifdef JAVA
 	SetOutPath $INSTDIR\classes
-	File "@CMAKE_BINARY_DIR@\java\${BUILDDIR}turbojpeg.jar"
+	File "@CMAKE_CURRENT_BINARY_DIR@\java\turbojpeg.jar"
 !endif
 	SetOutPath $INSTDIR\include
-	File "@CMAKE_BINARY_DIR@\jconfig.h"
-	File "@CMAKE_SOURCE_DIR@\jerror.h"
-	File "@CMAKE_SOURCE_DIR@\jmorecfg.h"
-	File "@CMAKE_SOURCE_DIR@\jpeglib.h"
-	File "@CMAKE_SOURCE_DIR@\turbojpeg.h"
+	File "@CMAKE_CURRENT_BINARY_DIR@\jconfig.h"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\jerror.h"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\jmorecfg.h"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\jpeglib.h"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\turbojpeg.h"
 	SetOutPath $INSTDIR\doc
-	File "@CMAKE_SOURCE_DIR@\README.ijg"
-	File "@CMAKE_SOURCE_DIR@\README.md"
-	File "@CMAKE_SOURCE_DIR@\LICENSE.md"
-	File "@CMAKE_SOURCE_DIR@\example.c"
-	File "@CMAKE_SOURCE_DIR@\libjpeg.txt"
-	File "@CMAKE_SOURCE_DIR@\structure.txt"
-	File "@CMAKE_SOURCE_DIR@\usage.txt"
-	File "@CMAKE_SOURCE_DIR@\wizard.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\README.ijg"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\README.md"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\LICENSE.md"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\example.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\libjpeg.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\structure.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\usage.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\wizard.txt"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\tjexample.c"
+	File "@CMAKE_CURRENT_SOURCE_DIR@\java\TJExample.java"
 
 	WriteRegStr HKLM "SOFTWARE\@INST_REG_NAME@ @VERSION@" "Install_Dir" "$INSTDIR"
 
@@ -110,7 +112,7 @@
 	DeleteRegKey HKLM "SOFTWARE\@INST_REG_NAME@ @VERSION@"
 
 !ifdef GCC
-	Delete $INSTDIR\bin\libjpeg-@DLL_VERSION@.dll
+	Delete $INSTDIR\bin\libjpeg-@SO_MAJOR_VERSION@.dll
 	Delete $INSTDIR\bin\libturbojpeg.dll
 	Delete $SYSDIR\libturbojpeg.dll
 	Delete $INSTDIR\lib\libturbojpeg.dll.a"
@@ -118,7 +120,7 @@
 	Delete $INSTDIR\lib\libjpeg.dll.a"
 	Delete $INSTDIR\lib\libjpeg.a"
 !else
-	Delete $INSTDIR\bin\jpeg@DLL_VERSION@.dll
+	Delete $INSTDIR\bin\jpeg@SO_MAJOR_VERSION@.dll
 	Delete $INSTDIR\bin\turbojpeg.dll
 	Delete $SYSDIR\turbojpeg.dll
 	Delete $INSTDIR\lib\jpeg.lib
@@ -144,11 +146,13 @@
 	Delete $INSTDIR\doc\README.ijg
 	Delete $INSTDIR\doc\README.md
 	Delete $INSTDIR\doc\LICENSE.md
-	Delete $INSTDIR\doc\example.c
+	Delete $INSTDIR\doc\example.txt
 	Delete $INSTDIR\doc\libjpeg.txt
 	Delete $INSTDIR\doc\structure.txt
 	Delete $INSTDIR\doc\usage.txt
 	Delete $INSTDIR\doc\wizard.txt
+	Delete $INSTDIR\doc\tjexample.c
+	Delete $INSTDIR\doc\TJExample.java
 
 	RMDir "$INSTDIR\include"
 	RMDir "$INSTDIR\lib"
diff --git a/release/libjpeg-turbo.spec.in b/release/libjpeg-turbo.spec.in
deleted file mode 100644
index e4e4b9c..0000000
--- a/release/libjpeg-turbo.spec.in
+++ /dev/null
@@ -1,164 +0,0 @@
-# Path under which libjpeg-turbo should be installed
-%define _prefix %{__prefix}
-
-# Path under which executables should be installed
-%define _bindir %{__bindir}
-
-# Path under which Java classes and man pages should be installed
-%define _datadir %{__datadir}
-
-# Path under which docs should be installed
-%define _docdir /usr/share/doc/%{name}-%{version}
-
-# Path under which headers should be installed
-%define _includedir %{__includedir}
-
-%if "%{?__isa_bits:1}" == "1"
-%define _bits %{__isa_bits}
-%else
-# RPM < 4.6
-%if "%{_lib}" == "lib64"
-%define _bits 64
-%else
-%define _bits 32
-%endif
-%endif
-
-%if "%{_bits}" == "64"
-%define _libdir %{_exec_prefix}/lib64
-%else
-%if "%{_prefix}" == "/opt/libjpeg-turbo"
-%define _libdir %{_exec_prefix}/lib32
-%endif
-%endif
-
-# Path under which man pages should be installed
-%define _mandir %{__mandir}
-
-Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
-Name: @PKGNAME@
-Version: @VERSION@
-Vendor: The libjpeg-turbo Project
-URL: http://www.libjpeg-turbo.org
-Group: System Environment/Libraries
-#-->Source0: http://prdownloads.sourceforge.net/libjpeg-turbo/libjpeg-turbo-%{version}.tar.gz
-Release: @BUILD@
-License: BSD-style
-BuildRoot: %{_blddir}/%{name}-buildroot-%{version}-%{release}
-Prereq: /sbin/ldconfig
-%if "%{_bits}" == "64"
-Provides: %{name} = %{version}-%{release}, @PACKAGE_NAME@ = %{version}-%{release}, libturbojpeg.so()(64bit)
-%else
-Provides: %{name} = %{version}-%{release}, @PACKAGE_NAME@ = %{version}-%{release}, libturbojpeg.so
-%endif
-
-%description
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
-NEON, AltiVec) to accelerate baseline JPEG compression and decompression on
-x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
-generally 2-6x as fast as libjpeg, all else being equal.  On other types of
-systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
-virtue of its highly-optimized Huffman coding routines.  In many cases, the
-performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
-
-libjpeg-turbo implements both the traditional libjpeg API as well as the less
-powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
-colorspace extensions that allow it to compress from/decompress to 32-bit and
-big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
-interface.
-
-libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
-derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
-VirtualGL projects made numerous enhancements to the codec in 2009, and in
-early 2010, libjpeg-turbo spun off into an independent project, with the goal
-of making high-speed JPEG compression/decompression technology available to a
-broader range of users and developers.
-
-#-->%prep
-#-->%setup -q -n libjpeg-turbo-%{version}
-
-#-->%build
-#-->./configure prefix=%{_prefix} bindir=%{_bindir} datadir=%{_datadir} \
-#-->	docdir=%{_docdir} includedir=%{_includedir} libdir=%{_libdir} \
-#-->	mandir=%{_mandir} JPEG_LIB_VERSION=@JPEG_LIB_VERSION@ \
-#-->	SO_MAJOR_VERSION=@SO_MAJOR_VERSION@ SO_MINOR_VERSION=@SO_MINOR_VERSION@ \
-#-->	--with-pic @RPM_CONFIG_ARGS@
-#-->export NUMCPUS=`grep -c '^processor' /proc/cpuinfo`
-#-->make -j$NUMCPUS --load-average=$NUMCPUS DESTDIR=$RPM_BUILD_ROOT
-
-%install
-
-rm -rf $RPM_BUILD_ROOT
-make install DESTDIR=$RPM_BUILD_ROOT docdir=%{_docdir} exampledir=%{_docdir}
-rm -f $RPM_BUILD_ROOT%{_libdir}/*.la
-/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir}
-
-#-->%if 0
-
-LJT_LIBDIR=%{__libdir}
-if [ ! "$LJT_LIBDIR" = "%{_libdir}" ]; then
-	echo ERROR: libjpeg-turbo must be configured with libdir=%{_libdir} when generating an in-tree RPM for this architecture.
-	exit 1
-fi
-
-#-->%endif
-
-LJT_DOCDIR=%{__docdir}
-if [ "%{_prefix}" = "/opt/libjpeg-turbo" -a "$LJT_DOCDIR" = "/opt/libjpeg-turbo/doc" ]; then
-	ln -fs %{_docdir} $RPM_BUILD_ROOT/$LJT_DOCDIR
-fi
-
-%post -p /sbin/ldconfig
-
-%postun -p /sbin/ldconfig
-
-%clean
-rm -rf $RPM_BUILD_ROOT
-
-%files
-%defattr(-,root,root)
-%dir %{_docdir}
-%doc %{_docdir}/*
-%dir %{_prefix}
-%if "%{_prefix}" == "/opt/libjpeg-turbo" && "%{_docdir}" != "%{_prefix}/doc"
- %{_prefix}/doc
-%endif
-%dir %{_bindir}
-%{_bindir}/cjpeg
-%{_bindir}/djpeg
-%{_bindir}/jpegtran
-%{_bindir}/tjbench
-%{_bindir}/rdjpgcom
-%{_bindir}/wrjpgcom
-%dir %{_libdir}
-%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@
-%{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
-%{_libdir}/libjpeg.so
-%{_libdir}/libjpeg.a
-%{_libdir}/pkgconfig
-%{_libdir}/pkgconfig/libjpeg.pc
-%{_libdir}/libturbojpeg.so.0.1.0
-%{_libdir}/libturbojpeg.so.0
-%{_libdir}/libturbojpeg.so
-%{_libdir}/libturbojpeg.a
-%{_libdir}/pkgconfig/libturbojpeg.pc
-%dir %{_includedir}
-%{_includedir}/jconfig.h
-%{_includedir}/jerror.h
-%{_includedir}/jmorecfg.h
-%{_includedir}/jpeglib.h
-%{_includedir}/turbojpeg.h
-%dir %{_mandir}
-%dir %{_mandir}/man1
-%{_mandir}/man1/cjpeg.1*
-%{_mandir}/man1/djpeg.1*
-%{_mandir}/man1/jpegtran.1*
-%{_mandir}/man1/rdjpgcom.1*
-%{_mandir}/man1/wrjpgcom.1*
-%if "%{_prefix}" != "%{_datadir}"
- %dir %{_datadir}
-%endif
-@JAVA_RPM_CONTENTS_1@
-@JAVA_RPM_CONTENTS_2@
-
-%changelog
diff --git a/release/libjpeg.pc.in b/release/libjpeg.pc.in
index 40795f7..74fb7fc 100644
--- a/release/libjpeg.pc.in
+++ b/release/libjpeg.pc.in
@@ -1,10 +1,10 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 Name: libjpeg
 Description: A SIMD-accelerated JPEG codec that provides the libjpeg API
-Version: @PACKAGE_VERSION@
+Version: @VERSION@
 Libs: -L${libdir} -ljpeg
 Cflags: -I${includedir}
diff --git a/release/libturbojpeg.pc.in b/release/libturbojpeg.pc.in
index 7d4b656..81a0063 100644
--- a/release/libturbojpeg.pc.in
+++ b/release/libturbojpeg.pc.in
@@ -1,10 +1,10 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 Name: libturbojpeg
 Description: A SIMD-accelerated JPEG codec that provides the TurboJPEG API
-Version: @PACKAGE_VERSION@
+Version: @VERSION@
 Libs: -L${libdir} -lturbojpeg
 Cflags: -I${includedir}
diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
index f303546..b7f353e 100755
--- a/release/makecygwinpkg.in
+++ b/release/makecygwinpkg.in
@@ -15,27 +15,51 @@
 	fi
 }
 
-PACKAGE_NAME=@PKGNAME@
+safedirmove ()
+{
+	if [ "$1" = "$2" ]; then
+		return 0
+	fi
+	if [ "$1" = "" -o ! -d "$1" ]; then
+		echo safedirmove: source dir $1 is not valid
+		return 1
+	fi
+	if [ "$2" = "" -o -e "$2" ]; then
+		echo safedirmove: dest dir $2 is not valid
+		return 1
+	fi
+	if [ "$3" = "" -o -e "$3" ]; then
+		echo safedirmove: tmp dir $3 is not valid
+		return 1
+	fi
+	mkdir -p $3
+	mv $1/* $3/
+	rmdir $1
+	mkdir -p $2
+	mv $3/* $2/
+	rmdir $3
+	return 0
+}
+
+PKGNAME=@PKGNAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
-SRCDIR=@abs_top_srcdir@
 
-PREFIX=%{__prefix}
-DOCDIR=%{__docdir}
-LIBDIR=%{__libdir}
+PREFIX=@CMAKE_INSTALL_PREFIX@
+DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
+LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
 
 umask 022
-rm -f $PACKAGE_NAME-$VERSION-$BUILD.tar.bz2
+rm -f $PKGNAME-$VERSION-$BUILD.tar.bz2
 TMPDIR=`mktemp -d /tmp/ljtbuild.XXXXXX`
 __PWD=`pwd`
-make install DESTDIR=$TMPDIR/pkg docdir=/usr/share/doc/$PACKAGE_NAME-$VERSION \
-	exampledir=/usr/share/doc/$PACKAGE_NAME-$VERSION
-rm $TMPDIR/pkg$LIBDIR/*.la
-if [ "$PREFIX" = "/opt/libjpeg-turbo" -a "$DOCDIR" = "/opt/libjpeg-turbo/doc" ]; then
-	ln -fs /usr/share/doc/$PACKAGE_NAME-$VERSION $TMPDIR/pkg$DOCDIR
+make install DESTDIR=$TMPDIR/pkg
+if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
+	safedirmove $TMPDIR/pkg$DOCDIR $TMPDIR/pkg/usr/share/doc/$PKGNAME-$VERSION $TMPDIR/__tmpdoc
+	ln -fs /usr/share/doc/$PKGNAME-$VERSION $TMPDIR/pkg$DOCDIR
 fi
 cd $TMPDIR/pkg
-tar cfj ../$PACKAGE_NAME-$VERSION-$BUILD.tar.bz2 *
+tar cfj ../$PKGNAME-$VERSION-$BUILD.tar.bz2 *
 cd $__PWD
 mv $TMPDIR/*.tar.bz2 .
 
diff --git a/release/makedpkg.in b/release/makedpkg.in
index 80cc89b..77836dd 100644
--- a/release/makedpkg.in
+++ b/release/makedpkg.in
@@ -21,56 +21,89 @@
 	id | cut -f2 -d = | cut -f1 -d \(;
 }
 
+safedirmove ()
+{
+	if [ "$1" = "$2" ]; then
+		return 0
+	fi
+	if [ "$1" = "" -o ! -d "$1" ]; then
+		echo safedirmove: source dir $1 is not valid
+		return 1
+	fi
+	if [ "$2" = "" -o -e "$2" ]; then
+		echo safedirmove: dest dir $2 is not valid
+		return 1
+	fi
+	if [ "$3" = "" -o -e "$3" ]; then
+		echo safedirmove: tmp dir $3 is not valid
+		return 1
+	fi
+	mkdir -p $3
+	mv $1/* $3/
+	rmdir $1
+	mkdir -p $2
+	mv $3/* $2/
+	rmdir $3
+	return 0
+}
+
 makedeb()
 {
 	SUPPLEMENT=$1
-	DIRNAME=$PACKAGE_NAME
+	DIRNAME=$PKGNAME
 
 	if [ $SUPPLEMENT = 1 ]; then
-		PACKAGE_NAME=$PACKAGE_NAME\32
+		PKGNAME=$PKGNAME\32
 		DEBARCH=amd64
 	fi
 
 	umask 022
-	rm -f $PACKAGE_NAME\_$VERSION\_$DEBARCH.deb
-	TMPDIR=`mktemp -d /tmp/$PACKAGE_NAME-build.XXXXXX`
+	rm -f $PKGNAME\_$VERSION\_$DEBARCH.deb
+	TMPDIR=`mktemp -d /tmp/$PKGNAME-build.XXXXXX`
 	mkdir $TMPDIR/DEBIAN
 
 	if [ $SUPPLEMENT = 1 ]; then
-		make install DESTDIR=$TMPDIR bindir=/dummy/bin datadir=/dummy/data \
-			docdir=/dummy/doc includedir=/dummy/include mandir=/dummy/man
-		rm -f $TMPDIR$LIBDIR/*.la
-		rm -rf $TMPDIR/dummy
+		make install DESTDIR=$TMPDIR
+		rm -rf $TMPDIR$BINDIR
+		if [ "$DATAROOTDIR" != "$PREFIX" ]; then
+			rm -rf $TMPDIR$DATAROOTDIR
+		fi
+		if [ "$JAVADIR" != "" ]; then
+			rm -rf $TMPDIR$JAVADIR
+		fi
+		rm -rf $TMPDIR$DOCDIR
+		rm -rf $TMPDIR$INCLUDEDIR
+		rm -rf $TMPDIR$MANDIR
 	else
-		make install DESTDIR=$TMPDIR docdir=/usr/share/doc/$DIRNAME-$VERSION \
-			exampledir=/usr/share/doc/$DIRNAME-$VERSION
-		rm -f $TMPDIR$LIBDIR/*.la
-		if [ "$PREFIX" = "/opt/libjpeg-turbo" -a "$DOCDIR" = "/opt/libjpeg-turbo/doc" ]; then
+		make install DESTDIR=$TMPDIR
+		if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
+			safedirmove $TMPDIR/$DOCDIR $TMPDIR/usr/share/doc/$PKGNAME-$VERSION $TMPDIR/__tmpdoc
 			ln -fs /usr/share/doc/$DIRNAME-$VERSION $TMPDIR$DOCDIR
 		fi
 	fi
 
 	SIZE=`du -s $TMPDIR | cut -f1`
-	(cat $SRCDIR/release/deb-control.tmpl | sed s/{__PKGNAME}/$PACKAGE_NAME/g \
-		| sed s/{__VERSION}/$VERSION/g | sed s/{__BUILD}/$BUILD/g \
+	(cat pkgscripts/deb-control | sed s/{__PKGNAME}/$PKGNAME/g \
 		| sed s/{__ARCH}/$DEBARCH/g | sed s/{__SIZE}/$SIZE/g \
 		> $TMPDIR/DEBIAN/control)
 
-
 	/sbin/ldconfig -n $TMPDIR$LIBDIR
 
 	$SUDO chown -Rh root:root $TMPDIR/*
-	dpkg -b $TMPDIR $PACKAGE_NAME\_$VERSION\_$DEBARCH.deb
+	dpkg -b $TMPDIR $PKGNAME\_$VERSION\_$DEBARCH.deb
 }
 
-PACKAGE_NAME=@PKGNAME@
+PKGNAME=@PKGNAME@
 VERSION=@VERSION@
-BUILD=@BUILD@
 DEBARCH=@DEBARCH@
-SRCDIR=@abs_top_srcdir@
-PREFIX=%{__prefix}
-DOCDIR=%{__docdir}
-LIBDIR=%{__libdir}
+PREFIX=@CMAKE_INSTALL_PREFIX@
+BINDIR=@CMAKE_INSTALL_FULL_BINDIR@
+DATAROOTDIR=@CMAKE_INSTALL_FULL_DATAROOTDIR@
+DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
+INCLUDEDIR=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+JAVADIR=@CMAKE_INSTALL_FULL_JAVADIR@
+LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
+MANDIR=@CMAKE_INSTALL_FULL_MANDIR@
 
 if [ ! `uid` -eq 0 ]; then
 	SUDO=sudo
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index 7163757..b0a2e23 100644
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -15,76 +15,63 @@
 	fi
 }
 
+safedirmove ()
+{
+	if [ "$1" = "$2" ]; then
+		return 0
+	fi
+	if [ "$1" = "" -o ! -d "$1" ]; then
+		echo safedirmove: source dir $1 is not valid
+		return 1
+	fi
+	if [ "$2" = "" -o -e "$2" ]; then
+		echo safedirmove: dest dir $2 is not valid
+		return 1
+	fi
+	if [ "$3" = "" -o -e "$3" ]; then
+		echo safedirmove: tmp dir $3 is not valid
+		return 1
+	fi
+	mkdir -p $3
+	mv $1/* $3/
+	rmdir $1
+	mkdir -p $2
+	mv $3/* $2/
+	rmdir $3
+	return 0
+}
+
 usage()
 {
-	echo "$0 [-build32 [32-bit build dir]] [-buildarmv6 [ARMv6 build dir]] [-buildarmv7 [ARMv7 build dir]] [-buildarmv7s [ARMv7s build dir] [-buildarmv8 [ARMv8 build dir]] [-lipo [path to lipo]]"
+	echo "$0 [universal] [-lipo [path to lipo]]"
 	exit 1
 }
 
-PACKAGE_NAME=@PKGNAME@
+UNIVERSAL=0
+
+PKGNAME=@PKGNAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
-SRCDIR=@abs_top_srcdir@
-BUILDDIR32=@abs_top_srcdir@/osxx86
-BUILD32=0
-BUILDDIRARMV6=@abs_top_srcdir@/iosarmv6
-BUILDARMV6=0
-BUILDDIRARMV7=@abs_top_srcdir@/iosarmv7
-BUILDARMV7=0
-BUILDDIRARMV7S=@abs_top_srcdir@/iosarmv7s
-BUILDARMV7S=0
-BUILDDIRARMV8=@abs_top_srcdir@/iosarmv8
-BUILDARMV8=0
+SRCDIR=@CMAKE_CURRENT_SOURCE_DIR@
+BUILDDIR32=@OSX_32BIT_BUILD@
+BUILDDIRARMV7=@IOS_ARMV7_BUILD@
+BUILDDIRARMV7S=@IOS_ARMV7S_BUILD@
+BUILDDIRARMV8=@IOS_ARMV8_BUILD@
 WITH_JAVA=@WITH_JAVA@
 LIPO=lipo
 
-PREFIX=%{__prefix}
-BINDIR=%{__bindir}
-DOCDIR=%{__docdir}
-LIBDIR=%{__libdir}
+PREFIX=@CMAKE_INSTALL_PREFIX@
+BINDIR=@CMAKE_INSTALL_FULL_BINDIR@
+DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
+LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
+
+LIBJPEG_DSO_NAME=libjpeg.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@.dylib
+TURBOJPEG_DSO_NAME=libturbojpeg.@TURBOJPEG_SO_VERSION@.dylib
 
 while [ $# -gt 0 ]; do
 	case $1 in
-	-h*)             usage 0                   ;;
-	-build32)
-		BUILD32=1
-		if [ $# -gt 1 ]; then
-			if [[ ! "$2" =~ -.* ]]; then
-				BUILDDIR32=$2;  shift
-			fi
-		fi
-		;;
-	-buildarmv6)
-		BUILDARMV6=1
-		if [ $# -gt 1 ]; then
-			if [[ ! "$2" =~ -.* ]]; then
-				BUILDDIRARMV6=$2;  shift
-			fi
-		fi
-		;;
-	-buildarmv7)
-		BUILDARMV7=1
-		if [ $# -gt 1 ]; then
-			if [[ ! "$2" =~ -.* ]]; then
-				BUILDDIRARMV7=$2;  shift
-			fi
-		fi
-		;;
-	-buildarmv7s)
-		BUILDARMV7S=1
-		if [ $# -gt 1 ]; then
-			if [[ ! "$2" =~ -.* ]]; then
-				BUILDDIRARMV7S=$2;  shift
-			fi
-		fi
-		;;
-	-buildarmv8)
-		BUILDARMV8=1
-		if [ $# -gt 1 ]; then
-			if [[ ! "$2" =~ -.* ]]; then
-				BUILDDIRARMV8=$2;  shift
-			fi
-		fi
+	-h*)
+		usage 0
 		;;
 	-lipo)
 		if [ $# -gt 1 ]; then
@@ -93,27 +80,31 @@
 			fi
 		fi
 		;;
+	universal)
+		UNIVERSAL=1
+		;;
 	esac
 	shift
 done
 
-if [ -f $PACKAGE_NAME-$VERSION.dmg ]; then
-	rm -f $PACKAGE_NAME-$VERSION.dmg
+if [ -f $PKGNAME-$VERSION.dmg ]; then
+	rm -f $PKGNAME-$VERSION.dmg
 fi
 
 umask 022
-TMPDIR=`mktemp -d /tmp/$PACKAGE_NAME-build.XXXXXX`
+TMPDIR=`mktemp -d /tmp/$PKGNAME-build.XXXXXX`
 PKGROOT=$TMPDIR/pkg/Package_Root
 mkdir -p $PKGROOT
-make install DESTDIR=$PKGROOT docdir=/Library/Documentation/$PACKAGE_NAME \
-	exampledir=/Library/Documentation/$PACKAGE_NAME
-rm -f $PKGROOT$LIBDIR/*.la
 
-if [ "$PREFIX" = "/opt/libjpeg-turbo" -a "$DOCDIR" = "/opt/libjpeg-turbo/doc" ]; then
-	ln -fs /Library/Documentation/$PACKAGE_NAME $PKGROOT$DOCDIR
+make install DESTDIR=$PKGROOT
+
+if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
+	mkdir -p $PKGROOT/Library/Documentation
+	safedirmove $PKGROOT$DOCDIR $PKGROOT/Library/Documentation/$PKGNAME $TMPDIR/__tmpdoc
+	ln -fs /Library/Documentation/$PKGNAME $PKGROOT$DOCDIR
 fi
 
-if [ $BUILD32 = 1 ]; then
+if [ $UNIVERSAL = 1 -a "$BUILDDIR32" != "" ]; then
 	if [ ! -d $BUILDDIR32 ]; then
 		echo ERROR: 32-bit build directory $BUILDDIR32 does not exist
 		exit 1
@@ -126,27 +117,18 @@
 	pushd $BUILDDIR32
 	make install DESTDIR=$TMPDIR/dist.x86
 	popd
-	if [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-	elif [ ! -h $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
-	fi
+	$LIPO -create \
+		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$LIBJPEG_DSO_NAME \
+		-arch x86_64 $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME \
+		-output $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
 	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
 		-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
 	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.0.dylib \
-		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
+		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$TURBOJPEG_DSO_NAME \
+		-arch x86_64 $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME \
+		-output $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
 	$LIPO -create \
 		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
 		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
@@ -175,272 +157,88 @@
 		-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
 		-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
-
 fi
 
-if [ $BUILDARMV6 = 1 ]; then
-	if [ ! -d $BUILDDIRARMV6 ]; then
-		echo ERROR: ARMv6 build directory $BUILDDIRARMV6 does not exist
+install_ios()
+{
+	BUILDDIR=$1
+	ARCHNAME=$2
+	DIRNAME=$3
+	LIPOARCH=$4
+
+	if [ ! -d $BUILDDIR ]; then
+		echo ERROR: $ARCHNAME build directory $BUILDDIR does not exist
 		exit 1
 	fi
-	if [ ! -f $BUILDDIRARMV6/Makefile ]; then
-		echo ERROR: ARMv6 build directory $BUILDDIRARMV6 is not configured
+	if [ ! -f $BUILDDIR/Makefile ]; then
+		echo ERROR: $ARCHNAME build directory $BUILDDIR is not configured
 		exit 1
 	fi
-	mkdir -p $TMPDIR/dist.armv6
-	pushd $BUILDDIRARMV6
-	make install DESTDIR=$TMPDIR/dist.armv6
+	mkdir -p $TMPDIR/dist.$DIRNAME
+	pushd $BUILDDIR
+	make install DESTDIR=$TMPDIR/dist.$DIRNAME
 	popd
-	if [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-	elif [ ! -h $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
-	fi
+	$LIPO -create \
+		$PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$LIBDIR/$LIBJPEG_DSO_NAME \
+		-output $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
 	$LIPO -create \
 		$PKGROOT/$LIBDIR/libjpeg.a \
-		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libjpeg.a \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$LIBDIR/libjpeg.a \
 		-output $PKGROOT/$LIBDIR/libjpeg.a
 	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
-		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.0.dylib \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
+		$PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$LIBDIR/$TURBOJPEG_DSO_NAME \
+		-output $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
 	$LIPO -create \
 		$PKGROOT/$LIBDIR/libturbojpeg.a \
-		-arch arm $TMPDIR/dist.armv6/$LIBDIR/libturbojpeg.a \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$LIBDIR/libturbojpeg.a \
 		-output $PKGROOT/$LIBDIR/libturbojpeg.a
 	$LIPO -create \
 		$PKGROOT/$BINDIR/cjpeg \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/cjpeg \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/cjpeg \
 		-output $PKGROOT/$BINDIR/cjpeg
 	$LIPO -create \
 		$PKGROOT/$BINDIR/djpeg \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/djpeg \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/djpeg \
 		-output $PKGROOT/$BINDIR/djpeg
 	$LIPO -create \
 		$PKGROOT/$BINDIR/jpegtran \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/jpegtran \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/jpegtran \
 		-output $PKGROOT/$BINDIR/jpegtran
 	$LIPO -create \
 		$PKGROOT/$BINDIR/tjbench \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/tjbench \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/tjbench \
 		-output $PKGROOT/$BINDIR/tjbench
 	$LIPO -create \
 		$PKGROOT/$BINDIR/rdjpgcom \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/rdjpgcom \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/rdjpgcom \
 		-output $PKGROOT/$BINDIR/rdjpgcom
 	$LIPO -create \
 		$PKGROOT/$BINDIR/wrjpgcom \
-		-arch arm $TMPDIR/dist.armv6/$BINDIR/wrjpgcom \
+		-arch $LIPOARCH $TMPDIR/dist.$DIRNAME/$BINDIR/wrjpgcom \
 		-output $PKGROOT/$BINDIR/wrjpgcom
+}
+
+if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7" != "" ]; then
+	install_ios $BUILDDIRARMV7 ARMv7 armv7 arm
 fi
 
-if [ $BUILDARMV7 = 1 ]; then
-	if [ ! -d $BUILDDIRARMV7 ]; then
-		echo ERROR: ARMv7 build directory $BUILDDIRARMV7 does not exist
-		exit 1
-	fi
-	if [ ! -f $BUILDDIRARMV7/Makefile ]; then
-		echo ERROR: ARMv7 build directory $BUILDDIRARMV7 is not configured
-		exit 1
-	fi
-	mkdir -p $TMPDIR/dist.armv7
-	pushd $BUILDDIRARMV7
-	make install DESTDIR=$TMPDIR/dist.armv7
-	popd
-	if [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-	elif [ ! -h $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
-	fi
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libjpeg.a \
-		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libjpeg.a \
-		-output $PKGROOT/$LIBDIR/libjpeg.a
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
-		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.0.dylib \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.a \
-		-arch arm $TMPDIR/dist.armv7/$LIBDIR/libturbojpeg.a \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	$LIPO -create \
-		$PKGROOT/$BINDIR/cjpeg \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/cjpeg \
-		-output $PKGROOT/$BINDIR/cjpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/djpeg \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/djpeg \
-		-output $PKGROOT/$BINDIR/djpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/jpegtran \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/jpegtran \
-		-output $PKGROOT/$BINDIR/jpegtran
-	$LIPO -create \
-		$PKGROOT/$BINDIR/tjbench \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/tjbench \
-		-output $PKGROOT/$BINDIR/tjbench
-	$LIPO -create \
-		$PKGROOT/$BINDIR/rdjpgcom \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/rdjpgcom \
-		-output $PKGROOT/$BINDIR/rdjpgcom
-	$LIPO -create \
-		$PKGROOT/$BINDIR/wrjpgcom \
-		-arch arm $TMPDIR/dist.armv7/$BINDIR/wrjpgcom \
-		-output $PKGROOT/$BINDIR/wrjpgcom
+if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7S" != "" ]; then
+	install_ios $BUILDDIRARMV7S ARMv7s armv7s arm
 fi
 
-if [ $BUILDARMV7S = 1 ]; then
-	if [ ! -d $BUILDDIRARMV7S ]; then
-		echo ERROR: ARMv7s build directory $BUILDDIRARMV7S does not exist
-		exit 1
-	fi
-	if [ ! -f $BUILDDIRARMV7S/Makefile ]; then
-		echo ERROR: ARMv7s build directory $BUILDDIRARMV7S is not configured
-		exit 1
-	fi
-	mkdir -p $TMPDIR/dist.armv7s
-	pushd $BUILDDIRARMV7S
-	make install DESTDIR=$TMPDIR/dist.armv7s
-	popd
-	if [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-	elif [ ! -h $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
-	fi
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libjpeg.a \
-		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libjpeg.a \
-		-output $PKGROOT/$LIBDIR/libjpeg.a
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
-		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.0.dylib \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.a \
-		-arch arm $TMPDIR/dist.armv7s/$LIBDIR/libturbojpeg.a \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	$LIPO -create \
-		$PKGROOT/$BINDIR/cjpeg \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/cjpeg \
-		-output $PKGROOT/$BINDIR/cjpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/djpeg \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/djpeg \
-		-output $PKGROOT/$BINDIR/djpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/jpegtran \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/jpegtran \
-		-output $PKGROOT/$BINDIR/jpegtran
-	$LIPO -create \
-		$PKGROOT/$BINDIR/tjbench \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/tjbench \
-		-output $PKGROOT/$BINDIR/tjbench
-	$LIPO -create \
-		$PKGROOT/$BINDIR/rdjpgcom \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/rdjpgcom \
-		-output $PKGROOT/$BINDIR/rdjpgcom
-	$LIPO -create \
-		$PKGROOT/$BINDIR/wrjpgcom \
-		-arch arm $TMPDIR/dist.armv7s/$BINDIR/wrjpgcom \
-		-output $PKGROOT/$BINDIR/wrjpgcom
+if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV8" != "" ]; then
+	install_ios $BUILDDIRARMV8 ARMv8 armv8 arm64
 fi
 
-if [ $BUILDARMV8 = 1 ]; then
-	if [ ! -d $BUILDDIRARMV8 ]; then
-		echo ERROR: ARMv8 build directory $BUILDDIRARMV8 does not exist
-		exit 1
-	fi
-	if [ ! -f $BUILDDIRARMV8/Makefile ]; then
-		echo ERROR: ARMv8 build directory $BUILDDIRARMV8 is not configured
-		exit 1
-	fi
-	mkdir -p $TMPDIR/dist.armv8
-	pushd $BUILDDIRARMV8
-	make install DESTDIR=$TMPDIR/dist.armv8
-	popd
-	if [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-	elif [ ! -h $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib -a \
-		! -h $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib ]; then
-		$LIPO -create \
-			$PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib \
-			-output $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.0.@SO_MINOR_VERSION@.dylib
-	fi
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libjpeg.a \
-		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libjpeg.a \
-		-output $PKGROOT/$LIBDIR/libjpeg.a
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.0.dylib \
-		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.0.dylib \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
-	$LIPO -create \
-		$PKGROOT/$LIBDIR/libturbojpeg.a \
-		-arch arm64 $TMPDIR/dist.armv8/$LIBDIR/libturbojpeg.a \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	$LIPO -create \
-		$PKGROOT/$BINDIR/cjpeg \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/cjpeg \
-		-output $PKGROOT/$BINDIR/cjpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/djpeg \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/djpeg \
-		-output $PKGROOT/$BINDIR/djpeg
-	$LIPO -create \
-		$PKGROOT/$BINDIR/jpegtran \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/jpegtran \
-		-output $PKGROOT/$BINDIR/jpegtran
-	$LIPO -create \
-		$PKGROOT/$BINDIR/tjbench \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/tjbench \
-		-output $PKGROOT/$BINDIR/tjbench
-	$LIPO -create \
-		$PKGROOT/$BINDIR/rdjpgcom \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/rdjpgcom \
-		-output $PKGROOT/$BINDIR/rdjpgcom
-	$LIPO -create \
-		$PKGROOT/$BINDIR/wrjpgcom \
-		-arch arm64 $TMPDIR/dist.armv8/$BINDIR/wrjpgcom \
-		-output $PKGROOT/$BINDIR/wrjpgcom
-fi
-
-install_name_tool -id $LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib $PKGROOT/$LIBDIR/libjpeg.@SO_MAJOR_VERSION@.dylib
-install_name_tool -id $LIBDIR/libturbojpeg.0.dylib $PKGROOT/$LIBDIR/libturbojpeg.0.dylib
+install_name_tool -id $LIBDIR/$LIBJPEG_DSO_NAME $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
+install_name_tool -id $LIBDIR/$TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
 
 if [ $WITH_JAVA = 1 ]; then
-	ln -fs libturbojpeg.0.dylib $PKGROOT/$LIBDIR/libturbojpeg.jnilib
+	ln -fs $TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/libturbojpeg.jnilib
 fi
-if [ "$PREFIX" = "/opt/libjpeg-turbo" -a "$LIBDIR" = "/opt/libjpeg-turbo/lib" ]; then
+if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$LIBDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/lib" ]; then
 	if [ ! -h $PKGROOT/$PREFIX/lib32 ]; then
 		ln -fs lib $PKGROOT/$PREFIX/lib32
 	fi
@@ -458,13 +256,13 @@
 cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadMe.txt $TMPDIR/pkg/
 
 mkdir $TMPDIR/dmg
-pkgbuild --root $PKGROOT --version $VERSION.$BUILD \
-	--identifier com.libjpeg-turbo.libjpeg-turbo $TMPDIR/pkg/$PACKAGE_NAME.pkg
-productbuild --distribution $SRCDIR/release/Distribution.xml \
+pkgbuild --root $PKGROOT --version $VERSION.$BUILD --identifier @PKGID@ \
+	$TMPDIR/pkg/$PKGNAME.pkg
+productbuild --distribution pkgscripts/Distribution.xml \
 	--package-path $TMPDIR/pkg/ --resources $TMPDIR/pkg/ \
-	$TMPDIR/dmg/$PACKAGE_NAME.pkg
-hdiutil create -fs HFS+ -volname $PACKAGE_NAME-$VERSION \
-	-srcfolder "$TMPDIR/dmg" $TMPDIR/$PACKAGE_NAME-$VERSION.dmg
-cp $TMPDIR/$PACKAGE_NAME-$VERSION.dmg .
+	$TMPDIR/dmg/$PKGNAME.pkg
+hdiutil create -fs HFS+ -volname $PKGNAME-$VERSION \
+	-srcfolder "$TMPDIR/dmg" $TMPDIR/$PKGNAME-$VERSION.dmg
+cp $TMPDIR/$PKGNAME-$VERSION.dmg .
 
 exit
diff --git a/release/makerpm.in b/release/makerpm.in
new file mode 100644
index 0000000..fc3b1d4
--- /dev/null
+++ b/release/makerpm.in
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+TMPDIR=
+
+onexit()
+{
+	if [ ! "$TMPDIR" = "" ]; then
+		rm -rf $TMPDIR
+	fi
+}
+
+if [ -f @PKGNAME@-@VERSION@.@RPMARCH@.rpm ]; then
+	rm -f @PKGNAME@-@VERSION@.@RPMARCH@.rpm
+fi
+
+umask 022
+TMPDIR=`mktemp -d /tmp/@CMAKE_PROJECT_NAME@-build.XXXXXX`
+
+mkdir -p $TMPDIR/RPMS
+ln -fs `pwd` $TMPDIR/BUILD
+rpmbuild -bb --define "_blddir $TMPDIR/buildroot" --define "_topdir $TMPDIR" \
+	--target @RPMARCH@ pkgscripts/rpm.spec; \
+cp $TMPDIR/RPMS/@RPMARCH@/@PKGNAME@-@VERSION@-@BUILD@.@RPMARCH@.rpm \
+	@PKGNAME@-@VERSION@.@RPMARCH@.rpm
diff --git a/release/makesrpm.in b/release/makesrpm.in
new file mode 100644
index 0000000..84c39d4
--- /dev/null
+++ b/release/makesrpm.in
@@ -0,0 +1,48 @@
+#!/bin/sh
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+TMPDIR=
+
+onexit()
+{
+	if [ ! "$TMPDIR" = "" ]; then
+		rm -rf $TMPDIR
+	fi
+}
+
+PKGNAME=@PKGNAME@
+PROJECT=@CMAKE_PROJECT_NAME@
+VERSION=@VERSION@
+BUILD=@BUILD@
+
+if [ -f $PKGNAME-$VERSION.src.rpm ]; then
+	rm -f $PKGNAME-$VERSION.src.rpm
+fi
+
+umask 022
+TMPDIR=`mktemp -d /tmp/$PKGNAME-build.XXXXXX`
+
+mkdir -p $TMPDIR/RPMS
+mkdir -p $TMPDIR/SRPMS
+mkdir -p $TMPDIR/BUILD
+mkdir -p $TMPDIR/SOURCES
+mkdir -p $TMPDIR/SPECS
+
+if [ ! -f $PROJECT-$VERSION.tar.gz ]; then
+	echo "ERROR: $PROJECT-$VERSION.tar.gz does not exist."; exit 1
+fi
+
+cp $PROJECT-$VERSION.tar.gz $TMPDIR/SOURCES/$PROJECT-$VERSION.tar.gz
+
+cat pkgscripts/rpm.spec | sed s/%{_blddir}/%{_tmppath}/g \
+	| sed s/#--\>//g > $TMPDIR/SPECS/$PKGNAME.spec
+
+rpmbuild -bs --define "_topdir $TMPDIR" $TMPDIR/SPECS/$PKGNAME.spec
+mv $TMPDIR/SRPMS/$PKGNAME-$VERSION-$BUILD.src.rpm $PKGNAME-$VERSION.src.rpm
+
+exit
diff --git a/release/maketarball.in b/release/maketarball.in
new file mode 100644
index 0000000..00a9c7e
--- /dev/null
+++ b/release/maketarball.in
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+
+TMPDIR=
+SUDO=
+
+onexit()
+{
+	if [ ! "$TMPDIR" = "" ]; then
+		rm -rf $TMPDIR
+	fi
+}
+
+uid()
+{
+	id | cut -f2 -d = | cut -f1 -d \(;
+}
+
+PKGNAME=@PKGNAME@
+VERSION=@VERSION@
+ARCH=@CPU_TYPE@
+OS=@CMAKE_SYSTEM_NAME@
+PREFIX=@CMAKE_INSTALL_PREFIX@
+
+umask 022
+rm -f $PKGNAME-$VERSION-$OS-$ARCH.tar.bz2
+TMPDIR=`mktemp -d /tmp/$PKGNAME-build.XXXXXX`
+mkdir -p $TMPDIR/install
+
+make install DESTDIR=$TMPDIR/install
+echo tartest >$TMPDIR/tartest
+GNUTAR=0
+BSDTAR=0
+tar cf $TMPDIR/tartest.tar --owner=root --group=root -C $TMPDIR tartest >/dev/null 2>&1 && GNUTAR=1
+if [ "$GNUTAR" = "1" ]; then
+	tar cf - --owner=root --group=root -C $TMPDIR/install .$PREFIX | bzip2 -c >$PKGNAME-$VERSION-$OS-$ARCH.tar.bz2
+else
+	tar cf $TMPDIR/tartest.tar --uid 0 --gid 0 -C $TMPDIR tartest >/dev/null 2>&1 && BSDTAR=1
+	if [ "$BSDTAR" = "1" ]; then
+		tar cf - --uid=0 --gid=0 -C $TMPDIR/install .$PREFIX | bzip2 -c >$PKGNAME-$VERSION-$OS-$ARCH.tar.bz2
+	else
+		tar cf - -C $TMPDIR/install .$PREFIX | bzip2 -c >$PKGNAME-$VERSION-$OS-$ARCH.tar.bz2
+	fi
+fi
+
+exit
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
new file mode 100644
index 0000000..54101bc
--- /dev/null
+++ b/release/rpm.spec.in
@@ -0,0 +1,220 @@
+%define _prefix @CMAKE_INSTALL_PREFIX@
+%define _bindir @CMAKE_INSTALL_FULL_BINDIR@
+%define _datarootdir @CMAKE_INSTALL_FULL_DATAROOTDIR@
+%define _docdir %{_defaultdocdir}/%{name}-%{version}
+%define _includedir @CMAKE_INSTALL_FULL_INCLUDEDIR@
+%define _javadir @CMAKE_INSTALL_FULL_JAVADIR@
+%define _mandir @CMAKE_INSTALL_FULL_MANDIR@
+%define _enable_static @ENABLE_STATIC@
+%define _enable_shared @ENABLE_SHARED@
+%define _with_turbojpeg @WITH_TURBOJPEG@
+%define _with_java @WITH_JAVA@
+
+%if "%{?__isa_bits:1}" == "1"
+%define _bits %{__isa_bits}
+%else
+# RPM < 4.6
+%if "%{_lib}" == "lib64"
+%define _bits 64
+%else
+%define _bits 32
+%endif
+%endif
+
+#-->%if 1
+%if "%{_bits}" == "64"
+%define _libdir %{_exec_prefix}/lib64
+%else
+%if "%{_prefix}" == "/opt/libjpeg-turbo"
+%define _libdir %{_exec_prefix}/lib32
+%endif
+%endif
+#-->%else
+%define _libdir @CMAKE_INSTALL_FULL_LIBDIR@
+#-->%endif
+
+Summary: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
+Name: @PKGNAME@
+Version: @VERSION@
+Vendor: @PKGVENDOR@
+URL: @PKGURL@
+Group: System Environment/Libraries
+#-->Source0: http://prdownloads.sourceforge.net/@CMAKE_PROJECT_NAME@/@CMAKE_PROJECT_NAME@-%{version}.tar.gz
+Release: @BUILD@
+License: BSD-style
+BuildRoot: %{_blddir}/%{name}-buildroot-%{version}-%{release}
+Prereq: /sbin/ldconfig
+%if "%{_bits}" == "64"
+Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{release}, libturbojpeg.so()(64bit)
+%else
+Provides: %{name} = %{version}-%{release}, @CMAKE_PROJECT_NAME@ = %{version}-%{release}, libturbojpeg.so
+%endif
+
+%description
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2,
+AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression
+on x86, x86-64, ARM, and PowerPC systems.  On such systems, libjpeg-turbo is
+generally 2-6x as fast as libjpeg, all else being equal.  On other types of
+systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by
+virtue of its highly-optimized Huffman coding routines.  In many cases, the
+performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+
+libjpeg-turbo implements both the traditional libjpeg API as well as the less
+powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
+colorspace extensions that allow it to compress from/decompress to 32-bit and
+big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java
+interface.
+
+libjpeg-turbo was originally based on libjpeg/SIMD, an MMX-accelerated
+derivative of libjpeg v6b developed by Miyasaka Masaru.  The TigerVNC and
+VirtualGL projects made numerous enhancements to the codec in 2009, and in
+early 2010, libjpeg-turbo spun off into an independent project, with the goal
+of making high-speed JPEG compression/decompression technology available to a
+broader range of users and developers.
+
+#-->%prep
+#-->%setup -q -n @CMAKE_PROJECT_NAME@-%{version}
+
+#-->%build
+#-->cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=@CMAKE_BUILD_TYPE@ \
+#-->  -DBUILD=%{release} \
+#-->  -DCMAKE_INSTALL_BINDIR=%{_bindir} \
+#-->  -DCMAKE_INSTALL_DATAROOTDIR=%{_datarootdir} \
+#-->  -DCMAKE_INSTALL_DOCDIR=%{_docdir} \
+#-->  -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir} \
+#-->  -DCMAKE_INSTALL_JAVADIR=%{_javadir} \
+#-->  -DCMAKE_INSTALL_LIBDIR=%{_libdir} \
+#-->  -DCMAKE_INSTALL_MANDIR=%{_mandir} \
+#-->  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+#-->  -DCMAKE_POSITION_INDEPENDENT_CODE=@CMAKE_POSITION_INDEPENDENT_CODE@ \
+#-->  -DENABLE_SHARED=@ENABLE_SHARED@ -DENABLE_STATIC=@ENABLE_STATIC@ \
+#-->  -DSO_MAJOR_VERSION=@SO_MAJOR_VERSION@ \
+#-->  -DSO_MINOR_VERSION=@SO_MINOR_VERSION@ \
+#-->  -DJPEG_LIB_VERSION=@JPEG_LIB_VERSION@ \
+#-->  -DREQUIRE_SIMD=@REQUIRE_SIMD@ \
+#-->  -DWITH_12BIT=@WITH_12BIT@ -DWITH_ARITH_DEC=@WITH_ARITH_DEC@ \
+#-->  -DWITH_ARITH_ENC=@WITH_ARITH_ENC@ -DWITH_JAVA=@WITH_JAVA@ \
+#-->  -DWITH_JPEG7=@WITH_JPEG7@ -DWITH_JPEG8=@WITH_JPEG8@ \
+#-->  -DWITH_MEM_SRCDST=@WITH_MEM_SRCDST@ -DWITH_SIMD=@WITH_SIMD@ \
+#-->  -DWITH_TURBOJPEG=@WITH_TURBOJPEG@ .
+#-->make DESTDIR=$RPM_BUILD_ROOT
+
+%install
+
+rm -rf $RPM_BUILD_ROOT
+make install DESTDIR=$RPM_BUILD_ROOT
+/sbin/ldconfig -n $RPM_BUILD_ROOT%{_libdir}
+
+#-->%if 0
+
+# This is only needed to support in-tree RPM generation via 'make rpm'.  When
+# building from a SRPM, we control where things are installed via CMake
+# variables.
+
+safedirmove ()	# usage: safedirmove <source dir> <dest dir> <scratch dir>
+{
+	if [ "$1" = "$2" ]; then
+		return 0
+	fi
+	if [ "$1" = "" -o ! -d "$1" ]; then
+		echo safedirmove: source dir $1 is not valid
+		return 1
+	fi
+	if [ "$2" = "" -o -e "$2" ]; then
+		echo safedirmove: dest dir $2 is not valid
+		return 1
+	fi
+	if [ "$3" = "" -o -e "$3" ]; then
+		echo safedirmove: tmp dir $3 is not valid
+		return 1
+	fi
+	mkdir -p "$3"	# contents are parked in the scratch dir first ...
+	mv "$1"/* "$3"/
+	rmdir "$1"
+	mkdir -p "$2"	# ... and only then moved into the destination
+	mv "$3"/* "$2"/
+	rmdir "$3"
+	return 0
+}
+
+LJT_DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
+if [ ! "$LJT_DOCDIR" = "%{_docdir}" ]; then
+	safedirmove "$RPM_BUILD_ROOT/$LJT_DOCDIR" "$RPM_BUILD_ROOT/%{_docdir}" "$RPM_BUILD_ROOT/__tmpdoc"
+fi
+
+#-->%endif
+
+LJT_DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
+if [ "%{_prefix}" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$LJT_DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
+	ln -fs %{_docdir} $RPM_BUILD_ROOT/$LJT_DOCDIR
+fi
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%dir %{_docdir}
+%doc %{_docdir}/*
+%dir %{_prefix}
+%if "%{_prefix}" == "@CMAKE_INSTALL_DEFAULT_PREFIX@" && "%{_docdir}" != "%{_prefix}/doc"
+ %{_prefix}/doc
+%endif
+%dir %{_bindir}
+%{_bindir}/cjpeg
+%{_bindir}/djpeg
+%{_bindir}/jpegtran
+%if "%{_with_turbojpeg}" == "1"
+ %{_bindir}/tjbench
+%endif
+%{_bindir}/rdjpgcom
+%{_bindir}/wrjpgcom
+%dir %{_libdir}
+%if "%{_enable_shared}" == "1"
+ %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@.@SO_AGE@.@SO_MINOR_VERSION@
+ %{_libdir}/libjpeg.so.@SO_MAJOR_VERSION@
+ %{_libdir}/libjpeg.so
+%endif
+%if "%{_enable_static}" == "1"
+ %{_libdir}/libjpeg.a
+%endif
+%dir %{_libdir}/pkgconfig
+%{_libdir}/pkgconfig/libjpeg.pc
+%if "%{_with_turbojpeg}" == "1"
+ %if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
+  %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@
+  %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_MAJOR_VERSION@
+  %{_libdir}/libturbojpeg.so
+ %endif
+ %if "%{_enable_static}" == "1"
+  %{_libdir}/libturbojpeg.a
+ %endif
+ %{_libdir}/pkgconfig/libturbojpeg.pc
+%endif
+%dir %{_includedir}
+%{_includedir}/jconfig.h
+%{_includedir}/jerror.h
+%{_includedir}/jmorecfg.h
+%{_includedir}/jpeglib.h
+%if "%{_with_turbojpeg}" == "1"
+ %{_includedir}/turbojpeg.h
+%endif
+%dir %{_mandir}
+%dir %{_mandir}/man1
+%{_mandir}/man1/cjpeg.1*
+%{_mandir}/man1/djpeg.1*
+%{_mandir}/man1/jpegtran.1*
+%{_mandir}/man1/rdjpgcom.1*
+%{_mandir}/man1/wrjpgcom.1*
+%if "%{_prefix}" != "%{_datarootdir}"
+ %dir %{_datarootdir}
+%endif
+%if "%{_with_java}" == "1"
+ %dir %{_javadir}
+ %{_javadir}/turbojpeg.jar
+%endif
+%changelog
diff --git a/release/uninstall.in b/release/uninstall.in
index 6cd1f86..cf1ba77 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -31,15 +31,15 @@
 	exit -1
 fi
 
-PACKAGE=@PKGNAME@
-MACPACKAGE=com.$PACKAGE.$PACKAGE
-RECEIPT=/Library/Receipts/$PACKAGE.pkg
+PKGNAME=@PKGNAME@
+PKGID=@PKGID@
+RECEIPT=/Library/Receipts/$PKGNAME.pkg
 
 LSBOM=
 if [ -d $RECEIPT ]; then
 	LSBOM='lsbom -s -f -l '$RECEIPT'/Contents/Archive.bom'
 else
-	LSBOM='pkgutil --files '$MACPACKAGE
+	LSBOM='pkgutil --files '$PKGID
 fi
 
 mylsbom()
@@ -56,12 +56,13 @@
 popd
 
 echo Removing package directories ...
-PREFIX=%{__prefix}
-BINDIR=%{__bindir}
-DATADIR=%{__datadir}
-INCLUDEDIR=%{__includedir}
-LIBDIR=%{__libdir}
-MANDIR=%{__mandir}
+PREFIX=@CMAKE_INSTALL_PREFIX@
+BINDIR=@CMAKE_INSTALL_FULL_BINDIR@
+DATAROOTDIR=@CMAKE_INSTALL_FULL_DATAROOTDIR@
+INCLUDEDIR=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+JAVADIR=@CMAKE_INSTALL_FULL_JAVADIR@
+LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
+MANDIR=@CMAKE_INSTALL_FULL_MANDIR@
 
 if [ -d $BINDIR ]; then
 	rmdir $BINDIR 2>&1 || EXITSTATUS=-1
@@ -75,7 +76,7 @@
 if [ -d $INCLUDEDIR ]; then
 	rmdir $INCLUDEDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ "$PREFIX" = "/opt/libjpeg-turbo" -a "$LIBDIR" = "/opt/libjpeg-turbo/lib" ]; then
+if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$LIBDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/lib" ]; then
 	if [ -h $LIBDIR\32 ]; then
 		rm $LIBDIR\32 2>&1 || EXITSTATUS=-1
 	fi
@@ -89,24 +90,24 @@
 if [ -d $MANDIR ]; then
 	rmdir $MANDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ -d $DATADIR/classes ]; then
-	rmdir $DATADIR/classes 2>&1 || EXITSTATUS=-1
+if [ -d $JAVADIR ]; then
+	rmdir $JAVADIR 2>&1 || EXITSTATUS=-1
 fi
-if [ -d $DATADIR -a "$DATADIR" != "$PREFIX" ]; then
-	rmdir $DATADIR 2>&1 || EXITSTATUS=-1
+if [ -d $DATAROOTDIR -a "$DATAROOTDIR" != "$PREFIX" ]; then
+	rmdir $DATAROOTDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ "$PREFIX" = "/opt/libjpeg-turbo" -a -h "$PREFIX/doc" ]; then
+if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a -h "$PREFIX/doc" ]; then
 	rm $PREFIX/doc 2>&1 || EXITSTATUS=-1
 fi
 rmdir $PREFIX 2>&1 || EXITSTATUS=-1
-rmdir /Library/Documentation/$PACKAGE 2>&1 || EXITSTATUS=-1
+rmdir /Library/Documentation/$PKGNAME 2>&1 || EXITSTATUS=-1
 
 if [ -d $RECEIPT ]; then
 	echo Removing package receipt ...
 	rm -r $RECEIPT 2>&1 || EXITSTATUS=-1
 else
-	echo Forgetting package $MACPACKAGE ...
-	pkgutil --forget $MACPACKAGE
+	echo Forgetting package $PKGID ...
+	pkgutil --forget $PKGID
 fi
 
 exit $EXITSTATUS
diff --git a/sharedlib/CMakeLists.txt b/sharedlib/CMakeLists.txt
index d423cce..95aed25 100755
--- a/sharedlib/CMakeLists.txt
+++ b/sharedlib/CMakeLists.txt
@@ -5,6 +5,10 @@
 # better yet, provide a friendly way of configuring a Windows target to use the
 # static C library.
 
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
+
 if(MSVC)
   # Build all configurations against shared C library
   foreach(var CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
@@ -16,38 +20,50 @@
 endif()
 
 foreach(src ${JPEG_SOURCES})
-  set(JPEG_SRCS ${JPEG_SRCS} ${CMAKE_SOURCE_DIR}/${src})
+  set(JPEG_SRCS ${JPEG_SRCS} ../${src})
 endforeach()
 
-if(WITH_SIMD)
+if(WITH_SIMD AND MSVC_IDE)
   # This tells CMake that the "source" files haven't been generated yet
   set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
 endif()
 
-if(WITH_MEM_SRCDST AND NOT WITH_JPEG8)
-  add_library(jpeg SHARED ${JPEG_SRCS} ${SIMD_OBJS}
-    ${CMAKE_SOURCE_DIR}/win/jpeg${DLL_VERSION}-memsrcdst.def)
-else()
-  add_library(jpeg SHARED ${JPEG_SRCS} ${SIMD_OBJS}
-    ${CMAKE_SOURCE_DIR}/win/jpeg${DLL_VERSION}.def)
+if(WIN32)
+  if(WITH_MEM_SRCDST)
+    set(DEFFILE ../win/jpeg${SO_MAJOR_VERSION}-memsrcdst.def)
+  else()
+    set(DEFFILE ../win/jpeg${SO_MAJOR_VERSION}.def)
+  endif()
 endif()
-set_target_properties(jpeg PROPERTIES SOVERSION ${DLL_VERSION}
-  VERSION ${FULLVERSION})
+add_library(jpeg SHARED ${JPEG_SRCS} ${DEFFILE} $<TARGET_OBJECTS:simd>
+  ${SIMD_OBJS})
+
+set_target_properties(jpeg PROPERTIES SOVERSION ${SO_MAJOR_VERSION}
+  VERSION ${SO_MAJOR_VERSION}.${SO_AGE}.${SO_MINOR_VERSION})
+if(APPLE)
+  set_target_properties(jpeg PROPERTIES MACOSX_RPATH 1)
+endif()
+if(MAPFLAG)
+  set_target_properties(jpeg PROPERTIES
+    LINK_FLAGS "${MAPFLAG}${CMAKE_CURRENT_BINARY_DIR}/../libjpeg.map")
+endif()
 if(MSVC)
-  set_target_properties(jpeg PROPERTIES SUFFIX ${DLL_VERSION}.dll)
-elseif(MINGW OR CYGWIN)
-  set_target_properties(jpeg PROPERTIES SUFFIX -${DLL_VERSION}.dll)
-endif(MSVC)
-if(WITH_SIMD)
-  add_dependencies(jpeg simd)
+  set_target_properties(jpeg PROPERTIES SUFFIX ${SO_MAJOR_VERSION}.dll)
+  # The jsimd_*.c file is built using /MT, so this prevents a linker warning.
+  set_target_properties(jpeg PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT /NODEFAULTLIB:LIBCMTD")
+elseif(MINGW)
+  set_target_properties(jpeg PROPERTIES SUFFIX -${SO_MAJOR_VERSION}.dll)
 endif()
 
+if(WIN32)
+  set(USE_SETMODE "-DUSE_SETMODE")
+endif()
 if(WITH_12BIT)
-  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED -DUSE_SETMODE")
+  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED ${USE_SETMODE}")
 else()
-  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED -DUSE_SETMODE")
-	set(CJPEG_BMP_SOURCES ../rdbmp.c ../rdtarga.c)
-	set(DJPEG_BMP_SOURCES ../wrbmp.c ../wrtarga.c)
+  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED ${USE_SETMODE}")
+  set(CJPEG_BMP_SOURCES ../rdbmp.c ../rdtarga.c)
+  set(DJPEG_BMP_SOURCES ../wrbmp.c ../wrtarga.c)
 endif()
 
 add_executable(cjpeg ../cjpeg.c ../cdjpeg.c ../rdgif.c ../rdppm.c
@@ -62,12 +78,12 @@
 
 add_executable(jpegtran ../jpegtran.c ../cdjpeg.c ../rdswitch.c ../transupp.c)
 target_link_libraries(jpegtran jpeg)
-set_property(TARGET jpegtran PROPERTY COMPILE_FLAGS "-DUSE_SETMODE")
+set_property(TARGET jpegtran PROPERTY COMPILE_FLAGS "${USE_SETMODE}")
 
 add_executable(jcstest ../jcstest.c)
 target_link_libraries(jcstest jpeg)
 
 install(TARGETS jpeg cjpeg djpeg jpegtran
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-  RUNTIME DESTINATION bin)
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index 6e898d8..624350c 100755
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -1,81 +1,352 @@
-if(NOT DEFINED NASM)
-  find_program(NASM NAMES nasm yasm DOC "Path to NASM/YASM executable")
-endif()
-message(STATUS "NASM = ${NASM}")
-
-if(SIMD_X86_64)
-  set(NAFLAGS -fwin64 -DWIN64 -D__x86_64__)
-else()
-  if(BORLAND)
-    set(NAFLAGS -fobj -DOBJ32)
+macro(simd_fail message)  # report that the SIMD extensions cannot be built
+  if(REQUIRE_SIMD)
+    message(FATAL_ERROR "${message}.")  # SIMD was required: stop configuring
   else()
-    set(NAFLAGS -fwin32 -DWIN32)
+    message(WARNING "${message}.  Performance will suffer.")
+    set(WITH_SIMD 0 PARENT_SCOPE)  # sets WITH_SIMD to 0 in the parent scope
+  endif()
+endmacro()
+
+
+###############################################################################
+# x86[-64] (NASM)
+###############################################################################
+
+if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
+
+set(CMAKE_ASM_NASM_FLAGS_DEBUG_INIT "-g")
+set(CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO_INIT "-g")
+
+# Allow the location of the NASM executable to be specified using the ASM_NASM
+# environment variable.  This should happen automatically, but unfortunately
+# enable_language(ASM_NASM) doesn't parse the ASM_NASM environment variable
+# until after CMAKE_ASM_NASM_COMPILER has been populated with the results of
+# searching for NASM or YASM in the PATH.
+if(NOT DEFINED CMAKE_ASM_NASM_COMPILER AND DEFINED ENV{ASM_NASM})
+  set(CMAKE_ASM_NASM_COMPILER $ENV{ASM_NASM})
+endif()
+
+if(CPU_TYPE STREQUAL "x86_64")
+  if(CYGWIN)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
+  endif()
+elseif(CPU_TYPE STREQUAL "i386")
+  if(BORLAND)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
+  elseif(CYGWIN)
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT win32)
   endif()
 endif()
-set(NAFLAGS ${NAFLAGS} -I${CMAKE_SOURCE_DIR}/win/ -I${CMAKE_CURRENT_SOURCE_DIR}/)
 
-# This only works if building from the command line.  There is currently no way
-# to set a variable's value based on the build type when using the MSVC IDE.
-if(CMAKE_BUILD_TYPE STREQUAL "Debug"
-  OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-  set(NAFLAGS ${NAFLAGS} -g)
+enable_language(ASM_NASM)
+message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
+
+if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^macho")  # MATCHES takes a regex, not a glob
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DMACHO")
+elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf")  # anchored: format name starts with "elf"
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DELF")
+  set(CMAKE_ASM_NASM_DEBUG_FORMAT "dwarf2")
+endif()
+if(CPU_TYPE STREQUAL "x86_64")
+  if(WIN32 OR CYGWIN)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN64")
+  endif()
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -D__x86_64__")
+elseif(CPU_TYPE STREQUAL "i386")
+  if(BORLAND)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DOBJ32")
+  elseif(WIN32 OR CYGWIN)
+    set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DWIN32")
+  endif()
 endif()
 
-if(SIMD_X86_64)
-  set(SIMD_BASENAMES jfdctflt-sse-64 jccolor-sse2-64 jcgray-sse2-64
-    jchuff-sse2-64 jcsample-sse2-64 jdcolor-sse2-64 jdmerge-sse2-64
-    jdsample-sse2-64 jfdctfst-sse2-64 jfdctint-sse2-64 jidctflt-sse2-64
-    jidctfst-sse2-64 jidctint-sse2-64 jidctred-sse2-64 jquantf-sse2-64
-    jquanti-sse2-64)
-  message(STATUS "Building x86_64 SIMD extensions")
+message(STATUS "CMAKE_ASM_NASM_OBJECT_FORMAT = ${CMAKE_ASM_NASM_OBJECT_FORMAT}")
+
+if(NOT CMAKE_ASM_NASM_OBJECT_FORMAT)
+  simd_fail("SIMD extensions disabled: could not determine NASM object format")
+  return()
+endif()
+
+get_filename_component(CMAKE_ASM_NASM_COMPILER_TYPE
+  "${CMAKE_ASM_NASM_COMPILER}" NAME_WE)
+if(CMAKE_ASM_NASM_COMPILER_TYPE MATCHES "yasm")
+  foreach(var CMAKE_ASM_NASM_FLAGS_DEBUG CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO)
+    if(${var} STREQUAL "-g")
+      if(CMAKE_ASM_NASM_DEBUG_FORMAT)
+        set_property(CACHE ${var} PROPERTY VALUE "-g ${CMAKE_ASM_NASM_DEBUG_FORMAT}")
+      else()
+        set_property(CACHE ${var} PROPERTY VALUE "")
+      endif()
+    endif()
+  endforeach()
+endif()
+
+if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPIC")
+endif()
+
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC)  # quoted: may be empty
+set(EFFECTIVE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} ${CMAKE_ASM_NASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_NASM_FLAGS = ${EFFECTIVE_ASM_NASM_FLAGS}")
+
+set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/nasm/\" -I\"${CMAKE_CURRENT_SOURCE_DIR}/${CPU_TYPE}/\"")
+
+if(WIN32)
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/../win/\"")
+  set(JSIMDCFG_INC ${CMAKE_CURRENT_SOURCE_DIR}/../win/jsimdcfg.inc)
 else()
-  set(SIMD_BASENAMES jsimdcpu jfdctflt-3dn jidctflt-3dn jquant-3dn jccolor-mmx
-    jcgray-mmx jcsample-mmx jdcolor-mmx jdmerge-mmx jdsample-mmx jfdctfst-mmx
-    jfdctint-mmx jidctfst-mmx jidctint-mmx jidctred-mmx jquant-mmx jfdctflt-sse
-    jidctflt-sse jquant-sse jccolor-sse2 jcgray-sse2 jchuff-sse2 jcsample-sse2
-    jdcolor-sse2 jdmerge-sse2 jdsample-sse2 jfdctfst-sse2 jfdctint-sse2
-    jidctflt-sse2 jidctfst-sse2 jidctint-sse2 jidctred-sse2 jquantf-sse2
-    jquanti-sse2)
-  message(STATUS "Building i386 SIMD extensions")
+  set(GREP grep)
+  if(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+    set(GREP ggrep)
+  endif()
+  add_custom_command(OUTPUT jsimdcfg.inc
+    COMMAND ${CMAKE_C_COMPILER} -E -I${CMAKE_BINARY_DIR} -I${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/jsimdcfg.inc.h | ${GREP} -E '^[\;%]|^\ %' | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' >jsimdcfg.inc)
+  set(JSIMDCFG_INC ${CMAKE_CURRENT_BINARY_DIR}/jsimdcfg.inc)
+  set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -I\"${CMAKE_CURRENT_BINARY_DIR}/\"")
+endif()
+
+if(CPU_TYPE STREQUAL "x86_64")
+  set(SIMD_SOURCES x86_64/jsimdcpu.asm x86_64/jfdctflt-sse.asm
+    x86_64/jccolor-sse2.asm x86_64/jcgray-sse2.asm x86_64/jchuff-sse2.asm
+    x86_64/jcsample-sse2.asm x86_64/jdcolor-sse2.asm x86_64/jdmerge-sse2.asm
+    x86_64/jdsample-sse2.asm x86_64/jfdctfst-sse2.asm x86_64/jfdctint-sse2.asm
+    x86_64/jidctflt-sse2.asm x86_64/jidctfst-sse2.asm x86_64/jidctint-sse2.asm
+    x86_64/jidctred-sse2.asm x86_64/jquantf-sse2.asm x86_64/jquanti-sse2.asm
+    x86_64/jccolor-avx2.asm x86_64/jcgray-avx2.asm x86_64/jcsample-avx2.asm
+    x86_64/jdcolor-avx2.asm x86_64/jdmerge-avx2.asm x86_64/jdsample-avx2.asm
+    x86_64/jfdctint-avx2.asm x86_64/jidctint-avx2.asm x86_64/jquanti-avx2.asm)
+else()
+  set(SIMD_SOURCES i386/jsimdcpu.asm i386/jfdctflt-3dn.asm
+    i386/jidctflt-3dn.asm i386/jquant-3dn.asm
+    i386/jccolor-mmx.asm i386/jcgray-mmx.asm i386/jcsample-mmx.asm
+    i386/jdcolor-mmx.asm i386/jdmerge-mmx.asm i386/jdsample-mmx.asm
+    i386/jfdctfst-mmx.asm i386/jfdctint-mmx.asm i386/jidctfst-mmx.asm
+    i386/jidctint-mmx.asm i386/jidctred-mmx.asm i386/jquant-mmx.asm
+    i386/jfdctflt-sse.asm i386/jidctflt-sse.asm i386/jquant-sse.asm
+    i386/jccolor-sse2.asm i386/jcgray-sse2.asm i386/jchuff-sse2.asm
+    i386/jcsample-sse2.asm i386/jdcolor-sse2.asm i386/jdmerge-sse2.asm
+    i386/jdsample-sse2.asm i386/jfdctfst-sse2.asm i386/jfdctint-sse2.asm
+    i386/jidctflt-sse2.asm i386/jidctfst-sse2.asm i386/jidctint-sse2.asm
+    i386/jidctred-sse2.asm i386/jquantf-sse2.asm i386/jquanti-sse2.asm
+    i386/jccolor-avx2.asm i386/jcgray-avx2.asm i386/jcsample-avx2.asm
+    i386/jdcolor-avx2.asm i386/jdmerge-avx2.asm i386/jdsample-avx2.asm
+    i386/jfdctint-avx2.asm i386/jidctint-avx2.asm i386/jquanti-avx2.asm)
 endif()
 
 if(MSVC_IDE)
   set(OBJDIR "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}")
-else()
-  set(OBJDIR ${CMAKE_CURRENT_BINARY_DIR})
+  string(REGEX REPLACE " " ";" CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}")
 endif()
 
-file(GLOB INC_FILES *.inc)
+file(GLOB INC_FILES nasm/*.inc)
 
-foreach(file ${SIMD_BASENAMES})
-  set(DEPFILE "")
-  set(SIMD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/${file}.asm)
+foreach(file ${SIMD_SOURCES})
+  set(OBJECT_DEPENDS "")
   if(${file} MATCHES jccolor)
-    set(DEPFILE ${file})
-    string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${DEPFILE})
-    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+    string(REGEX REPLACE "jccolor" "jccolext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
   if(${file} MATCHES jcgray)
-    set(DEPFILE ${file})
-    string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${DEPFILE})
-    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+    string(REGEX REPLACE "jcgray" "jcgryext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
   if(${file} MATCHES jdcolor)
-    set(DEPFILE ${file})
-    string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${DEPFILE})
-    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+    string(REGEX REPLACE "jdcolor" "jdcolext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
   if(${file} MATCHES jdmerge)
-    set(DEPFILE ${file})
-    string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${DEPFILE})
-    set(DEPFILE ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE}.asm)
+    string(REGEX REPLACE "jdmerge" "jdmrgext" DEPFILE ${file})
+    set(OBJECT_DEPENDS ${OBJECT_DEPENDS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${DEPFILE})
   endif()
-  set(SIMD_OBJ ${OBJDIR}/${file}.obj)
-  add_custom_command(OUTPUT ${SIMD_OBJ}
-    DEPENDS ${SIMD_SRC} ${DEPFILE} ${INC_FILES}
-    COMMAND ${NASM} ${NAFLAGS} ${SIMD_SRC} -o${SIMD_OBJ})
-  set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
+  set(OBJECT_DEPENDS ${OBJECT_DEPENDS} ${INC_FILES} ${JSIMDCFG_INC})
+  if(MSVC_IDE)
+    # The CMake Visual Studio generators do not work properly with the ASM_NASM
+    # language, so we have to go rogue here and use a custom command like we
+    # did in prior versions of libjpeg-turbo.  (This is why we can't have nice
+    # things.)
+    string(REGEX REPLACE "${CPU_TYPE}/" "" filename ${file})
+    set(SIMD_OBJ ${OBJDIR}/${filename}.obj)
+    add_custom_command(OUTPUT ${SIMD_OBJ} DEPENDS ${file} ${OBJECT_DEPENDS}
+      COMMAND ${CMAKE_ASM_NASM_COMPILER} -f${CMAKE_ASM_NASM_OBJECT_FORMAT}
+        ${CMAKE_ASM_NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}
+        -o${SIMD_OBJ})
+    set(SIMD_OBJS ${SIMD_OBJS} ${SIMD_OBJ})
+  else()
+    set_source_files_properties(${file} PROPERTIES OBJECT_DEPENDS
+      "${OBJECT_DEPENDS}")
+  endif()
 endforeach()
 
-set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
-add_custom_target(simd DEPENDS ${SIMD_OBJS})
+if(MSVC_IDE)
+  set(SIMD_OBJS ${SIMD_OBJS} PARENT_SCOPE)
+  add_library(simd OBJECT ${CPU_TYPE}/jsimd.c)
+  add_custom_target(simd-objs DEPENDS ${SIMD_OBJS})
+  add_dependencies(simd simd-objs)
+else()
+  add_library(simd OBJECT ${SIMD_SOURCES} ${CPU_TYPE}/jsimd.c)
+endif()
+if(NOT WIN32 AND (CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED))
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# ARM (GAS)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
+
+enable_language(ASM)
+
+set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
+
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC)  # quoted: may be empty
+set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+# Test whether we need gas-preprocessor.pl
+if(CPU_TYPE STREQUAL "arm")
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .arm
+    pld [r0]
+    vmovn.u16 d0, q0")
+else()
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
+    .text
+    MYVAR .req x0
+    movi v0.16b, #100
+    mov MYVAR, #100
+    .unreq MYVAR")
+endif()
+
+separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}")
+
+execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP}
+    -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
+  RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
+if(NOT RESULT EQUAL 0)
+  message(STATUS "GAS appears to be broken.  Trying gas-preprocessor.pl ...")
+  execute_process(COMMAND gas-preprocessor.pl ${CMAKE_ASM_COMPILER}
+      ${CMAKE_ASM_FLAGS_SEP} -x assembler-with-cpp -c
+      ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
+    RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
+  if(NOT RESULT EQUAL 0)
+    simd_fail("SIMD extensions disabled: GAS is not working properly")
+    return()
+  else()
+    message(STATUS "Using gas-preprocessor.pl")
+    configure_file(gas-preprocessor.in gas-preprocessor @ONLY)
+    set(CMAKE_ASM_COMPILER ${CMAKE_CURRENT_BINARY_DIR}/gas-preprocessor)
+  endif()
+else()
+  message(STATUS "GAS is working properly")
+endif()
+
+file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
+
+add_library(simd OBJECT ${CPU_TYPE}/jsimd_neon.S ${CPU_TYPE}/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# MIPS (GAS)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "mips")
+
+enable_language(ASM)
+
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UC)  # quoted: may be empty
+set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+add_library(simd OBJECT ${CPU_TYPE}/jsimd_dspr2.S ${CPU_TYPE}/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+###############################################################################
+# Loongson (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "loongson")
+
+set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
+  loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
+  loongson/jidctint-mmi.c loongson/jquanti-mmi.c)
+
+if(CMAKE_COMPILER_IS_GNUCC)
+  foreach(file ${SIMD_SOURCES})
+    set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+      " -fno-strict-aliasing")
+  endforeach()
+endif()
+
+add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+###############################################################################
+# PowerPC (Intrinsics)
+###############################################################################
+
+elseif(CPU_TYPE STREQUAL "powerpc")
+
+set(CMAKE_REQUIRED_FLAGS -maltivec)
+
+check_c_source_compiles("
+  #include <altivec.h>
+  int main(void) {
+    __vector int vi = { 0, 0, 0, 0 };
+    int i[4];
+    vec_st(vi, 0, i);
+    return i[0];
+  }" HAVE_ALTIVEC)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_ALTIVEC)
+  simd_fail("SIMD extensions not available for this CPU (PowerPC SPE)")
+  return()
+endif()
+
+set(SIMD_SOURCES powerpc/jccolor-altivec.c powerpc/jcgray-altivec.c
+  powerpc/jcsample-altivec.c powerpc/jdcolor-altivec.c
+  powerpc/jdmerge-altivec.c powerpc/jdsample-altivec.c
+  powerpc/jfdctfst-altivec.c powerpc/jfdctint-altivec.c
+  powerpc/jidctfst-altivec.c powerpc/jidctint-altivec.c
+  powerpc/jquanti-altivec.c)
+
+set_source_files_properties(${SIMD_SOURCES} PROPERTIES
+  COMPILE_FLAGS -maltivec)
+
+add_library(simd OBJECT ${SIMD_SOURCES} powerpc/jsimd.c)
+
+if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
+  set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
+endif()
+
+
+###############################################################################
+# None
+###############################################################################
+
+else()
+
+simd_fail("SIMD extensions not available for this CPU (${CMAKE_SYSTEM_PROCESSOR})")
+
+endif() # CPU_TYPE
diff --git a/simd/Makefile.am b/simd/Makefile.am
deleted file mode 100644
index b8660d1..0000000
--- a/simd/Makefile.am
+++ /dev/null
@@ -1,102 +0,0 @@
-noinst_LTLIBRARIES = libsimd.la
-
-BUILT_SOURCES = jsimdcfg.inc
-
-EXTRA_DIST = nasm_lt.sh CMakeLists.txt \
-	jccolext-mmx.asm   jcgryext-mmx.asm   jdcolext-mmx.asm   jdmrgext-mmx.asm \
-	jccolext-sse2.asm  jcgryext-sse2.asm  jdcolext-sse2.asm  jdmrgext-sse2.asm \
-	jccolext-sse2-64.asm  jcgryext-sse2-64.asm  jdcolext-sse2-64.asm \
-	jdmrgext-sse2-64.asm  jccolext-altivec.c    jcgryext-altivec.c \
-	jdcolext-altivec.c    jdmrgext-altivec.c
-
-if SIMD_X86_64
-
-libsimd_la_SOURCES = jsimd_x86_64.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
-	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jfdctflt-sse-64.asm \
-	jccolor-sse2-64.asm   jcgray-sse2-64.asm    jchuff-sse2-64.asm \
-	jcsample-sse2-64.asm  jdcolor-sse2-64.asm   jdmerge-sse2-64.asm \
-	jdsample-sse2-64.asm  jfdctfst-sse2-64.asm  jfdctint-sse2-64.asm \
-	jidctflt-sse2-64.asm  jidctfst-sse2-64.asm  jidctint-sse2-64.asm \
-	jidctred-sse2-64.asm  jquantf-sse2-64.asm   jquanti-sse2-64.asm
-
-jccolor-sse2-64.lo:  jccolext-sse2-64.asm
-jcgray-sse2-64.lo:   jcgryext-sse2-64.asm
-jdcolor-sse2-64.lo:  jdcolext-sse2-64.asm
-jdmerge-sse2-64.lo:  jdmrgext-sse2-64.asm
-
-endif
-
-if SIMD_I386
-
-libsimd_la_SOURCES = jsimd_i386.c jsimd.h jsimdcfg.inc.h jsimdext.inc \
-	jcolsamp.inc jdct.inc jpeg_nbits_table.inc jsimdcpu.asm \
-	jfdctflt-3dn.asm   jidctflt-3dn.asm   jquant-3dn.asm \
-	jccolor-mmx.asm    jcgray-mmx.asm     jcsample-mmx.asm \
-	jdcolor-mmx.asm    jdmerge-mmx.asm    jdsample-mmx.asm \
-	jfdctfst-mmx.asm   jfdctint-mmx.asm   jidctfst-mmx.asm \
-	jidctint-mmx.asm   jidctred-mmx.asm   jquant-mmx.asm \
-	jfdctflt-sse.asm   jidctflt-sse.asm   jquant-sse.asm \
-	jccolor-sse2.asm   jcgray-sse2.asm    jchuff-sse2.asm \
-	jcsample-sse2.asm  jdcolor-sse2.asm   jdmerge-sse2.asm \
-	jdsample-sse2.asm  jfdctfst-sse2.asm  jfdctint-sse2.asm \
-	jidctflt-sse2.asm  jidctfst-sse2.asm  jidctint-sse2.asm \
-	jidctred-sse2.asm  jquantf-sse2.asm   jquanti-sse2.asm
-
-jccolor-mmx.lo:   jccolext-mmx.asm
-jcgray.-mmx.lo:   jcgryext-mmx.asm
-jdcolor-mmx.lo:   jdcolext-mmx.asm
-jdmerge-mmx.lo:   jdmrgext-mmx.asm
-jccolor-sse2.lo:  jccolext-sse2.asm
-jcgray-sse2.lo:   jcgryext-sse2.asm
-jdcolor-sse2.lo:  jdcolext-sse2.asm
-jdmerge-sse2.lo:  jdmrgext-sse2.asm
-
-endif
-
-if SIMD_ARM
-
-libsimd_la_SOURCES = jsimd_arm.c jsimd_arm_neon.S
-
-endif
-
-if SIMD_ARM_64
-
-libsimd_la_SOURCES = jsimd_arm64.c jsimd_arm64_neon.S
-
-endif
-
-if SIMD_MIPS
-
-libsimd_la_SOURCES = jsimd_mips.c jsimd_mips_dspr2_asm.h jsimd_mips_dspr2.S
-
-endif
-
-if SIMD_POWERPC
-
-noinst_LTLIBRARIES += libsimd_altivec.la
-
-libsimd_altivec_la_SOURCES = \
-	jccolor-altivec.c     jcgray-altivec.c      jcsample-altivec.c \
-	jdcolor-altivec.c     jdmerge-altivec.c     jdsample-altivec.c \
-	jfdctfst-altivec.c    jfdctint-altivec.c \
-	jidctfst-altivec.c    jidctint-altivec.c \
-	jquanti-altivec.c
-libsimd_altivec_la_CFLAGS = -maltivec
-
-jccolor-altivec.lo:  jccolext-altivec.c
-jcgray-altivec.lo:   jcgryext-altivec.c
-jdcolor-altivec.lo:  jdcolext-altivec.c
-jdmerge-altivec.lo:  jdmrgext-altivec.c
-
-libsimd_la_SOURCES = jsimd_powerpc.c jsimd_altivec.h jcsample.h
-libsimd_la_LIBADD = libsimd_altivec.la
-
-endif
-
-AM_CPPFLAGS = -I$(top_srcdir)
-
-.asm.lo:
-	$(AM_V_GEN) $(LIBTOOL) $(AM_V_lt) --mode=compile --tag NASM $(srcdir)/nasm_lt.sh $(AM_V_lt) $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@
-
-jsimdcfg.inc: $(srcdir)/jsimdcfg.inc.h ../jpeglib.h ../jconfig.h ../jmorecfg.h
-	$(AM_V_GEN) $(CPP) -I$(top_builddir) -I$(top_builddir)/simd $(srcdir)/jsimdcfg.inc.h | $(EGREP) "^[\;%]|^\ %" | sed 's%_cpp_protection_%%' | sed 's@% define@%define@g' > $@
diff --git a/simd/arm/jsimd.c b/simd/arm/jsimd.c
new file mode 100644
index 0000000..8fcd6e3
--- /dev/null
+++ b/simd/arm/jsimd.c
@@ -0,0 +1,689 @@
+/*
+ * simd/arm/jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2015-2016, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit ARM architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p, *start;                      /* current match, start of the list */
+  size_t len = strlen(feature);
+
+  if (len == 0)
+    return 0;
+  /* Only lines that begin with "Features" are examined. */
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  start = buffer;
+
+  /* Report 'feature' only if it occurs as a whitespace-delimited word.  The
+   * left boundary is tested against the fixed 'start', not the moving search
+   * position, so a rejected match cannot be accepted on a later pass. */
+  for (; (p = strstr(buffer, feature)) != NULL; buffer = p + 1) {
+    if (p > start && !isspace((unsigned char)p[-1]))
+      continue;                         /* preceded by a non-space character */
+    if (p[len] != 0 && !isspace((unsigned char)p[len]))
+      continue;                         /* followed by a non-space character */
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);  /* holds one line at a time */
+  FILE *fd;
+
+  simd_support = 0;  /* every attempt starts from "nothing detected" */
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* The line did not fit into 'bufsize' bytes: give up on this pass */
+        fclose(fd);
+        free(buffer);
+        return 0;  /* 0 lets init_simd() retry with a larger buffer */
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;  /* also returned when /proc/cpuinfo could not be opened */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported; the result is cached in
+ * simd_support, whose initial value ~0 marks "not probed yet".
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  char *env = NULL;
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;                             /* already probed */
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use NEON regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android.  Retry with a doubled buffer. */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+  /* Force different settings through environment variables set to "1" */
+  env = getenv("JSIMD_FORCENEON");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_NEON;          /* replaces the detected set */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;                   /* checked last, so it wins */
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {  /* choose the routine by input layout */
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGBX:  /* the X and A variants share one routine */
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:  /* every other color space uses the extrgb routine */
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {  /* choose the routine by output layout */
+  case JCS_EXT_RGB:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  case JCS_EXT_RGBX:  /* the X and A variants share one routine */
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:  /* every other color space uses the extrgb routine */
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  /* The SIMD encoder is only offered for DCTSIZE 8 and 2-byte JCOEF */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* NEON must be reported and SIMD Huffman encoding must not be disabled */
+  if (!(simd_support & JSIMD_NEON) || !simd_huffman)
+    return 0;
+
+  return 1;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm_neon.S b/simd/arm/jsimd_neon.S
similarity index 93%
rename from simd/jsimd_arm_neon.S
rename to simd/arm/jsimd_neon.S
index cd26127..56cde1f 100644
--- a/simd/jsimd_arm_neon.S
+++ b/simd/arm/jsimd_neon.S
@@ -2,12 +2,12 @@
  * ARMv7 NEON optimizations for libjpeg-turbo
  *
  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
  * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
  * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -46,6 +46,7 @@
 /* Supplementary macro for setting function attributes */
 .macro asm_function fname
 #ifdef __APPLE__
+    .private_extern _\fname
     .globl _\fname
 _\fname:
 #else
@@ -75,8 +76,8 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- *                        JSAMPARRAY output_buf, JDIMENSION output_col)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
 #define FIX_0_298631336 (2446)
@@ -105,71 +106,70 @@
  * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
  * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
  */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
-{                                                                             \
-    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
-    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
-                                                                              \
-    /* 1-D iDCT input data */                                                 \
-    row0 = xrow0;                                                             \
-    row1 = xrow1;                                                             \
-    row2 = xrow2;                                                             \
-    row3 = xrow3;                                                             \
-    row4 = xrow4;                                                             \
-    row5 = xrow5;                                                             \
-    row6 = xrow6;                                                             \
-    row7 = xrow7;                                                             \
-                                                                              \
-    q5 = row7 + row3;                                                         \
-    q4 = row5 + row1;                                                         \
-    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
-         MULTIPLY(q4, FIX_1_175875602);                                       \
-    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
-         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
-    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
-         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
-    q4 = q6;                                                                  \
-    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
-    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
-          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
-    /* now we can use q1 (reloadable constants have been used up) */          \
-    q1 = q3 + q2;                                                             \
-    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
-          MULTIPLY(row1, -FIX_0_899976223);                                   \
-    q5 = q7;                                                                  \
-    q1 = q1 + q6;                                                             \
-    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
-          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
-                                                                              \
-    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
-    tmp11_plus_tmp2 = q1;                                                     \
-    row1 = 0;                                                                 \
-                                                                              \
-    q1 = q1 - q6;                                                             \
-    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
-          MULTIPLY(row3, -FIX_2_562915447);                                   \
-    q1 = q1 - q6;                                                             \
-    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
-         MULTIPLY(row6, FIX_0_541196100);                                     \
-    q3 = q3 - q2;                                                             \
-                                                                              \
-    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
-    tmp11_minus_tmp2 = q1;                                                    \
-                                                                              \
-    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
-    q2 = q1 + q6;                                                             \
-    q1 = q1 - q6;                                                             \
-                                                                              \
-    /* pick up the results */                                                 \
-    tmp0  = q4;                                                               \
-    tmp1  = q5;                                                               \
-    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
-    tmp3  = q7;                                                               \
-    tmp10 = q2;                                                               \
-    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
-    tmp12 = q3;                                                               \
-    tmp13 = q1;                                                               \
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+    JLONG   q1, q2, q3, q4, q5, q6, q7; \
+    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+    \
+    /* 1-D iDCT input data */ \
+    row0 = xrow0; \
+    row1 = xrow1; \
+    row2 = xrow2; \
+    row3 = xrow3; \
+    row4 = xrow4; \
+    row5 = xrow5; \
+    row6 = xrow6; \
+    row7 = xrow7; \
+    \
+    q5 = row7 + row3; \
+    q4 = row5 + row1; \
+    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+         MULTIPLY(q4, FIX_1_175875602); \
+    q7 = MULTIPLY(q5, FIX_1_175875602) + \
+         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+    q2 = MULTIPLY(row2, FIX_0_541196100) + \
+         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+    q4 = q6; \
+    q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+    /* now we can use q1 (reloadable constants have been used up) */ \
+    q1 = q3 + q2; \
+    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+          MULTIPLY(row1, -FIX_0_899976223); \
+    q5 = q7; \
+    q1 = q1 + q6; \
+    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+    \
+    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+    tmp11_plus_tmp2 = q1; \
+    row1 = 0; \
+    \
+    q1 = q1 - q6; \
+    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+          MULTIPLY(row3, -FIX_2_562915447); \
+    q1 = q1 - q6; \
+    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+         MULTIPLY(row6, FIX_0_541196100); \
+    q3 = q3 - q2; \
+    \
+    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+    tmp11_minus_tmp2 = q1; \
+    \
+    q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+    q2 = q1 + q6; \
+    q1 = q1 - q6; \
+    \
+    /* pick up the results */ \
+    tmp0  = q4; \
+    tmp1  = q5; \
+    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+    tmp3  = q7; \
+    tmp10 = q2; \
+    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+    tmp12 = q3; \
+    tmp13 = q1; \
 }
 
 #define XFIX_0_899976223                   d0[0]
@@ -940,18 +940,18 @@
 
 .balign 16
 jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065      /* d0[0] */
-  .short -FIX_0_765366865     /* d0[1] */
-  .short -FIX_0_211164243     /* d0[2] */
-  .short FIX_1_451774981      /* d0[3] */
-  .short -FIX_2_172734803     /* d1[0] */
-  .short FIX_1_061594337      /* d1[1] */
-  .short -FIX_0_509795579     /* d1[2] */
-  .short -FIX_0_601344887     /* d1[3] */
-  .short FIX_0_899976223      /* d2[0] */
-  .short FIX_2_562915447      /* d2[1] */
-  .short 1 << (CONST_BITS+1)  /* d2[2] */
-  .short 0                    /* d2[3] */
+  .short FIX_1_847759065        /* d0[0] */
+  .short -FIX_0_765366865       /* d0[1] */
+  .short -FIX_0_211164243       /* d0[2] */
+  .short FIX_1_451774981        /* d0[3] */
+  .short -FIX_2_172734803       /* d1[0] */
+  .short FIX_1_061594337        /* d1[1] */
+  .short -FIX_0_509795579       /* d1[2] */
+  .short -FIX_0_601344887       /* d1[3] */
+  .short FIX_0_899976223        /* d2[0] */
+  .short FIX_2_562915447        /* d2[1] */
+  .short 1 << (CONST_BITS + 1)  /* d2[2] */
+  .short 0                      /* d2[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
     vmull.s16       q14, \x4, d2[2]
@@ -2107,8 +2107,8 @@
 
 /*
  * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- *                      DCTELEM *workspace);
+ * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ *                     DCTELEM *workspace);
  *
  * Note: the code uses 2 stage pipelining in order to improve instructions
  *       scheduling and eliminate stalls (this provides ~15% better
@@ -2208,10 +2208,10 @@
 
 /*
  * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
- *                                 JDIMENSION downsampled_width,
- *                                 JSAMPARRAY input_data,
- *                                 JSAMPARRAY *output_data_ptr);
+ * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ *                                JDIMENSION downsampled_width,
+ *                                JSAMPARRAY input_data,
+ *                                JSAMPARRAY *output_data_ptr);
  *
  * Note: the use of unaligned writes is the main remaining bottleneck in
  *       this code, which can be potentially solved to get up to tens
@@ -2444,10 +2444,10 @@
 /*****************************************************************************/
 
 /*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- *                              JCOEFPTR block, int last_dc_val,
- *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ *                             JCOEFPTR block, int last_dc_val,
+ *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
  *
  */
 
@@ -2731,7 +2731,7 @@
     ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
     ldr             r4, [r0, #0xc]        /* r4  = put_bits */
     ldrh            r2, [r6, #-128]       /* r2  = nbits */
-    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG)1)<<nbits) - 1; */
     ldr             r0, [lr, r2, lsl #2]
     ldrb            r5, [r1, r2]
     put_bits        r11, r4, r0, r5
diff --git a/simd/arm64/jsimd.c b/simd/arm64/jsimd.c
new file mode 100644
index 0000000..b7f019d
--- /dev/null
+++ b/simd/arm64/jsimd.c
@@ -0,0 +1,767 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2015-2016, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit ARM architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#define JSIMD_FASTLD3 1
+#define JSIMD_FASTST3 2
+#define JSIMD_FASTTBL 4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if 'buffer' starts with 'field' and the text after the field
+   name contains 'value' as a separate, whitespace-delimited word; return 0
+   otherwise (including when 'value' is empty). */
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+  size_t field_len = strlen(field), value_len = strlen(value);
+  char *start, *p;
+
+  /* Nothing to find, or the line does not begin with 'field': no match. */
+  if (value_len == 0 || strncmp(buffer, field, field_len) != 0)
+    return 0;
+  for (start = buffer + field_len; isspace((unsigned char)*start); start++)
+    ;
+
+  /* Look for 'value' as a whole word.  The left boundary is tested against
+     the fixed 'start' (not a moving search cursor), so that a hit glued to
+     a preceding non-blank character is never accepted.  Arguments are cast
+     to unsigned char because isspace() is undefined for negative values. */
+  for (p = strstr(start, value); p != NULL; p = strstr(p + 1, value)) {
+    if (p > start && !isspace((unsigned char)p[-1]))
+      continue;
+    if (p[value_len] != 0 && !isspace((unsigned char)p[value_len]))
+      continue;
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  /* Result: 0 asks the caller to retry with a larger 'bufsize' (also used
+     when the buffer cannot be allocated); 1 means no retry is needed. */
+  int complete = 1;
+  FILE *cpuinfo;
+  char *line = (char *)malloc(bufsize);
+
+  if (line == NULL)
+    return 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  while (cpuinfo != NULL && fgets(line, bufsize, cpuinfo) != NULL) {
+    if (strchr(line, '\n') == NULL && !feof(cpuinfo)) {
+      /* The line did not fit: 'bufsize' is too small, so give up here. */
+      complete = 0;
+      break;
+    }
+    if (check_cpuinfo(line, "CPU part", "0xd03") ||
+        check_cpuinfo(line, "CPU part", "0xd07")) {
+      /* The Cortex-A53 has a slow tbl implementation; not using tbl gains
+         a few percent there, and a subtler but measurable bit on the A57. */
+      simd_features &= ~JSIMD_FASTTBL;
+    } else if (check_cpuinfo(line, "CPU part", "0x0a1")) {
+      /* Cavium ThunderX: SIMD Huffman encoding is slower than the C code,
+         and ld3/st3 are abysmally slow, so clear every feature flag. */
+      simd_huffman = simd_features = 0;
+    }
+  }
+  if (cpuinfo != NULL)
+    fclose(cpuinfo);
+  free(line);
+  return complete;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+
+/*
+ * ARMv8 architectures support NEON extensions by default.
+ * It is no longer optional as it was with ARMv7.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+  const char *setting;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int line_size;
+#endif
+
+  if (simd_support != ~0U)  /* anything but ~0U: already initialised */
+    return;
+
+  simd_support = JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* Scan /proc/cpuinfo, doubling the line buffer (1 KB at first) after each
+     failed attempt, until it succeeds or the size limit is exceeded. */
+  for (line_size = 1024; line_size <= SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT;
+       line_size *= 2) {
+    if (parse_proc_cpuinfo(line_size))
+      break;
+  }
+#endif
+
+  /* Environment overrides: "1" applies a switch, "0" clears a feature bit */
+  setting = getenv("JSIMD_FORCENEON");
+  if (setting != NULL && strcmp(setting, "1") == 0)
+    simd_support = JSIMD_NEON;
+  setting = getenv("JSIMD_FORCENONE");
+  if (setting != NULL && strcmp(setting, "1") == 0)
+    simd_support = 0;
+  setting = getenv("JSIMD_NOHUFFENC");
+  if (setting != NULL && strcmp(setting, "1") == 0)
+    simd_huffman = 0;
+  setting = getenv("JSIMD_FASTLD3");
+  if (setting != NULL && strcmp(setting, "1") == 0)
+    simd_features |= JSIMD_FASTLD3;
+  else if (setting != NULL && strcmp(setting, "0") == 0)
+    simd_features &= ~JSIMD_FASTLD3;
+  setting = getenv("JSIMD_FASTST3");
+  if (setting != NULL && strcmp(setting, "1") == 0)
+    simd_features |= JSIMD_FASTST3;
+  else if (setting != NULL && strcmp(setting, "0") == 0)
+    simd_features &= ~JSIMD_FASTST3;
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples, a 4-byte JDIMENSION and RGB pixels
+     that are 3 or 4 bytes wide */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  return 0;  /* always 0: jsimd_rgb_gray_convert() is an empty stub here */
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  int sizes_ok;
+
+  init_simd();
+
+  /* Requires 8-bit samples, a 4-byte JDIMENSION and 3- or 4-byte pixels */
+  sizes_ok = BITS_IN_JSAMPLE == 8 && sizeof(JDIMENSION) == 4 &&
+             (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4);
+  if (!sizes_ok)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) != 0;
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples and a 4-byte JDIMENSION; unlike the
+     other colour checks, RGB_PIXELSIZE is not consulted here */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Run the NEON routine selected by cinfo->in_color_space, forwarding
+ * cinfo->image_width and the buffer/row arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  /* Non-zero when the JSIMD_FASTLD3 feature bit is set */
+  int fast_ld3 = (simd_features & JSIMD_FASTLD3) != 0;
+
+  /* Pick the routine matching the input pixel layout.  Only the 3-byte
+     layouts (RGB, BGR) have a separate "_slowld3" variant. */
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_BGR:
+    convert = fast_ld3 ? jsimd_extbgr_ycc_convert_neon :
+                         jsimd_extbgr_ycc_convert_neon_slowld3;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    /* Every other colour space is handled exactly like JCS_EXT_RGB */
+    convert = fast_ld3 ? jsimd_extrgb_ycc_convert_neon :
+                         jsimd_extrgb_ycc_convert_neon_slowld3;
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{  /* empty stub; jsimd_can_rgb_gray() always returns 0 */
+}
+
+/*
+ * Run the NEON routine selected by cinfo->out_color_space, forwarding
+ * cinfo->output_width and the buffer/row arguments unchanged.
+ */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  /* Non-zero when the JSIMD_FASTST3 feature bit is set */
+  int fast_st3 = (simd_features & JSIMD_FASTST3) != 0;
+
+  /* Pick the routine matching the output pixel layout.  Only the 3-byte
+     layouts (RGB, BGR) have a separate "_slowst3" variant. */
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_BGR:
+    convert = fast_st3 ? jsimd_ycc_extbgr_convert_neon :
+                         jsimd_ycc_extbgr_convert_neon_slowst3;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  case JCS_EXT_RGB:
+  default:
+    /* Every other colour space is handled exactly like JCS_EXT_RGB */
+    convert = fast_st3 ? jsimd_ycc_extrgb_convert_neon :
+                         jsimd_ycc_extrgb_convert_neon_slowst3;
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{  /* thin wrapper: only cinfo->output_width is read from cinfo */
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* Supported only for 8-bit samples, a DCTSIZE of 8 and a 4-byte
+     JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  int sizes_ok;
+
+  init_simd();
+
+  /* Requires 8-bit samples, a DCTSIZE of 8 and a 4-byte JDIMENSION */
+  sizes_ok = BITS_IN_JSAMPLE == 8 && DCTSIZE == 8 &&
+             sizeof(JDIMENSION) == 4;
+  if (!sizes_ok)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) != 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* unpacks the cinfo/compptr fields passed to the NEON routine */
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* unpacks the cinfo/compptr fields passed to the NEON routine */
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v2_upsample() has an empty body */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v1_upsample() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty stub; jsimd_can_h2v2_upsample() always returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty stub; jsimd_can_h2v1_upsample() always returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v2_fancy_upsample() has an empty body */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v1_fancy_upsample() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty stub; jsimd_can_h2v2_fancy_upsample() always returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty stub; jsimd_can_h2v1_fancy_upsample() always returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v2_merged_upsample() has an empty body */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  return 0;  /* always 0: jsimd_h2v1_merged_upsample() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{  /* empty stub; jsimd_can_h2v2_merged_upsample() always returns 0 */
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{  /* empty stub; jsimd_can_h2v1_merged_upsample() always returns 0 */
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8, 8-bit samples, a 4-byte JDIMENSION
+     and a 2-byte DCTELEM */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support (init_simd()
+     clears it when JSIMD_FORCENONE is "1") */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;  /* always 0: jsimd_convsamp_float() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{  /* thin wrapper: arguments are forwarded unchanged */
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{  /* empty stub; jsimd_can_convsamp_float() always returns 0 */
+}
+
+/*
+ * Return 1 when the build-time sizes below fit and NEON is enabled, else 0.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8 and a 2-byte DCTELEM; no sample or
+     JDIMENSION size is checked here */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* With a DCTSIZE of 8 and a 2-byte DCTELEM the answer is simply whether
+     the NEON bit is set; any other build configuration is unsupported */
+  if (DCTSIZE == 8 && sizeof(DCTELEM) == 2)
+    return (simd_support & JSIMD_NEON) ? 1 : 0;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;  /* always 0: jsimd_fdct_float() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{  /* thin wrapper: 'data' is forwarded unchanged */
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{  /* thin wrapper: 'data' is forwarded unchanged */
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{  /* empty stub; jsimd_can_fdct_float() always returns 0 */
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8 with 2-byte JCOEF and DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;  /* always 0: jsimd_quantize_float() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{  /* thin wrapper: arguments are forwarded unchanged */
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{  /* empty stub; jsimd_can_quantize_float() always returns 0 */
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8, a 2-byte JCOEF, 8-bit samples,
+     a 4-byte JDIMENSION and a 2-byte ISLOW_MULT_TYPE; any other build
+     configuration is rejected */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  int sizes_ok;
+
+  init_simd();
+
+  /* Requires a DCTSIZE of 8, 8-bit samples, a 2-byte JCOEF, a 4-byte
+     JDIMENSION and a 2-byte ISLOW_MULT_TYPE; all of these are fixed at
+     build time */
+  sizes_ok = DCTSIZE == 8 && BITS_IN_JSAMPLE == 8 &&
+             sizeof(JCOEF) == 2 && sizeof(JDIMENSION) == 4 &&
+             sizeof(ISLOW_MULT_TYPE) == 2;
+  if (!sizes_ok)
+    return 0;
+
+  return (simd_support & JSIMD_NEON) != 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* cinfo is unused; only compptr->dct_table is read from compptr */
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* cinfo is unused; only compptr->dct_table is read from compptr */
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * Return 1 when the build-time sizes checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8, 8-bit samples, 2-byte JCOEF and
+     ISLOW_MULT_TYPE, and a 4-byte JDIMENSION; any other build
+     configuration is rejected */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+/*
+ * Return 1 when the build-time values checked below fit and NEON is enabled;
+ * return 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8, a 2-byte JCOEF, 8-bit samples,
+     a 4-byte JDIMENSION, a 2-byte IFAST_MULT_TYPE and IFAST_SCALE_BITS
+     equal to 2 */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  /* Usable only while the NEON bit is set in simd_support */
+  return (simd_support & JSIMD_NEON) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;  /* always 0: jsimd_idct_float() has an empty body */
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* cinfo is unused; only compptr->dct_table is read from compptr */
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* cinfo is unused; only compptr->dct_table is read from compptr */
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* empty stub; jsimd_can_idct_float() always returns 0 */
+}
+
+/*
+ * Return 1 when the sizes fit, NEON is enabled and simd_huffman is set.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  /* Supported only for a DCTSIZE of 8 and a 2-byte JCOEF */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* Both run-time switches must be on: the NEON bit and simd_huffman */
+  return ((simd_support & JSIMD_NEON) && simd_huffman) ? 1 : 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  /* JSIMD_FASTTBL cleared: hand the block to the "_slowtbl" NEON variant */
+  if (!(simd_features & JSIMD_FASTTBL))
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/jsimd_arm64_neon.S b/simd/arm64/jsimd_neon.S
similarity index 94%
rename from simd/jsimd_arm64_neon.S
rename to simd/arm64/jsimd_neon.S
index 3309858..bd9f113 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/arm64/jsimd_neon.S
@@ -2,12 +2,12 @@
  * ARMv8 NEON optimizations for libjpeg-turbo
  *
  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
  * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
  * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
@@ -42,6 +42,7 @@
 /* Supplementary macro for setting function attributes */
 .macro asm_function fname
 #ifdef __APPLE__
+    .private_extern _\fname
     .globl _\fname
 _\fname:
 #else
@@ -131,8 +132,8 @@
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- *                        JSAMPARRAY output_buf, JDIMENSION output_col)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
 #define CONST_BITS 13
@@ -292,8 +293,8 @@
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -323,20 +324,20 @@
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -380,22 +381,22 @@
     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
     movi            v0.16b, #(CENTERJSAMPLE)
     /* Prepare pointers (dual-issue with NEON instructions) */
       ldp             TMP1, TMP2, [OUTPUT_BUF], 16
@@ -474,7 +475,7 @@
     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
@@ -496,10 +497,10 @@
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
@@ -525,14 +526,14 @@
     add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
     sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
 
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     mov             v6.16b, v15.16b
     mov             v7.16b, v15.16b
     mov             v8.16b, v15.16b
@@ -551,7 +552,7 @@
     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
@@ -574,10 +575,10 @@
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -609,14 +610,14 @@
     mov             v3.16b, v14.16b
     mov             v4.16b, v14.16b
     mov             v5.16b, v14.16b
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     b               1b
 
 .balign 16
@@ -631,8 +632,8 @@
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -662,20 +663,20 @@
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -719,22 +720,22 @@
     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     b               1b
 
     .unreq          DCT_TABLE
@@ -1041,18 +1042,18 @@
 
 .balign 16
 Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065      /* v0.h[0] */
-  .short -FIX_0_765366865     /* v0.h[1] */
-  .short -FIX_0_211164243     /* v0.h[2] */
-  .short FIX_1_451774981      /* v0.h[3] */
-  .short -FIX_2_172734803     /* d1[0] */
-  .short FIX_1_061594337      /* d1[1] */
-  .short -FIX_0_509795579     /* d1[2] */
-  .short -FIX_0_601344887     /* d1[3] */
-  .short FIX_0_899976223      /* v2.h[0] */
-  .short FIX_2_562915447      /* v2.h[1] */
-  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
-  .short 0                    /* v2.h[3] */
+  .short FIX_1_847759065        /* v0.h[0] */
+  .short -FIX_0_765366865       /* v0.h[1] */
+  .short -FIX_0_211164243       /* v0.h[2] */
+  .short FIX_1_451774981        /* v0.h[3] */
+  .short -FIX_2_172734803       /* v1.h[0] */
+  .short FIX_1_061594337        /* v1.h[1] */
+  .short -FIX_0_509795579       /* v1.h[2] */
+  .short -FIX_0_601344887       /* v1.h[3] */
+  .short FIX_0_899976223        /* v2.h[0] */
+  .short FIX_2_562915447        /* v2.h[1] */
+  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
+  .short 0                      /* v2.h[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
     smull           v28.4s, \x4, v2.h[2]
@@ -1543,7 +1544,7 @@
     .else
       .error unsupported macroblock size
     .endif
-  .elseif \bpp==16
+  .elseif \bpp == 16
     .if \size == 8
       st1           {v25.8h}, [RGB], 16
     .elseif \size == 4
@@ -2237,8 +2238,8 @@
 #define CONST_BITS 13
 #define PASS1_BITS 2
 
-#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
 
 #define F_0_298  2446  /* FIX(0.298631336) */
 #define F_0_390  3196  /* FIX(0.390180644) */
@@ -2353,8 +2354,8 @@
 
     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
-    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
 
     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2368,8 +2369,8 @@
 
     rshrn           v18.4h, v18.4s, #DESCALE_P1
     rshrn           v22.4h, v22.4s, #DESCALE_P1
-    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
 
     /* Odd part */
 
@@ -2395,10 +2396,10 @@
     smull2          v13.4s, v9.8h, XFIX_N_2_562
     smull2          v14.4s, v10.8h, XFIX_N_1_961
     smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
 
     add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
     add             v14.4s, v14.4s, v5.4s
@@ -2427,10 +2428,10 @@
     rshrn           v21.4h, v29.4s, #DESCALE_P1
     rshrn           v19.4h, v30.4s, #DESCALE_P1
     rshrn           v17.4h, v31.4s, #DESCALE_P1
-    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* Transpose */
     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
@@ -2456,8 +2457,8 @@
 
     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
-    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
 
     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2471,8 +2472,8 @@
 
     rshrn           v18.4h, v18.4s, #DESCALE_P2
     rshrn           v22.4h, v22.4s, #DESCALE_P2
-    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
 
     /* Odd part */
     add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
@@ -2498,10 +2499,10 @@
     smull2          v13.4s, v9.8h, XFIX_N_2_562
     smull2          v14.4s, v10.8h, XFIX_N_1_961
     smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
 
     add             v10.4s, v10.4s, v4.4s
     add             v14.4s, v14.4s, v5.4s
@@ -2530,10 +2531,10 @@
     rshrn           v21.4h, v29.4s, #DESCALE_P2
     rshrn           v19.4h, v30.4s, #DESCALE_P2
     rshrn           v17.4h, v31.4s, #DESCALE_P2
-    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
 
     /* store results */
     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
@@ -2676,8 +2677,8 @@
 
 /*
  * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- *                      DCTELEM *workspace);
+ * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ *                     DCTELEM *workspace);
  *
  */
 asm_function jsimd_quantize_neon
@@ -2768,10 +2769,10 @@
  * without smoothing.
  *
  * GLOBAL(void)
- * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- *                             JDIMENSION v_samp_factor,
- *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
- *                             JSAMPARRAY output_data);
+ * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ *                            JDIMENSION v_samp_factor,
+ *                            JDIMENSION width_in_blocks,
+ *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  */
 
 .balign 16
@@ -2879,9 +2880,10 @@
  * without smoothing.
  *
  * GLOBAL(void)
- * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+ * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+ *                            JDIMENSION v_samp_factor,
+ *                            JDIMENSION width_in_blocks,
+ *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
  */
 
 .balign 16
@@ -2960,10 +2962,10 @@
 /*****************************************************************************/
 
 /*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- *                              JCOEFPTR block, int last_dc_val,
- *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ *                             JCOEFPTR block, int last_dc_val,
+ *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
  *
  */
 
@@ -3278,7 +3280,7 @@
       put_bits        x10, x11
     addp            v16.16b, v16.16b, v18.16b
       checkbuf47
-    umov            x9,v16.D[0]
+    umov            x9, v16.D[0]
       put_bits        x13, x12
     cnt             v17.8b, v16.8b
       mvn             x9, x9
diff --git a/simd/gas-preprocessor.in b/simd/gas-preprocessor.in
new file mode 100755
index 0000000..560f788
--- /dev/null
+++ b/simd/gas-preprocessor.in
@@ -0,0 +1 @@
+gas-preprocessor.pl @CMAKE_ASM_COMPILER@ ${1+"$@"}
diff --git a/simd/i386/jccolext-avx2.asm b/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000..7a8d784
--- /dev/null
+++ b/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,580 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, BYTE [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, WORD [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [ebx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [edx], ymm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    add         ebx, byte SIZEOF_YMMWORD           ; outptr1
+    add         edx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jccolext-mmx.asm b/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000..9a2c30e
--- /dev/null
+++ b/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,478 @@
+;
+; jccolext-mmx.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                           int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; arg 1: JDIMENSION img_width (b = frame base set in prologue)
+%define input_buf(b)   (b) + 12         ; arg 2: JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; arg 3: JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; arg 4: JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; arg 5: int num_rows
+
+%define original_ebp   ebp + 0          ; [ebp]: pre-alignment stack pointer stored by the prologue
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]: scratch slots directly below aligned ebp
+%define WK_NUM         8                ; number of mmword slots in wk[]
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr, one pointer below wk(0)
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, BYTE [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, WORD [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, DWORD [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
+    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
+    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    movq        mm7, mm1
+    movq        mm4, mm6
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        mm1, mm1
+    pxor        mm6, mm6
+    punpcklwd   mm1, mm5                ; mm1=BOL
+    punpckhwd   mm6, mm5                ; mm6=BOH
+    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
+
+    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm1
+    paddd       mm4, mm6
+    paddd       mm7, mm5
+    paddd       mm4, mm5
+    psrld       mm7, SCALEBITS          ; mm7=CbOL
+    psrld       mm4, SCALEBITS          ; mm4=CbOH
+    packssdw    mm7, mm4                ; mm7=CbO
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    movq        mm5, mm0
+    movq        mm4, mm6
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        mm0, mm0
+    pxor        mm6, mm6
+    punpcklwd   mm0, mm1                ; mm0=BEL
+    punpckhwd   mm6, mm1                ; mm6=BEH
+    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm5, mm0
+    paddd       mm4, mm6
+    paddd       mm5, mm1
+    paddd       mm4, mm1
+    psrld       mm5, SCALEBITS          ; mm5=CbEL
+    psrld       mm4, SCALEBITS          ; mm4=CbEH
+    packssdw    mm5, mm4                ; mm5=CbE
+
+    psllw       mm7, BYTE_BIT
+    por         mm5, mm7                ; mm5=Cb
+    movq        MMWORD [ebx], mm5       ; Save Cb
+
+    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
+    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
+    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    movq        mm7, mm0
+    movq        mm5, mm4
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, MMWORD [wk(4)]
+    paddd       mm4, MMWORD [wk(5)]
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    pxor        mm3, mm3
+    pxor        mm4, mm4
+    punpcklwd   mm3, mm1                ; mm3=ROL
+    punpckhwd   mm4, mm1                ; mm4=ROH
+    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm3
+    paddd       mm5, mm4
+    paddd       mm7, mm1
+    paddd       mm5, mm1
+    psrld       mm7, SCALEBITS          ; mm7=CrOL
+    psrld       mm5, SCALEBITS          ; mm5=CrOH
+    packssdw    mm7, mm5                ; mm7=CrO
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    movq        mm1, mm6
+    movq        mm5, mm4
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(6)]
+    paddd       mm4, MMWORD [wk(7)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    pxor        mm2, mm2
+    pxor        mm4, mm4
+    punpcklwd   mm2, mm3                ; mm2=REL
+    punpckhwd   mm4, mm3                ; mm4=REH
+    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
+
+    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
+
+    paddd       mm1, mm2
+    paddd       mm5, mm4
+    paddd       mm1, mm0
+    paddd       mm5, mm0
+    psrld       mm1, SCALEBITS          ; mm1=CrEL
+    psrld       mm5, SCALEBITS          ; mm5=CrEH
+    packssdw    mm1, mm5                ; mm1=CrE
+
+    psllw       mm7, BYTE_BIT
+    por         mm1, mm7                ; mm1=Cr
+    movq        MMWORD [edx], mm1       ; Save Cr
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    add         ebx, byte SIZEOF_MMWORD                ; outptr1
+    add         edx, byte SIZEOF_MMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jccolext-sse2.asm b/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000..e830562
--- /dev/null
+++ b/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,505 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width (1st arg; b -> saved ebp, b+4 = return address)
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0          ; slot where the prologue stores the pre-alignment stack pointer
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]: wk(0) is lowest, wk(WK_NUM-1) sits just below ebp
+%define WK_NUM         8                ; number of xmmword scratch slots reserved below ebp
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr (pointer-sized slot directly below wk(0))
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):       ; prologue: build an aligned frame, then load the five stack arguments
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp (frame base: [eax] = saved ebp, args from eax+8)
+    sub         esp, byte 4                  ; reserve a dword so the frame base can be stored after aligning
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [original_ebp] = frame base, reloaded by 'pop esp' at .return
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve the WK_NUM xmmword scratch slots
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; ecx = img_width
+    test        ecx, ecx
+    jz          near .return            ; zero-width image: nothing to convert
+
+    push        ecx                     ; park img_width while ecx holds output_row
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]  ; ebx = output_buf[1]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]  ; edx = output_buf[2]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &output_buf[1][output_row]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &output_buf[2][output_row]
+
+    pop         ecx                     ; ecx = img_width again
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]  ; eax = num_rows (frame base is not needed after this)
+    test        eax, eax
+    jle         near .return            ; num_rows <= 0: nothing to convert
+    alignx      16, 7
+.rowloop:                               ; one pass per row: eax = rows left, ecx = img_width
+    pushpic     eax                     ; save the row counter; eax is reloaded with the GOT address below
+    push        edx                     ; save the four row-pointer cursors (restored after the column loop)
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; a full XMMWORD of columns remains: take the full-width load
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:                            ; partial load: fewer than SIZEOF_XMMWORD columns remain in this row
+    push        eax                     ; eax/edx serve as scratch while gathering the trailing bytes
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE (ecx = remaining bytes)
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, BYTE [esi+ecx]     ; eax = last byte of the row
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, WORD [esi+ecx]
+    shl         eax, WORD_BIT           ; make room below what is already gathered
+    or          eax, edx                ; ... and insert the preceding word there
+.column_ld4:
+    movd        xmmA, eax               ; xmmA = bytes gathered so far (low dword)
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD      ; shift the tail up and put the preceding dword below it
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD     ; shift the tail up and put the preceding qword below it
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA              ; the gathered tail becomes the second input vector
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD     ; count this pass as one full vector so the column loop ends after it
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD     ; (mov leaves the flags from 'test' intact for the jz below)
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA              ; the gathered tail becomes the third input vector
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:                            ; full-width load: three unaligned XMMWORDs of packed 3-byte pixels
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH  ; xmmH=0, used to zero-extend bytes to words below
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:                            ; partial load: fewer than SIZEOF_XMMWORD columns remain (ecx counts pixels)
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]  ; xmmA = last pixel of the row
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD     ; shift the tail up and put the preceding qword below it
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA              ; the gathered tail moves up to the next input vector
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD     ; count this pass as one full vector (mov keeps the flags from 'test')
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA              ; what was gathered so far becomes input vectors 3 and 4
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:                            ; full-width load: four unaligned XMMWORDs of packed 4-byte pixels
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF      ; xmmF=0, used to zero-extend bytes to words below
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH      ; xmmF is zero here, so each byte lands in the high half of a word
+    punpckhbw   xmmH, xmmH      ; each byte duplicated into both halves of a word
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1              ; xmm6=RO (copy for the high half)
+    punpcklwd   xmm1, xmm3              ; xmm1=(RO,GO) word pairs, low half
+    punpckhwd   xmm6, xmm3              ; xmm6=(RO,GO) word pairs, high half
+    movdqa      xmm7, xmm1              ; second copies feed the Cb products
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL (in the high word of each dword)
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH (in the high word of each dword)
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1              ; + BOL*FIX(0.500)
+    paddd       xmm4, xmm6              ; + BOH*FIX(0.500)
+    paddd       xmm7, xmm5              ; + PD_ONEHALFM1_CJ
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0              ; xmm6=RE (copy for the high half)
+    punpcklwd   xmm0, xmm2              ; xmm0=(RE,GE) word pairs, low half
+    punpckhwd   xmm6, xmm2              ; xmm6=(RE,GE) word pairs, high half
+    movdqa      xmm5, xmm0              ; second copies feed the Cb products
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL (in the high word of each dword)
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH (in the high word of each dword)
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0              ; + BEL*FIX(0.500)
+    paddd       xmm4, xmm6              ; + BEH*FIX(0.500)
+    paddd       xmm5, xmm1              ; + PD_ONEHALFM1_CJ
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT          ; move CbO into the high byte of each word
+    por         xmm5, xmm7              ; xmm5=Cb (even columns in low bytes, odd columns in high bytes)
+    movdqa      XMMWORD [ebx], xmm5     ; Save Cb (movdqa: outptr1 must be 16-byte aligned)
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0              ; xmm4=BO (copy for the high half)
+    punpcklwd   xmm0, xmm3              ; xmm0=(BO,GO) word pairs, low half
+    punpckhwd   xmm4, xmm3              ; xmm4=(BO,GO) word pairs, high half
+    movdqa      xmm7, xmm0              ; second copies feed the Cr products
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]  ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]   ; + ROL*FIX(0.299)+GOL*FIX(0.337)
+    paddd       xmm4, XMMWORD [wk(5)]   ; + ROH*FIX(0.299)+GOH*FIX(0.337)
+    paddd       xmm0, xmm3              ; + PD_ONEHALF (rounding)
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL (in the high word of each dword)
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH (in the high word of each dword)
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3              ; + ROL*FIX(0.500)
+    paddd       xmm5, xmm4              ; + ROH*FIX(0.500)
+    paddd       xmm7, xmm1              ; + PD_ONEHALFM1_CJ
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6              ; xmm4=BE (copy for the high half)
+    punpcklwd   xmm6, xmm2              ; xmm6=(BE,GE) word pairs, low half
+    punpckhwd   xmm4, xmm2              ; xmm4=(BE,GE) word pairs, high half
+    movdqa      xmm1, xmm6              ; second copies feed the Cr products
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]   ; + REL*FIX(0.299)+GEL*FIX(0.337)
+    paddd       xmm4, XMMWORD [wk(7)]   ; + REH*FIX(0.299)+GEH*FIX(0.337)
+    paddd       xmm6, xmm2              ; + PD_ONEHALF (rounding)
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT          ; move YO into the high byte of each word
+    por         xmm6, xmm0              ; xmm6=Y (even columns in low bytes, odd columns in high bytes)
+    movdqa      XMMWORD [edi], xmm6     ; Save Y (movdqa: outptr0 must be 16-byte aligned)
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL (in the high word of each dword)
+    punpckhwd   xmm4, xmm3              ; xmm4=REH (in the high word of each dword)
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2              ; + REL*FIX(0.500)
+    paddd       xmm5, xmm4              ; + REH*FIX(0.500)
+    paddd       xmm1, xmm0              ; + PD_ONEHALFM1_CJ
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT          ; move CrO into the high byte of each word
+    por         xmm1, xmm7              ; xmm1=Cr (even columns in low bytes, odd columns in high bytes)
+    movdqa      XMMWORD [edx], xmm1     ; Save Cr (movdqa: outptr2 must be 16-byte aligned)
+
+    sub         ecx, byte SIZEOF_XMMWORD                ; one XMMWORD of columns consumed
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    add         ebx, byte SIZEOF_XMMWORD                ; outptr1
+    add         edx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; another full vector of columns remains
+    test        ecx, ecx
+    jnz         near .column_ld1        ; a partial vector remains: gather it piecewise
+
+    pop         ecx                     ; col
+    pop         esi                     ; restore the row-pointer cursors pushed at .rowloop
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax                     ; restore the row counter saved by pushpic at .rowloop
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW  ; next row pointer in each of the three output arrays
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop              ; loop while rows remain
+
+.return:                                ; common exit; also reached directly when width or num_rows is zero
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jccolor-avx2.asm b/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000..ba19e20
--- /dev/null
+++ b/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,123 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16            ; fractional bits of the fixed-point format: FIX(x) = round(x * 2^SCALEBITS)
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700) (too large for a signed 16-bit word)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000) = 22086
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337  ; 8 x ( 0.299,  0.337) word pairs = 32 bytes
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250  ; 8 x ( 0.114,  0.250) word pairs
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331  ; 8 x (-0.168, -0.331) word pairs
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418  ; 8 x (-0.081, -0.418) word pairs
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)  ; 8 dwords: (one half - 1) + CENTERJSAMPLE, pre-scaled
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))  ; 8 dwords: one half in fixed point (rounding term)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32               ; the template below is first expanded with the RGB_* values already in effect
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2  ; rename the symbol for the EXT_RGB expansion
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2  ; rename the symbol for the EXT_RGBX expansion
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2  ; rename the symbol for the EXT_BGR expansion
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2  ; rename the symbol for the EXT_BGRX expansion
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2  ; rename the symbol for the EXT_XBGR expansion
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2  ; rename the symbol for the EXT_XRGB expansion
+%include "jccolext-avx2.asm"
diff --git a/simd/jccolor-mmx.asm b/simd/i386/jccolor-mmx.asm
similarity index 71%
rename from simd/jccolor-mmx.asm
rename to simd/i386/jccolor-mmx.asm
index c4e6d88..cc01897 100644
--- a/simd/jccolor-mmx.asm
+++ b/simd/i386/jccolor-mmx.asm
@@ -2,7 +2,7 @@
 ; jccolor.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,38 +20,39 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_mmx)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
 
 EXTN(jconst_rgb_ycc_convert_mmx):
 
-PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 2 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jccolext-mmx.asm"
 
diff --git a/simd/jccolor-sse2.asm b/simd/i386/jccolor-sse2.asm
similarity index 71%
rename from simd/jccolor-sse2.asm
rename to simd/i386/jccolor-sse2.asm
index 13124d1..737590e 100644
--- a/simd/jccolor-sse2.asm
+++ b/simd/i386/jccolor-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jccolor.asm - colorspace conversion (SSE2)
 ;
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,38 +19,39 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
 
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jccolext-sse2.asm"
 
diff --git a/simd/i386/jcgray-avx2.asm b/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000..cc4713f
--- /dev/null
+++ b/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,115 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32                      ; 32-byte alignment for the table below
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337         ; 8 x {FIX(0.299), FIX(0.337)} word pairs (R,G weights)
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250         ; 8 x {FIX(0.114), FIX(0.250)} word pairs (B,G weights)
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))   ; 8 x dword 0.5 in SCALEBITS fixed point (rounding)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED                ; rebind the channel macros to the EXT_RGB layout
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE    ; tested by the template's %if RGB_PIXELSIZE == 3
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2  ; entry-point name for this instance
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/simd/jcgray-mmx.asm b/simd/i386/jcgray-mmx.asm
similarity index 80%
rename from simd/jcgray-mmx.asm
rename to simd/i386/jcgray-mmx.asm
index 0819b6c..3e5a98a 100644
--- a/simd/jcgray-mmx.asm
+++ b/simd/i386/jcgray-mmx.asm
@@ -2,7 +2,7 @@
 ; jcgray.asm - grayscale colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,31 +20,31 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_mmx)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
 
 EXTN(jconst_rgb_gray_convert_mmx):
 
-PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
-PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF     times 2 dd (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jcgryext-mmx.asm"
 
diff --git a/simd/jcgray-sse2.asm b/simd/i386/jcgray-sse2.asm
similarity index 80%
rename from simd/jcgray-sse2.asm
rename to simd/i386/jcgray-sse2.asm
index 5b0b466..405f96b 100644
--- a/simd/jcgray-sse2.asm
+++ b/simd/i386/jcgray-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jcgray.asm - grayscale colorspace conversion (SSE2)
 ;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,31 +19,31 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
 
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jcgryext-sse2.asm"
 
diff --git a/simd/i386/jcgryext-avx2.asm b/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000..52e99a8
--- /dev/null
+++ b/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,459 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        ebp                          ; save caller's ebp
+    mov         eax, esp                     ; eax = unaligned frame base (args at eax+8..)
+    sub         esp, byte 4                  ; leave room to store that base
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax                   ; [original_ebp] = unaligned frame base
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below aligned ebp
+    pushpic     eax                     ; make room for the GOT address (gotptr slot)
+    push        ebx                     ; preserved: get_GOT below overwrites it
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax                     ; keep the row counter across the GOT reload below
+    push        edi                     ; output row-pointer cursor
+    push        esi                     ; input row-pointer cursor
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD  ; at least one full group of columns?
+    jae         near .columnloop          ; yes: bulk loads; no: fall into partial load
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax                     ; eax/edx serve as scratch for the small loads
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, BYTE [esi+ecx]     ; last byte of the partial group
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, WORD [esi+ecx]
+    shl         eax, WORD_BIT           ; make room below the byte loaded earlier
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax               ; move the bytes gathered in eax (if any) into xmmA
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD    ; shift earlier bytes up, then merge
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD   ; shift earlier bytes up, then merge
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMMWORD [esi+ecx]   ; 16-byte chunk; VEX load zeroes ymmB's upper lane
+    vperm2i128  ymmA, ymmA, ymmA, 1       ; swap lanes: bytes gathered so far go to the upper lane
+    vpor        ymmA, ymmA, ymmB          ; new chunk fills the lower lane
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_YMMWORD           ; columns still to convert
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop                   ; another full group remains
+    test        ecx, ecx
+    jnz         near .column_ld1                   ; leftover columns: partial load
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper                          ; clear the upper halves of the ymm registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- unaligned frame base stored at [original_ebp]
+    pop         ebp                     ; restore caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jcgryext-mmx.asm b/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000..4a9ab0d
--- /dev/null
+++ b/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,357 @@
+;
+; jcgryext-mmx.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; Convert some rows of samples to the output colorspace: every input pixel
+; (RGB_PIXELSIZE bytes) yields one luminance (Y) byte in output_buf[0].
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+; Returns without converting anything if img_width == 0 or num_rows <= 0.
+; Stack arguments; (b) is the value esp had right after the "push ebp" below:
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+; Locals, addressed from the re-aligned ebp established in the prologue:
+%define original_ebp   ebp + 0          ; slot written with the pre-alignment esp
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for one dword before aligning
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; saved so the epilogue can "pop esp"
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below ebp
+    pushpic     eax                     ; make room for the GOT address (gotptr)
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return            ; img_width == 0: nothing to convert
+
+    push        ecx                     ; keep num_cols while ecx is borrowed
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]             ; edi = &output_buf[0][output_row]
+
+    pop         ecx                     ; ecx = num_cols again
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]  ; eax = row counter (argument base is gone now)
+    test        eax, eax
+    jle         near .return            ; num_rows <= 0: nothing to convert
+    alignx      16, 7
+.rowloop:
+    pushpic     eax                     ; save the row counter
+    push        edi                     ; save the output row-pointer cursor
+    push        esi                     ; save the input row-pointer cursor
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop       ; a full group of SIZEOF_MMWORD pixels is available
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+; Fewer than SIZEOF_MMWORD pixels remain: gather the tail piecewise, last bytes first.
+.column_ld1:
+    push        eax                     ; eax and edx serve as scratch here
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE (ecx = byte count)
+    test        cl, SIZEOF_BYTE         ; each set bit of the byte count selects one piece
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, BYTE [esi+ecx]      ; eax = last remaining byte
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, WORD [esi+ecx]
+    shl         eax, WORD_BIT           ; previously gathered byte moves above the word
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax                ; NOTE(review): eax is zeroed only if the byte piece ran; stale bits presumably land past the valid pixels - confirm
+    pop         edx
+    pop         eax                     ; scratch registers restored
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, DWORD [esi+ecx]
+    psllq       mmA, DWORD_BIT          ; earlier pieces move above the dword
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA                ; gathered tail becomes the second mmword
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD      ; so the loop tail sees exactly one group left
+    jmp         short .rgb_gray_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD      ; mov does not alter the flags set by test
+    jz          short .rgb_gray_cnv
+    movq        mmF, mmA                ; gathered tail becomes the third mmword
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+; Full group: SIZEOF_MMWORD pixels occupy three mmwords of packed input.
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+    ; Notation "cp": byte c of pixel p. The shuffles below de-interleave the bytes.
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH                ; zero, used to widen bytes to words
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+; Fewer than SIZEOF_MMWORD pixels remain: gather the tail piecewise, last pixels first.
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8     ; each set bit of the pixel count selects one piece
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]  ; one dword from the tail
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA                ; earlier piece becomes the following mmword
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD      ; mov does not alter the flags set by test
+    jz          short .rgb_gray_cnv
+    movq        mmD, mmA                ; gathered tail becomes mmwords 2 and 3
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+; Full group: SIZEOF_MMWORD pixels occupy four mmwords of packed input.
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+    ; Notation "cp": byte c of pixel p. The shuffles below de-interleave the bytes.
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF                ; zero, used to widen bytes to words
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH                ; low bytes of mmH go to the high byte of each word
+    punpckhbw   mmH, mmH                ; high bytes of mmH, duplicated within each word
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+    ; mm0-mm7 are the registers named mmA-mmH above; the alias mapping is not defined in this file (presumably jcolsamp.inc - TODO confirm).
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; The two G terms come from separate pmaddwd passes: (R,G) word pairs with PW_F0299_F0337, (B,G) pairs with PW_F0114_F0250.
+    movq        mm6, mm1                ; copy of RO
+    punpcklwd   mm1, mm3                ; interleave RO and GO words (low half)
+    punpckhwd   mm6, mm3                ; interleave RO and GO words (high half)
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm6, mm0                ; copy of RE
+    punpcklwd   mm0, mm2                ; interleave RE and GE words (low half)
+    punpckhwd   mm6, mm2                ; interleave RE and GE words (high half)
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        mm0, mm5                ; mm0=BO
+    movq        mm6, mm4                ; mm6=BE
+
+    movq        mm4, mm0                ; copy of BO
+    punpcklwd   mm0, mm3                ; interleave BO and GO words (low half)
+    punpckhwd   mm4, mm3                ; interleave BO and GO words (high half)
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, mm1                ; add the (R,G) partial sum, low half
+    paddd       mm4, mm7                ; add the (R,G) partial sum, high half
+    paddd       mm0, mm3                ; add PD_ONEHALF before the SCALEBITS shift
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    movq        mm4, mm6                ; copy of BE
+    punpcklwd   mm6, mm2                ; interleave BE and GE words (low half)
+    punpckhwd   mm4, mm2                ; interleave BE and GE words (high half)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(0)]     ; add the (R,G) partial sum saved in wk(0)
+    paddd       mm4, MMWORD [wk(1)]     ; add the (R,G) partial sum saved in wk(1)
+    paddd       mm6, mm2                ; add PD_ONEHALF before the SCALEBITS shift
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT           ; odd-pixel Y moves to the high byte of each word
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y (a whole mmword, even for a partial group)
+
+    sub         ecx, byte SIZEOF_MMWORD                ; one group of pixels done
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop        ; another full group remains
+    test        ecx, ecx
+    jnz         near .column_ld1        ; leftover pixels: take the partial-load path
+
+    pop         ecx                     ; col
+    pop         esi                     ; input row-pointer cursor
+    pop         edi                     ; output row-pointer cursor
+    poppic      eax                     ; row counter
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+; The early exits above jump straight here; no MMX instruction has executed on those paths.
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jcgryext-sse2.asm b/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000..04d891c
--- /dev/null
+++ b/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,384 @@
+;
+; jcgryext-sse2.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; Convert some rows of samples to the output colorspace: every input pixel
+; (RGB_PIXELSIZE bytes) yields one luminance (Y) byte in output_buf[0].
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+; Returns without converting anything if img_width == 0 or num_rows <= 0.
+; Stack arguments; (b) is the value esp had right after the "push ebp" below:
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+; Locals, addressed from the re-aligned ebp established in the prologue:
+%define original_ebp   ebp + 0          ; slot written with the pre-alignment esp
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4                  ; room for one dword before aligning
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; saved so the epilogue can "pop esp"
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below ebp
+    pushpic     eax                     ; make room for the GOT address (gotptr)
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return            ; img_width == 0: nothing to convert
+
+    push        ecx                     ; keep num_cols while ecx is borrowed
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]             ; edi = &output_buf[0][output_row]
+
+    pop         ecx                     ; ecx = num_cols again
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]  ; eax = row counter (argument base is gone now)
+    test        eax, eax
+    jle         near .return            ; num_rows <= 0: nothing to convert
+    alignx      16, 7
+.rowloop:
+    pushpic     eax                     ; save the row counter
+    push        edi                     ; save the output row-pointer cursor
+    push        esi                     ; save the input row-pointer cursor
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; a full group of SIZEOF_XMMWORD pixels is available
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+; Fewer than SIZEOF_XMMWORD pixels remain: gather the tail piecewise, last bytes first.
+.column_ld1:
+    push        eax                     ; eax and edx serve as scratch here
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE (ecx = byte count)
+    test        cl, SIZEOF_BYTE         ; each set bit of the byte count selects one piece
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, BYTE [esi+ecx]     ; eax = last remaining byte
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, WORD [esi+ecx]
+    shl         eax, WORD_BIT           ; previously gathered byte moves above the word
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax               ; NOTE(review): eax is overwritten only if the byte piece ran; stale bits presumably land past the valid pixels - confirm
+    pop         edx
+    pop         eax                     ; scratch registers restored
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD      ; earlier pieces move above the dword
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD     ; earlier pieces move above the qword
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA              ; gathered tail becomes the second xmmword
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD     ; so the loop tail sees exactly one group left
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD     ; mov does not alter the flags set by test
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA              ; gathered tail becomes the third xmmword
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+; Full group: SIZEOF_XMMWORD pixels occupy three xmmwords of packed input.
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+    ; Notation "cp": byte c of pixel p (p in hex). The shuffles below de-interleave the bytes.
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH  ; zero, used to widen bytes to words
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+; Fewer than SIZEOF_XMMWORD pixels remain: gather the tail piecewise, last pixels first.
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16   ; each set bit of the pixel count selects one piece
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]  ; one dword from the tail
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD     ; earlier piece moves above the qword
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA              ; earlier pieces become the following xmmword
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD     ; mov does not alter the flags set by test
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA              ; gathered tail becomes xmmwords 2 and 3
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+; Full group: SIZEOF_XMMWORD pixels occupy four xmmwords of packed input.
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; Notation "cp": byte c of pixel p (p in hex). The shuffles below de-interleave the bytes.
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF      ; zero, used to widen bytes to words
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH      ; low bytes of xmmH go to the high byte of each word
+    punpckhbw   xmmH, xmmH      ; high bytes of xmmH, duplicated within each word
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+    ; xmm0-xmm7 are the registers named xmmA-xmmH above; the alias mapping is not defined in this file (presumably jcolsamp.inc - TODO confirm).
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; The two G terms come from separate pmaddwd passes: (R,G) word pairs with PW_F0299_F0337, (B,G) pairs with PW_F0114_F0250.
+    movdqa      xmm6, xmm1              ; copy of RO
+    punpcklwd   xmm1, xmm3              ; interleave RO and GO words (low half)
+    punpckhwd   xmm6, xmm3              ; interleave RO and GO words (high half)
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0              ; copy of RE
+    punpcklwd   xmm0, xmm2              ; interleave RE and GE words (low half)
+    punpckhwd   xmm6, xmm2              ; interleave RE and GE words (high half)
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0              ; copy of BO
+    punpcklwd   xmm0, xmm3              ; interleave BO and GO words (low half)
+    punpckhwd   xmm4, xmm3              ; interleave BO and GO words (high half)
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1              ; add the (R,G) partial sum, low half
+    paddd       xmm4, xmm7              ; add the (R,G) partial sum, high half
+    paddd       xmm0, xmm3              ; add PD_ONEHALF before the SCALEBITS shift
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6              ; copy of BE
+    punpcklwd   xmm6, xmm2              ; interleave BE and GE words (low half)
+    punpckhwd   xmm4, xmm2              ; interleave BE and GE words (high half)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]   ; add the (R,G) partial sum saved in wk(0)
+    paddd       xmm4, XMMWORD [wk(1)]   ; add the (R,G) partial sum saved in wk(1)
+    paddd       xmm6, xmm2              ; add PD_ONEHALF before the SCALEBITS shift
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT          ; odd-pixel Y moves to the high byte of each word
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y (whole xmmword even for a partial group; movdqa needs 16-byte-aligned outptr0)
+
+    sub         ecx, byte SIZEOF_XMMWORD                ; one group of pixels done
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; another full group remains
+    test        ecx, ecx
+    jnz         near .column_ld1        ; leftover pixels: take the partial-load path
+
+    pop         ecx                     ; col
+    pop         esi                     ; input row-pointer cursor
+    pop         edi                     ; output row-pointer cursor
+    poppic      eax                     ; row counter
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000..25995cb
--- /dev/null
+++ b/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,426 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0                      ; write the oldest 8 pending bits to [eax], advancing eax; clobbers ecx, edx
+    sub         put_bits, 8             ; put_bits -= 8;
+    mov         edx, put_buffer         ; work on a copy so put_buffer keeps the bits still pending
+    mov         ecx, put_bits           ; the variable shift below takes its count from cl
+    shr         edx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+    mov         byte [eax], dl          ; *buffer++ = c;
+    add         eax, 1
+    cmp         dl, 0xFF                ; need to stuff a zero byte?
+    jne         %%.EMIT_BYTE_END        ; no, this byte is done
+    mov         byte [eax], 0           ; *buffer++ = 0;
+    add         eax, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1                       ; append code %1 to the bit buffer; expects its size in ecx (cl)
+    add         put_bits, ecx           ; put_bits += size;
+    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
+    or          put_buffer, %1          ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF15 0                     ; flush two bytes once 16 or more bits are pending; clobbers eax, ecx, edx
+    cmp         put_bits, 16            ; if (put_bits > 15) {
+    jl          %%.CHECKBUF15_END
+    mov         eax, POINTER [esp+buffer]  ; eax = current output position
+    EMIT_BYTE
+    EMIT_BYTE
+    mov         POINTER [esp+buffer], eax  ; store the advanced output position back
+%%.CHECKBUF15_END:
+%endmacro
+
+%macro EMIT_BITS 1                      ; %1 = code, ecx = size; when a flush happens eax, ecx and edx are clobbered
+    PUT_BITS    %1
+    CHECKBUF15
+%endmacro
+
+%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3) gather block[jno*] from [esi] into t1/t2 at slot ko
+    pxor        xmm4, xmm4              ; __m128i neg = _mm_setzero_si128();  (xmm4 pairs with %34, slots ko+0..ko+7)
+    pxor        xmm5, xmm5              ; __m128i neg = _mm_setzero_si128();  (xmm5 pairs with %35, slots ko+8..ko+15)
+    pxor        xmm6, xmm6              ; __m128i neg = _mm_setzero_si128();  (xmm6 pairs with %36, slots ko+16..ko+23)
+    pxor        xmm7, xmm7              ; __m128i neg = _mm_setzero_si128();  (xmm7 pairs with %37, slots ko+24..ko+31)
+    pinsrw      %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw      %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw      %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw      %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw      %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw      %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw      %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw      %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw      %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw      %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw      %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw      %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw      %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw      %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw      %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw      %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw      %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw      %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw      %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw      %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw      %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw      %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw      %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw      %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw      %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw      %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw      %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw      %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw      %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw      %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw      %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw      %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw      %37, ecx, 7             ; xmm_shadow[31] = low word of ecx (no block load; jno31 is ignored when ko == 32)
+%endif
+    pcmpgtw     xmm4, %34               ; neg = _mm_cmpgt_epi16(neg, x1);  (0xFFFF in lanes where x1 < 0)
+    pcmpgtw     xmm5, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw     xmm6, %36               ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw     xmm7, %37               ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw       %34, xmm4               ; x1 = _mm_add_epi16(x1, neg);  (x1 - 1 in negative lanes)
+    paddw       %35, xmm5               ; x1 = _mm_add_epi16(x1, neg);
+    paddw       %36, xmm6               ; x1 = _mm_add_epi16(x1, neg);
+    paddw       %37, xmm7               ; x1 = _mm_add_epi16(x1, neg);
+    pxor        %34, xmm4               ; x1 = _mm_xor_si128(x1, neg);  (x1 is now abs(x1))
+    pxor        %35, xmm5               ; x1 = _mm_xor_si128(x1, neg);
+    pxor        %36, xmm6               ; x1 = _mm_xor_si128(x1, neg);
+    pxor        %37, xmm7               ; x1 = _mm_xor_si128(x1, neg);
+    pxor        xmm4, %34               ; neg = _mm_xor_si128(neg, x1);  (value if >= 0, else ~abs(value))
+    pxor        xmm5, %35               ; neg = _mm_xor_si128(neg, x1);
+    pxor        xmm6, %36               ; neg = _mm_xor_si128(neg, x1);
+    pxor        xmm7, %37               ; neg = _mm_xor_si128(neg, x1);
+    movdqa      XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34          ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa      XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35    ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa      XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36   ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa      XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37   ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa      XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4         ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa      XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5   ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa      XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa      XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; eax + 8 = working_state *state
+; eax + 12 = JOCTET *buffer
+; eax + 16 = JCOEFPTR block
+; eax + 20 = int last_dc_val
+; eax + 24 = c_derived_tbl *dctbl
+; eax + 28 = c_derived_tbl *actbl
+
+%define pad         6 * SIZEOF_DWORD    ; Align to 16 bytes (skips the 6 dwords pushed after the frame is reserved)
+%define t1          pad                               ; DCTSIZE2 words: abs(coef), in the order given to kloop_prepare
+%define t2          t1 + (DCTSIZE2 * SIZEOF_WORD)     ; DCTSIZE2 words: coef if >= 0, else ~abs(coef)
+%define block       t2 + (DCTSIZE2 * SIZEOF_WORD)     ; JCOEFPTR block (copy of the argument)
+%define actbl       block + SIZEOF_DWORD              ; c_derived_tbl *actbl (copy of the argument)
+%define buffer      actbl + SIZEOF_DWORD              ; JOCTET *buffer (current output position)
+%define temp        buffer + SIZEOF_DWORD             ; scratch: DC temp2, then the run length r while emitting ZRL codes
+%define temp2       temp + SIZEOF_DWORD               ; scratch: nbits
+%define temp3       temp2 + SIZEOF_DWORD              ; scratch: nonzero-coefficient bit mask (index)
+%define temp4       temp3 + SIZEOF_DWORD              ; not referenced by the code below
+%define temp5       temp4 + SIZEOF_DWORD              ; only referenced when sizing the frame
+%define gotptr      temp5 + SIZEOF_DWORD  ; void *gotptr
+%define put_buffer  ebx                               ; pending output bits
+%define put_bits    edi                               ; number of pending bits in put_buffer
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+    push        ebp
+    mov         eax, esp                       ; eax = original ebp (arguments are at eax+8 and up)
+    sub         esp, byte 4                    ; room for the saved original ebp
+    and         esp, byte (-SIZEOF_XMMWORD)    ; align to 128 bits
+    mov         [esp], eax                     ; stash it where the epilogue's "pop esp" finds it
+    mov         ebp, esp                       ; ebp = aligned ebp
+    sub         esp, temp5+9*SIZEOF_DWORD-pad  ; reserve the local frame (t1, t2, scalars, gotptr)
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp                        ; ebp is reused below (GOT address, dctbl, actbl)
+
+    mov         esi, POINTER [eax+8]       ; (working_state *state)
+    mov         put_buffer, DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
+    mov         put_bits, DWORD [esi+12]   ; put_bits = state->cur.put_bits;
+    push        esi                        ; esi is now scratch
+
+    get_GOT     edx                        ; get GOT address
+    movpic      POINTER [esp+gotptr], edx  ; save GOT address
+
+    mov         ecx, POINTER [eax+28]      ; actbl
+    mov         edx, POINTER [eax+16]      ; block
+    mov         esi, POINTER [eax+12]      ; buffer
+    mov         POINTER [esp+actbl], ecx   ; keep local copies: eax is overwritten later
+    mov         POINTER [esp+block], edx
+    mov         POINTER [esp+buffer], esi
+
+    ; Encode the DC coefficient difference per section F.1.2.1
+    mov         esi, POINTER [esp+block]  ; block
+    movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
+    sub         ecx, DWORD [eax+20]
+    mov         esi, ecx                  ; esi = temp2
+
+    ; This is a well-known technique for obtaining the absolute value
+    ; without a branch.  It is derived from an assembly language technique
+    ; presented in "How to Optimize for the Pentium Processors",
+    ; Copyright (c) 1996, 1997 by Agner Fog.
+    mov         edx, ecx
+    sar         edx, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    xor         ecx, edx                ; temp ^= temp3;
+    sub         ecx, edx                ; temp -= temp3;
+
+    ; For a negative input, want temp2 = bitwise complement of abs(input)
+    ; This code assumes we are on a two's complement machine
+    add         esi, edx                ; temp2 += temp3;
+    mov         DWORD [esp+temp], esi   ; backup temp2 in temp
+
+    ; Find the number of bits needed for the magnitude of the coefficient
+    movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
+    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
+    mov         DWORD [esp+temp2], edx                           ; backup nbits in temp2
+
+    ; Emit the Huffman-coded symbol for the number of bits
+    mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
+    mov         eax,  INT [ebp + edx * 4]     ; code = dctbl->ehufco[nbits];
+    movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
+    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
+
+    mov         ecx, DWORD [esp+temp2]        ; restore nbits
+
+    ; Mask off any extra bits in code
+    mov         eax, 1
+    shl         eax, cl
+    dec         eax
+    and         eax, DWORD [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+
+    ; Emit that number of bits of the value, if positive,
+    ; or the complement of its magnitude, if negative.
+    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
+
+    ; Prepare data
+    xor         ecx, ecx                ; ecx = 0: the second kloop_prepare inserts it as slot 63 of t1/t2
+    mov         esi, POINTER [esp+block]
+    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                   xmm0, xmm1, xmm2, xmm3
+    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                   xmm0, xmm1, xmm2, xmm3
+
+    pxor        xmm7, xmm7              ; zero = _mm_setzero_si128(); (still used after .ELOOP)
+    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
+    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
+    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
+    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
+    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+    pmovmskb    ecx, xmm2               ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+    shl         ecx, 16
+    or          edx, ecx
+    not         edx                     ; index = ~index;  (bit i set <=> t1[i] != 0)
+
+    lea         esi, [esp+t1]             ; esi = &t1[0] (the cursor k)
+    mov         ebp, POINTER [esp+actbl]  ; ebp = actbl
+
+.BLOOP:
+    bsf         ecx, edx                ; r = __builtin_ctzl(index);
+    jz          near .ELOOP             ; index == 0: no nonzero coefficient left in this half
+    lea         esi, [esi+ecx*2]        ; k += r;
+    shr         edx, cl                 ; index >>= r;
+    mov         DWORD [esp+temp3], edx  ; spill index: EMIT_BITS may clobber edx
+.BRLOOP:
+    cmp         ecx, 16                       ; while (r > 15) {
+    jl          near .ERLOOP
+    sub         ecx, 16                       ; r -= 16;
+    mov         DWORD [esp+temp], ecx         ; spill r: ecx must carry the code size
+    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
+    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
+    mov         ecx, DWORD [esp+temp]         ; restore r
+    jmp         .BRLOOP
+.ERLOOP:
+    movsx       eax, word [esi]                                  ; temp = t1[k];
+    movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
+    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
+    mov         DWORD [esp+temp2], eax        ; backup nbits in temp2
+    ; Emit Huffman symbol for run length / number of bits
+    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
+    add         ecx, eax
+    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
+    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
+
+    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
+    ; Mask off any extra bits in code
+    mov         ecx, DWORD [esp+temp2]  ; restore nbits
+    mov         eax, 1
+    shl         eax, cl
+    dec         eax
+    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
+    mov         edx, DWORD [esp+temp3]  ; restore index
+    add         esi, 2                  ; ++k;
+    shr         edx, 1                  ; index >>= 1;
+
+    jmp         .BLOOP
+.ELOOP:
+    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
+    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
+    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
+    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
+    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+    pmovmskb    ecx, xmm2               ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+    shl         ecx, 16
+    or          edx, ecx
+    not         edx                     ; index = ~index;  (bit i set <=> t1[32 + i] != 0)
+
+    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]  ; eax = &t1[32]
+    sub         eax, esi                ; bytes between the cursor k and t1[32]
+    shr         eax, 1                  ; ... converted to a count of zero coefficients
+    bsf         ecx, edx                ; r = __builtin_ctzl(index);
+    jz          near .ELOOP2
+    shr         edx, cl                 ; index >>= r;
+    add         ecx, eax                ; r += zeros left over from the first half
+    lea         esi, [esi+ecx*2]        ; k += r;
+    mov         DWORD [esp+temp3], edx  ; spill index: EMIT_BITS may clobber edx
+    jmp         .BRLOOP2
+.BLOOP2:
+    bsf         ecx, edx                ; r = __builtin_ctzl(index);
+    jz          near .ELOOP2
+    lea         esi, [esi+ecx*2]        ; k += r;
+    shr         edx, cl                 ; index >>= r;
+    mov         DWORD [esp+temp3], edx  ; spill index: EMIT_BITS may clobber edx
+.BRLOOP2:
+    cmp         ecx, 16                       ; while (r > 15) {
+    jl          near .ERLOOP2
+    sub         ecx, 16                       ; r -= 16;
+    mov         DWORD [esp+temp], ecx         ; spill r: ecx must carry the code size
+    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
+    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
+    mov         ecx, DWORD [esp+temp]         ; restore r
+    jmp         .BRLOOP2
+.ERLOOP2:
+    movsx       eax, word [esi]         ; temp = t1[k];
+    bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
+    inc         eax
+    mov         DWORD [esp+temp2], eax  ; backup nbits in temp2
+    ; Emit Huffman symbol for run length / number of bits
+    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
+    add         ecx, eax
+    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
+    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
+    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
+
+    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
+    ; Mask off any extra bits in code
+    mov         ecx, DWORD [esp+temp2]  ; restore nbits
+    mov         eax, 1
+    shl         eax, cl
+    dec         eax
+    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
+    mov         edx, DWORD [esp+temp3]  ; restore index
+    add         esi, 2                  ; ++k;
+    shr         edx, 1                  ; index >>= 1;
+
+    jmp         .BLOOP2
+.ELOOP2:
+    ; If the last coef(s) were zero, emit an end-of-block code
+    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+    cmp         edx, esi                            ; if (r > 0) {
+    je          .EFN
+    mov         eax,  INT [ebp]                     ; code = actbl->ehufco[0];
+    movzx       ecx, byte [ebp + 1024]              ; size = actbl->ehufsi[0];
+    EMIT_BITS   eax                                 ; EMIT_BITS(code, size)
+.EFN:
+    mov         eax, [esp+buffer]       ; eax = updated output position (read before esp moves)
+    pop         esi                     ; esi = state (pushed in the prologue)
+    ; Save put_buffer & put_bits
+    mov         DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
+    mov         DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jcsample-avx2.asm b/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000..5bcdefd
--- /dev/null
+++ b/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,390 @@
+;
+; jcsample-avx2.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; nothing to do when output_cols == 0
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of columns to pad on the right
+    jle         short .expand_end       ; input rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of input rows to pad
+    test        eax, eax
+    jle         short .expand_end
+
+    cld                                 ; rep stosb below must walk forward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; al is about to be overwritten with the fill sample
+    push        ecx                     ; rep stosb consumes ecx
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
+    add         edi, edx                ; edi = first column past image_width
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate it over the padding columns
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; output_cols is reloaded for every row
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop       ; at least one full YMMWORD of output columns
+    alignx      16, 7
+
+.columnloop_r24:
+    ; ecx can possibly be 8, 16, 24
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; VEX-encoded xmm load also zeroes the upper half of ymm1
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1        ; no second input vector: use zeros
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]  ; upper half of ymm0 is zeroed by the VEX-encoded load
+    vpxor       ymm1, ymm1, ymm1        ; no second input vector: use zeros
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = odd-column samples, widened to words
+    vpand       ymm0, ymm0, ymm6        ; ymm0 = even-column samples, widened to words
+    vpsrlw      ymm3, ymm1, BYTE_BIT    ; same split for the second input vector
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2        ; even + odd
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias {0, 1, 0, 1, ..}
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1           ; / 2
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1        ; narrow to bytes (packs within each 128-bit lane)
+    vpermq      ymm0, ymm0, 0xd8        ; swap the middle quadwords to restore column order
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; NOTE(review): full YMMWORD store even on the 8/16/24 tail; confirm output rows have room
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24    ; 8, 16 or 24 columns left: take the tail path
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper                          ; clear the upper YMM halves before returning
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; nothing to do when output_cols == 0
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of columns to pad on the right
+    jle         short .expand_end       ; input rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of input rows to pad
+    test        eax, eax
+    jle         short .expand_end
+
+    cld                                 ; rep stosb below must walk forward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; al is about to be overwritten with the fill sample
+    push        ecx                     ; rep stosb consumes ecx
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of this input row
+    add         edi, edx                ; edi = first column past image_width
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate it over the padding columns
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; output_cols is reloaded for every output row
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop       ; at least one full YMMWORD of output columns
+    alignx      16, 7
+
+.columnloop_r24:
+    cmp         ecx, 24                 ; tail path: ecx is 8, 16 or 24 here
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; VEX-encoded xmm loads also zero the upper ymm halves
+    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2        ; no second input vectors: use zeros
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper ymm halves are zeroed by the VEX-encoded loads
+    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2        ; no second input vectors: use zeros
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD     ; the subtract after the store then leaves ecx == 0 (last pass)
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6        ; even-column samples of row 0, widened to words
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-column samples of row 0, widened to words
+    vpand       ymm5, ymm1, ymm6        ; same split for row 1
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4        ; horizontal pair sums, row 0
+    vpaddw      ymm1, ymm1, ymm5        ; horizontal pair sums, row 1
+
+    vpand       ymm4, ymm2, ymm6        ; repeat for the second pair of input vectors
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1: sum of each 2x2 group
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias {1, 2, 1, 2, ..}
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2           ; / 4
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2        ; narrow to bytes (packs within each 128-bit lane)
+    vpermq      ymm0, ymm0, 0xd8        ; swap the middle quadwords to restore column order
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; NOTE(review): full YMMWORD store even on the 8/16/24 tail; confirm output rows have room
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24    ; 8, 16 or 24 columns left: take the tail path
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper                          ; clear the upper YMM halves before returning
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jcsample-mmx.asm b/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000..faf4234
--- /dev/null
+++ b/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,326 @@
+;
+; jcsample-mmx.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+; Output i = (in[2i] + in[2i+1] + bias) >> 1; bias alternates 0, 1 per column.
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; ebp = base for the argument macros
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi                     ; preserved; restored at .return
+    push        edi                     ; preserved; restored at .return
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge: pad input rows in place to output_cols * 2 samples
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of pad samples per row
+    jle         short .expand_end       ; rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
+    test        eax, eax
+    jle         short .expand_end       ; no rows to pad
+
+    cld                                 ; make stosb advance edi upward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; save row counter (al is reused)
+    push        ecx                     ; save pad count (rep consumes ecx)
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of current input row
+    add         edi, edx                ; edi = just past the last real sample
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate al into the pad samples
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
+    dec         eax
+    jg          short .expandloop       ; until every row is padded
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return            ; no rows to process
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6                ; mm6 = all ones
+    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; save output_cols
+    push        edi                     ; save output_data position
+    push        esi                     ; save input_data position
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]  ; first MMWORD of input
+    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]  ; second MMWORD of input
+    movq        mm2, mm0
+    movq        mm3, mm1
+
+    pand        mm0, mm6                ; even-numbered samples, as words
+    psrlw       mm2, BYTE_BIT           ; odd-numbered samples, as words
+    pand        mm1, mm6
+    psrlw       mm3, BYTE_BIT
+
+    paddw       mm0, mm2                ; even + odd of each pair
+    paddw       mm1, mm3
+    paddw       mm0, mm7                ; add the alternating bias
+    paddw       mm1, mm7
+    psrlw       mm0, 1                  ; halve: average of each pair
+    psrlw       mm1, 1
+
+    packuswb    mm0, mm1                ; narrow both word vectors to bytes
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0  ; store one MMWORD of output
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         short .columnloop       ; loop while output columns remain
+
+    pop         esi                     ; back to the row-pointer arrays
+    pop         edi
+    pop         ecx                     ; output_cols for the next row
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+; Output i = (sum of its 2x2 input samples + bias) >> 2; bias alternates 1, 2.
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; ebp = base for the argument macros
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi                     ; preserved; restored at .return
+    push        edi                     ; preserved; restored at .return
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge: pad input rows in place to output_cols * 2 samples
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of pad samples per row
+    jle         short .expand_end       ; rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
+    test        eax, eax
+    jle         short .expand_end       ; no rows to pad
+
+    cld                                 ; make stosb advance edi upward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; save row counter (al is reused)
+    push        ecx                     ; save pad count (rep consumes ecx)
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of current input row
+    add         edi, edx                ; edi = just past the last real sample
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate al into the pad samples
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
+    dec         eax
+    jg          short .expandloop       ; until every row is padded
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return            ; no rows to process
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6                ; mm6 = all ones
+    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; save output_cols
+    push        edi                     ; save output_data position
+    push        esi                     ; save input_data position
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]  ; row 0, first MMWORD
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]  ; row 1, first MMWORD
+    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]  ; row 0, second MMWORD
+    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]  ; row 1, second MMWORD
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pand        mm0, mm6                ; even-numbered samples, as words
+    psrlw       mm4, BYTE_BIT           ; odd-numbered samples, as words
+    pand        mm1, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm0, mm4                ; row 0 horizontal pair sums
+    paddw       mm1, mm5                ; row 1 horizontal pair sums
+
+    movq        mm4, mm2                ; same steps for the second MMWORDs
+    movq        mm5, mm3
+    pand        mm2, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm3, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm2, mm4
+    paddw       mm3, mm5
+
+    paddw       mm0, mm1                ; add the two rows: 2x2 sums
+    paddw       mm2, mm3
+    paddw       mm0, mm7                ; add the alternating bias
+    paddw       mm2, mm7
+    psrlw       mm0, 2                  ; divide by 4
+    psrlw       mm2, 2
+
+    packuswb    mm0, mm2                ; narrow both word vectors to bytes
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0  ; store one MMWORD of output
+
+    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         near .columnloop        ; loop while output columns remain
+
+    pop         esi                     ; back to the row-pointer arrays
+    pop         edi
+    pop         ecx                     ; output_cols for the next row
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jcsample-sse2.asm b/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000..b10fa83
--- /dev/null
+++ b/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,353 @@
+;
+; jcsample-sse2.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+; Output i = (in[2i] + in[2i+1] + bias) >> 1; bias alternates 0, 1 per column.
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; ebp = base for the argument macros
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi                     ; preserved; restored at .return
+    push        edi                     ; preserved; restored at .return
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge: pad input rows in place to output_cols * 2 samples
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of pad samples per row
+    jle         short .expand_end       ; rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
+    test        eax, eax
+    jle         short .expand_end       ; no rows to pad
+
+    cld                                 ; make stosb advance edi upward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; save row counter (al is reused)
+    push        ecx                     ; save pad count (rep consumes ecx)
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of current input row
+    add         edi, edx                ; edi = just past the last real sample
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate al into the pad samples
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
+    dec         eax
+    jg          short .expandloop       ; until every row is padded
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return            ; no rows to process
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; save output_cols
+    push        edi                     ; save output_data position
+    push        esi                     ; save input_data position
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD  ; a full XMMWORD of output columns?
+    jae         short .columnloop       ; yes: take the two-vector path
+    alignx      16, 7
+
+.columnloop_r8:                         ; fewer columns than one XMMWORD remain
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; load one input vector
+    pxor        xmm1, xmm1              ; second input vector is all zero
+    mov         ecx, SIZEOF_XMMWORD     ; so the sub below leaves ecx = 0
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; movdqa: aligned access
+    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6              ; even-numbered samples, as words
+    psrlw       xmm2, BYTE_BIT          ; odd-numbered samples, as words
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2              ; even + odd of each pair
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7              ; add the alternating bias
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1                 ; halve: average of each pair
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1              ; narrow both word vectors to bytes
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0  ; store a full XMMWORD
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop       ; a full vector of columns remains
+    test        ecx, ecx
+    jnz         short .columnloop_r8    ; a partial vector remains
+
+    pop         esi                     ; back to the row-pointer arrays
+    pop         edi
+    pop         ecx                     ; output_cols for the next row
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+; Output i = (sum of its 2x2 input samples + bias) >> 2; bias alternates 1, 2.
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; ebp = base for the argument macros
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi                     ; preserved; restored at .return
+    push        edi                     ; preserved; restored at .return
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
+
+    ; -- expand_right_edge: pad input rows in place to output_cols * 2 samples
+
+    push        ecx                     ; save output_cols
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx                ; ecx = number of pad samples per row
+    jle         short .expand_end       ; rows are already wide enough
+
+    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
+    test        eax, eax
+    jle         short .expand_end       ; no rows to pad
+
+    cld                                 ; make stosb advance edi upward
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax                     ; save row counter (al is reused)
+    push        ecx                     ; save pad count (rep consumes ecx)
+
+    mov         edi, JSAMPROW [esi]     ; edi = start of current input row
+    add         edi, edx                ; edi = just past the last real sample
+    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate al into the pad samples
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; next input row pointer
+    dec         eax
+    jg          short .expandloop       ; until every row is padded
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return            ; no rows to process
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx                     ; save output_cols
+    push        edi                     ; save output_data position
+    push        esi                     ; save input_data position
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD  ; a full XMMWORD of output columns?
+    jae         short .columnloop       ; yes: take the two-vector path
+    alignx      16, 7
+
+.columnloop_r8:                         ; fewer columns than one XMMWORD remain
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; row 0, one vector only
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; row 1, one vector only
+    pxor        xmm2, xmm2              ; second vectors are all zero
+    pxor        xmm3, xmm3
+    mov         ecx, SIZEOF_XMMWORD     ; so the sub below leaves ecx = 0
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; row 0, first vector
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; row 1, first vector
+    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; row 0, second vector
+    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; row 1, second vector
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6              ; even-numbered samples, as words
+    psrlw       xmm4, BYTE_BIT          ; odd-numbered samples, as words
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4              ; row 0 horizontal pair sums
+    paddw       xmm1, xmm5              ; row 1 horizontal pair sums
+
+    movdqa      xmm4, xmm2              ; same steps for the second vectors
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1              ; add the two rows: 2x2 sums
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7              ; add the alternating bias
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2                 ; divide by 4
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2              ; narrow both word vectors to bytes
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0  ; store a full XMMWORD
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; a full vector of columns remains
+    test        ecx, ecx
+    jnz         near .columnloop_r8     ; a partial vector remains
+
+    pop         esi                     ; back to the row-pointer arrays
+    pop         edi
+    pop         ecx                     ; output_cols for the next row
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdcolext-avx2.asm b/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000..46de9b9
--- /dev/null
+++ b/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,517 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [edx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2                     ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3                     ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6                     ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7                     ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [GOTOFF(eax,PW_MF0228)]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [GOTOFF(eax,PW_F0402)]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm4, ymm4, 1                        ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                        ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm0, ymm0, 1                        ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                        ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2                     ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3                     ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6                     ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7                     ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4                ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5                ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [esi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         BYTE [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdcolext-mmx.asm b/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000..cd2cb3f
--- /dev/null
+++ b/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,406 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                           JDIMENSION input_row, JSAMPARRAY output_buf,
+;                           int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0          ; frame base saved by the prologue
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM], just below ebp
+%define WK_NUM         2                ; wk(0)=(B-Y)E, wk(1)=(B-Y)O
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):        ; YCbCr -> RGB; SIZEOF_MMWORD pixels per column pass
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; slot for the saved frame base
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] <- eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; allocate wk[WK_NUM]
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return            ; zero width: nothing to convert
+
+    push        ecx                     ; park num_cols; ecx is reused below
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; esi=input_buf[0]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; ebx=input_buf[1]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; edx=input_buf[2]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi=&input_buf[0][input_row]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx=&input_buf[1][input_row]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx=&input_buf[2][input_row]
+
+    pop         ecx                     ; ecx=num_cols
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return            ; num_rows <= 0: nothing to convert
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; num_rows remaining
+    push        edi                     ; output_buf row pointer
+    push        edx                     ; input_buf[2] row pointer
+    push        ebx                     ; input_buf[1] row pointer
+    push        esi                     ; input_buf[0] row pointer
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
+    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)
+
+    pcmpeqw     mm4, mm4                ; mm4=all ones
+    pcmpeqw     mm7, mm7                ; mm7=all ones
+    psrlw       mm4, BYTE_BIT
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
+    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
+    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
+    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO
+
+    paddw       mm4, mm7                ; CbE -= 128 (0xFF80 as a signed word)
+    paddw       mm5, mm7                ; CbO -= 128
+    paddw       mm0, mm7                ; CrE -= 128
+    paddw       mm1, mm7                ; CrO -= 128
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm2, mm4                ; mm2=CbE
+    movq        mm3, mm5                ; mm3=CbO
+    paddw       mm4, mm4                ; mm4=2*CbE
+    paddw       mm5, mm5                ; mm5=2*CbO
+    movq        mm6, mm0                ; mm6=CrE
+    movq        mm7, mm1                ; mm7=CrO
+    paddw       mm0, mm0                ; mm0=2*CrE
+    paddw       mm1, mm1                ; mm1=2*CrO
+
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
+    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
+    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))
+
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]  ; rounding term for the >>1 below (presumably words of 1)
+    paddw       mm5, [GOTOFF(eax,PW_ONE)]
+    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
+    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]  ; same rounding term for the Cr products
+    paddw       mm1, [GOTOFF(eax,PW_ONE)]
+    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
+    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))
+
+    paddw       mm4, mm2
+    paddw       mm5, mm3
+    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
+    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
+
+    movq        mm4, mm2                ; mm4=CbE
+    movq        mm5, mm3                ; mm5=CbO
+    punpcklwd   mm2, mm6                ; mm2=(Cb0 Cr0 Cb2 Cr2)
+    punpckhwd   mm4, mm6                ; mm4=(Cb4 Cr4 Cb6 Cr6)
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm3, mm7                ; mm3=(Cb1 Cr1 Cb3 Cr3)
+    punpckhwd   mm5, mm7                ; mm5=(Cb5 Cr5 Cb7 Cr7)
+    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]  ; rounding term for the shift below (presumably 1 << (SCALEBITS-1))
+    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS          ; descale
+    psrad       mm4, SCALEBITS
+    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm3, SCALEBITS
+    psrad       mm5, SCALEBITS
+
+    packssdw    mm2, mm4                ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    mm3, mm5                ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       mm2, mm6                ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       mm3, mm7                ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)
+
+    pcmpeqw     mm4, mm4
+    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm4, mm5                ; mm4=Y(0246)=YE
+    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO
+
+    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4,  MMWORD [wk(0)]    ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+    paddw       mm5,  MMWORD [wk(1)]    ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16      ; partial group left: store the tail
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow          ; row complete
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:                           ; tail: ecx = pixels left in this row
+    lea         ecx, [ecx+ecx*2]        ; ecx = bytes left (imul ecx, RGB_PIXELSIZE)
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC                ; bytes still to store are now in mmA
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE                ; bytes still to store are now in mmA
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA                ; eax = low 4 bytes (GOT address is reloaded per row)
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT          ; bring the upper 4 bytes down
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT           ; bring the next 2 bytes down
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .nextrow          ; nothing left to store
+    mov         BYTE [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16      ; partial group left: store the tail
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow          ; row complete
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:                           ; tail: ecx = pixels left (not scaled to bytes)
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC                ; remaining pixels move down to mmA/mmD
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD                ; remaining pixel(s) move down to mmA
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .nextrow          ; nothing left to store
+    movd        DWORD [edi+0*SIZEOF_DWORD], mmA  ; final 4 bytes (low dword of mmA)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx                     ; num_cols
+    pop         esi                     ; input_buf[0] row pointer
+    pop         ebx                     ; input_buf[1] row pointer
+    pop         edx                     ; input_buf[2] row pointer
+    pop         edi                     ; output_buf row pointer
+    pop         eax                     ; num_rows remaining
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdcolext-sse2.asm b/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000..0fcb006
--- /dev/null
+++ b/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,460 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+; Convert num_rows rows of YCbCr samples, starting at row input_row of each
+; input_buf component array, to packed RGB pixels written to output_buf.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+; out_width is the number of columns (pixels) converted in every row.
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+; Locals, addressed relative to the aligned ebp set up in the prologue:
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        ebp                          ; save caller's ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4                  ; leave at least one dword below the saved ebp
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [original_ebp] = eax (reloaded by "pop esp" on exit)
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; esp = &wk[0]: reserves the wk[] work area
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return            ; nothing to do when out_width == 0
+
+    push        ecx                     ; save num_cols (ecx is reused for input_row)
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]            ; edi = input_buf
+    mov         ecx, JDIMENSION [input_row(eax)]            ; ecx = input_row
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]   ; esi = input_buf[0] (Y rows)
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]   ; ebx = input_buf[1] (Cb rows)
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]   ; edx = input_buf[2] (Cr rows)
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]              ; esi = &input_buf[0][input_row]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]              ; ebx = &input_buf[1][input_row]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]              ; edx = &input_buf[2][input_row]
+
+    pop         ecx                     ; ecx = num_cols
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; edi = output_buf
+    mov         eax, INT [num_rows(eax)]           ; eax = num_rows (argument base no longer needed)
+    test        eax, eax
+    jle         near .return            ; return when num_rows <= 0
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; save remaining num_rows
+    push        edi                     ; save output_buf position
+    push        edx                     ; save input_buf[2] position
+    push        ebx                     ; save input_buf[1] position
+    push        esi                     ; save input_buf[0] position
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4              ; xmm4=all ones
+    pcmpeqw     xmm7, xmm7              ; xmm7=all ones
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7              ; xmm4=CbE+0xFF80 (i.e. CbE-128 as signed words)
+    paddw       xmm5, xmm7              ; xmm5=CbO-128
+    paddw       xmm0, xmm7              ; xmm0=CrE-128
+    paddw       xmm1, xmm7              ; xmm1=CrO-128
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [GOTOFF(eax,PW_MF0228)]  ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [GOTOFF(eax,PW_F0402)]   ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]     ; +1: round before the shift right by 1
+    paddw       xmm5, [GOTOFF(eax,PW_ONE)]     ; +1: round before the shift right by 1
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]     ; +1: round before the shift right by 1
+    paddw       xmm1, [GOTOFF(eax,PW_ONE)]     ; +1: round before the shift right by 1
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2              ; + CbE
+    paddw       xmm5, xmm3              ; + CbO
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2              ; xmm4=CbE
+    movdqa      xmm5, xmm3              ; xmm5=CbO
+    punpcklwd   xmm2, xmm6              ; xmm2=(CbE,CrE) word pairs, low half
+    punpckhwd   xmm4, xmm6              ; xmm4=(CbE,CrE) word pairs, high half
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]  ; dwords: CbE*-FIX(0.344)+CrE*FIX(0.285)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF0344_F0285)]  ; (same, high half)
+    punpcklwd   xmm3, xmm7              ; xmm3=(CbO,CrO) word pairs, low half
+    punpckhwd   xmm5, xmm7              ; xmm5=(CbO,CrO) word pairs, high half
+    pmaddwd     xmm3, [GOTOFF(eax,PW_MF0344_F0285)]  ; dwords: CbO*-FIX(0.344)+CrO*FIX(0.285)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]  ; (same, high half)
+
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]  ; add 1 << (SCALEBITS - 1) to round
+    paddd       xmm4, [GOTOFF(eax,PD_ONEHALF)]  ; add 1 << (SCALEBITS - 1) to round
+    psrad       xmm2, SCALEBITS         ; descale
+    psrad       xmm4, SCALEBITS         ; descale
+    paddd       xmm3, [GOTOFF(eax,PD_ONEHALF)]  ; add 1 << (SCALEBITS - 1) to round
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]  ; add 1 << (SCALEBITS - 1) to round
+    psrad       xmm3, SCALEBITS         ; descale
+    psrad       xmm5, SCALEBITS         ; descale
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [esi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4              ; xmm4=all ones
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD  ; fewer than SIZEOF_XMMWORD columns left?
+    jb          short .column_st32        ; yes: store only the valid leading bytes
+
+    test        edi, SIZEOF_XMMWORD-1     ; is outptr XMMWORD-aligned?
+    jnz         short .out1               ; no: use unaligned stores
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD  ; SIZEOF_XMMWORD columns done
+    jz          near .nextrow             ; row complete
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE (columns -> output bytes)
+    cmp         ecx, byte 2*SIZEOF_XMMWORD  ; at least 2 XMMWORDs of bytes left?
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF                  ; the remaining bytes come from xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD    ; at least 1 XMMWORD of bytes left?
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD                  ; the remaining bytes come from xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD     ; drop the bytes just stored
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD      ; drop the bytes just stored
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA               ; eax = low dword of xmmA (GOT address is reloaded per row)
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16                 ; move the next byte into al
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         BYTE [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD  ; fewer than SIZEOF_XMMWORD columns left?
+    jb          short .column_st32        ; yes: store only the valid leading pixels
+
+    test        edi, SIZEOF_XMMWORD-1     ; is outptr XMMWORD-aligned?
+    jnz         short .out1               ; no: use unaligned stores
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD  ; SIZEOF_XMMWORD columns done
+    jz          near .nextrow             ; row complete
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2  ; at least SIZEOF_XMMWORD/2 columns left?
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC                  ; the remaining pixels come from xmmC
+    movdqa      xmmD, xmmH                  ; ... followed by xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4  ; at least SIZEOF_XMMWORD/4 columns left?
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD                  ; the remaining pixels come from xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4      ; drop the pixels just stored
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx                     ; ecx = num_cols
+    pop         esi                     ; restore the values pushed at .rowloop
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax                     ; eax = remaining num_rows
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf[0]
+    add         ebx, byte SIZEOF_JSAMPROW  ; input_buf[1]
+    add         edx, byte SIZEOF_JSAMPROW  ; input_buf[2]
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop              ; loop while rows remain
+
+    sfence                              ; flush the write buffer (movntdq stores)
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdcolor-avx2.asm b/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000..7a1a963
--- /dev/null
+++ b/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,120 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+; FIX(x) = x * 2^SCALEBITS, rounded to the nearest integer.
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+; Each constant below fills 32 bytes (16 words or 8 dwords).
+PW_F0402        times 16 dw  F_0_402              ; FIX(0.40200)
+PW_MF0228       times 16 dw -F_0_228              ; -FIX(0.22800)
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285     ; (-FIX(0.34414), FIX(0.28586)) pairs
+PW_ONE          times 16 dw  1                    ; word 1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)  ; 0.5 in SCALEBITS fixed point
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+; Default variant: uses whatever RGB_* definitions are already in effect.
+%include "jdcolext-avx2.asm"
+; EXT_RGB variant, assembled as jsimd_ycc_extrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_RGBX variant, assembled as jsimd_ycc_extrgbx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_BGR variant, assembled as jsimd_ycc_extbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_BGRX variant, assembled as jsimd_ycc_extbgrx_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_XBGR variant, assembled as jsimd_ycc_extxbgr_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_XRGB variant, assembled as jsimd_ycc_extxrgb_convert_avx2.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/simd/jdcolor-mmx.asm b/simd/i386/jdcolor-mmx.asm
similarity index 80%
rename from simd/jdcolor-mmx.asm
rename to simd/i386/jdcolor-mmx.asm
index 4e58031..7e9d645 100644
--- a/simd/jdcolor-mmx.asm
+++ b/simd/i386/jdcolor-mmx.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_mmx)
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
 
 EXTN(jconst_ycc_rgb_convert_mmx):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 4 dw -F_0_228
 PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
 PW_ONE          times 4 dw  1
-PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jdcolext-mmx.asm"
 
diff --git a/simd/jdcolor-sse2.asm b/simd/i386/jdcolor-sse2.asm
similarity index 80%
rename from simd/jdcolor-sse2.asm
rename to simd/i386/jdcolor-sse2.asm
index 7ff5d05..be443df 100644
--- a/simd/jdcolor-sse2.asm
+++ b/simd/i386/jdcolor-sse2.asm
@@ -2,7 +2,7 @@
 ; jdcolor.asm - colorspace conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 8 dw -F_0_228
 PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jdcolext-sse2.asm"
 
diff --git a/simd/i386/jdmerge-avx2.asm b/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000..244ae74
--- /dev/null
+++ b/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,126 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+; FIX(x) = x * 2^SCALEBITS, rounded to the nearest integer.
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+; Each constant below fills 32 bytes (16 words or 8 dwords).
+PW_F0402        times 16 dw  F_0_402              ; FIX(0.40200)
+PW_MF0228       times 16 dw -F_0_228              ; -FIX(0.22800)
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285     ; (-FIX(0.34414), FIX(0.28586)) pairs
+PW_ONE          times 16 dw  1                    ; word 1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)  ; 0.5 in SCALEBITS fixed point
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+; Default variants: use whatever RGB_* definitions are already in effect.
+%include "jdmrgext-avx2.asm"
+; EXT_RGB variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_RGBX variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_BGR variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_BGRX variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_XBGR variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_XRGB variants of the h2v1 and h2v2 merged upsamplers.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/simd/jdmerge-mmx.asm b/simd/i386/jdmerge-mmx.asm
similarity index 83%
rename from simd/jdmerge-mmx.asm
rename to simd/i386/jdmerge-mmx.asm
index ee58bff..9dd23d1 100644
--- a/simd/jdmerge-mmx.asm
+++ b/simd/i386/jdmerge-mmx.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_merged_upsample_mmx)
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_mmx)
 
 EXTN(jconst_merged_upsample_mmx):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 4 dw -F_0_228
 PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
 PW_ONE          times 4 dw  1
-PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jdmrgext-mmx.asm"
 
diff --git a/simd/jdmerge-sse2.asm b/simd/i386/jdmerge-sse2.asm
similarity index 83%
rename from simd/jdmerge-sse2.asm
rename to simd/i386/jdmerge-sse2.asm
index 236de5a..b210c07 100644
--- a/simd/jdmerge-sse2.asm
+++ b/simd/i386/jdmerge-sse2.asm
@@ -2,7 +2,7 @@
 ; jdmerge.asm - merged upsampling/color conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_merged_upsample_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 8 dw -F_0_228
 PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        32
 
 %include "jdmrgext-sse2.asm"
 
diff --git a/simd/i386/jdmrgext-avx2.asm b/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000..cde4865
--- /dev/null
+++ b/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,577 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; Each Cb/Cr sample is applied to two adjacent Y samples (even/odd pixel).
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Reads row in_row_group_ctr of input_buf[0..2] and writes to output_buf[0].
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0           ; [ebp] holds the pre-alignment frame base
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        3                 ; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp (frame base; args at eax+8..)
+    sub         esp, byte 4                  ; room for the saved frame base
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax                   ; restored by "pop esp" in the epilogue
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below ebp
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return            ; nothing to do for zero width
+
+    push        ecx                     ; save col; ecx is reused as the row index
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0] (Y rows)
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1] (Cb rows)
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2] (Cr rows)
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3        ; ymm3=(all 1's)
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3        ; ymm5=CbH-128 (0xFF80 == -128); "CbH" below
+    vpaddw      ymm2, ymm4, ymm3        ; ymm2=CbL-128; "CbL" below
+    vpaddw      ymm1, ymm7, ymm3        ; ymm1=CrH-128; "CrH" below
+    vpaddw      ymm3, ymm0, ymm3        ; ymm3=CrL-128; "CrL" below
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [GOTOFF(eax,PW_F0402)]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [GOTOFF(eax,PW_ONE)]  ; +1 so the >>1 below rounds
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm6, ymm6, 1                     ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                     ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm7, ymm7, 1                     ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                     ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5                  ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2                  ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1                  ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3                  ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6             ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7             ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1        ; interleave CbH,CrH words for vpmaddwd
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm7, ymm2, ymm3        ; interleave CbL,CrL words for vpmaddwd
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]  ; round before the SCALEBITS shift
+    vpaddd      ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr (overwrites low byte of the GOT address in eax)
+    jmp         short .Yloop_1st        ; 1st pass: (R-Y)L,(G-Y)L,(B-Y)L are in ymm0,ymm2,ymm4
+    alignx      16, 7
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD  ; fewer than SIZEOF_YMMWORD pixels left?
+    jb          short .column_st64        ; yes: partial store of the tail
+
+    test        edi, SIZEOF_YMMWORD-1     ; outptr YMMWORD-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn           ; all columns written
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd           ; 2nd pass uses the (x-Y)H values saved in wk[]
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD  ; ecx now counts remaining output bytes
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF                  ; remaining bytes now live in ymmA
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD                  ; remaining bytes now live in ymmA
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31          ; (redundant: the label follows immediately)
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; swap halves: upper 128 bits -> xmmA
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA                   ; eax = last (up to) 4 bytes; Yctr is dead here
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         BYTE [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD  ; fewer than SIZEOF_YMMWORD pixels left?
+    jb          short .column_st64        ; yes: partial store of the tail
+
+    test        edi, SIZEOF_YMMWORD-1     ; outptr YMMWORD-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn           ; all columns written
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd           ; 2nd pass uses the (x-Y)H values saved in wk[]
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2  ; ecx still counts pixels (4 bytes each) here
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC                  ; remaining pixels now live in ymmA, ymmD
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD                  ; remaining pixels now live in ymmA
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; swap halves: upper 128 bits -> xmmA
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4   ; drop the two pixels just stored
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer (vmovntdq stores above)
+
+.return:
+    vzeroupper                          ; clear the upper halves of all ymm registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Implemented as two calls to the h2v1 routine that share one chroma row.
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Y rows 2*ctr and 2*ctr+1 (ctr = in_row_group_ctr) go to output_buf[0] and [1].
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]  ; eax = output_width
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][ctr]; h2v1 indexes it by ctr again
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp                ; ebx = temporary 3-entry input_buf built on the stack
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; patch entry 0 of the temporary input_buf
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; patch the pushed output_buf argument
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esp, byte 7*SIZEOF_DWORD  ; drop the 4 arguments + 3-entry temporary input_buf
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdmrgext-mmx.asm b/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000..4b9e35d
--- /dev/null
+++ b/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,462 @@
+;
+; jdmrgext-mmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):  ; one output row: 2:1 horizontal upsample + YCbCr->RGB
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for the saved frame pointer
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; stash original ebp; "pop esp" in the epilogue reloads it
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     eax                     ; make room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return            ; output_width == 0: nothing to convert
+
+    push        ecx                     ; park col; ecx is reused for in_row_group_ctr
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
+    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
+
+    pxor        mm1, mm1                ; mm1=(all 0's)
+    pcmpeqw     mm3, mm3                ; mm3=(all 1's)
+    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    movq        mm4, mm6
+    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
+    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
+    movq        mm0, mm7
+    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
+    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
+
+    paddw       mm6, mm3                ; mm6=CbH-128 (0xFF80 == -128 as a word)
+    paddw       mm4, mm3                ; mm4=CbL-128
+    paddw       mm7, mm3                ; mm7=CrH-128
+    paddw       mm0, mm3                ; mm0=CrL-128
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm5, mm6                ; mm5=CbH
+    movq        mm2, mm4                ; mm2=CbL
+    paddw       mm6, mm6                ; mm6=2*CbH
+    paddw       mm4, mm4                ; mm4=2*CbL
+    movq        mm1, mm7                ; mm1=CrH
+    movq        mm3, mm0                ; mm3=CrL
+    paddw       mm7, mm7                ; mm7=2*CrH
+    paddw       mm0, mm0                ; mm0=2*CrL
+
+    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
+    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
+
+    paddw       mm6, [GOTOFF(eax,PW_ONE)]
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
+    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
+    paddw       mm7, [GOTOFF(eax,PW_ONE)]
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
+    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
+
+    paddw       mm6, mm5                ; mm6+=CbH
+    paddw       mm4, mm2                ; mm4+=CbL
+    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+    movq        mm6, mm5                ; mm6=CbH
+    movq        mm7, mm2                ; mm7=CbL
+    punpcklwd   mm5, mm1                ; mm5=(CbH0 CrH0 CbH1 CrH1)
+    punpckhwd   mm6, mm1                ; mm6=(CbH2 CrH2 CbH3 CrH3)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm2, mm3                ; mm2=(CbL0 CrL0 CbL1 CrL1)
+    punpckhwd   mm7, mm3                ; mm7=(CbL2 CrL2 CbL3 CrL3)
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm5, SCALEBITS
+    psrad       mm6, SCALEBITS
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm7, SCALEBITS
+
+    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st        ; first pass: mm0/mm2/mm4 already hold the L halves
+    alignx      16, 7
+
+.Yloop_2nd:                             ; second pass: reload the H halves saved in wk[]
+    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+    pcmpeqw     mm6, mm6                ; mm6=(all 1's)
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm6, mm7                ; mm6=Y(0246)=YE
+    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
+
+    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
+    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
+    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
+
+    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD  ; a full group of SIZEOF_MMWORD columns left?
+    jb          short .column_st16       ; no: store only the partial tail
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD  ; col -= SIZEOF_MMWORD
+    jz          near .endcolumn          ; col == 0: row finished
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd          ; reuse the same chroma for the second Y group
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:                           ; tail: ecx = remaining columns (< SIZEOF_MMWORD)
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD  ; ecx is a byte count from here on
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC                ; remaining bytes now come from mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE                ; remaining bytes now come from mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA                ; eax=low dword of mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         DWORD [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT          ; bring the high dword down
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT           ; bring the high word down
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .endcolumn
+    mov         BYTE [edi+0*SIZEOF_BYTE], al  ; last remaining byte
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD  ; a full group of SIZEOF_MMWORD columns left?
+    jb          short .column_st16       ; no: store only the partial tail
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD  ; col -= SIZEOF_MMWORD
+    jz          short .endcolumn         ; col == 0: row finished
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd          ; reuse the same chroma for the second Y group
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:                           ; tail: ecx = remaining columns (< SIZEOF_MMWORD)
+    cmp         ecx, byte SIZEOF_MMWORD/2  ; ecx stays a column count; each mmword holds 2 pixels
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC                ; shift the unsent pixels down: mmC/mmH become mmA/mmD
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD                ; remaining pixel, if any, now comes from mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .endcolumn
+    movd        DWORD [edi+0*SIZEOF_DWORD], mmA  ; store the one final pixel
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):  ; h2v2 case: calls the h2v1 routine twice
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; &input_buf[0][ctr]; callee indexes by ctr again -> row 2*ctr
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp                ; ebx = temporary 3-entry input_buf array on the stack
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; callee saves/restores ebx, esi, edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; patch temp input_buf[0] for the second row
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; patch the pushed output_buf argument
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)  ; same argument slots, reused in place
+
+    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3 temp array entries
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdmrgext-sse2.asm b/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000..ac4697e
--- /dev/null
+++ b/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,519 @@
+;
+; jdmrgext-sse2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [GOTOFF(eax,PW_MF0228)]  ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [GOTOFF(eax,PW_F0402)]   ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         BYTE [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Implemented as two calls to the h2v1 routine, one per output row.
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][in_row_group_ctr]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp                ; ebx = 3-entry input_buf built on the stack
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)  ; first output row
+    ; ebx, esi and edi survive the call (the callee's epilogue restores them)
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; patch entry 0 of the stack input_buf
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; patch the pushed output_buf argument
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)  ; second output row
+
+    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3 row-array pointers
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdsample-avx2.asm b/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000..61ce511
--- /dev/null
+++ b/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,762 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1                  ; h2v1: bias added to the left-neighbour term (before >> 2)
+PW_TWO   times 16 dw 2                  ; h2v1: bias added to the right-neighbour term (before >> 2)
+PW_THREE times 16 dw 3                  ; weight multiplied into the centre sample/row
+PW_SEVEN times 16 dw 7                  ; h2v2: bias added to the right-neighbour term (before >> 4)
+PW_EIGHT times 16 dw 8                  ; h2v2: bias added to the left-neighbour term (before >> 4)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+; out[2i] = (3*in[i]+in[i-1]+1)>>2;  out[2i+1] = (3*in[i]+in[i+1]+2)>>2
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_YMMWORD-1   ; is colctr a multiple of SIZEOF_YMMWORD?
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]  ; dl = last sample of the row
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm7=( 0 -- -- ... -- --), used as sample "-1"
+
+    add         eax, byte SIZEOF_YMMWORD-1      ; round colctr up to a
+    and         eax, byte -SIZEOF_YMMWORD       ; multiple of SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop               ; more than one block left
+    alignx      16, 7
+
+.columnloop_last:
+    vpcmpeqb    xmm6, xmm6, xmm6
+    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
+    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm6=(-- -- ... -- 31), used as sample "32"
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm6=next block (32 33 ... 63)
+    vperm2i128  ymm6, ymm0, ymm6, 0x20          ; ymm6=(all 0's | 32 33 ... 47)
+    vpslldq     ymm6, ymm6, 15                  ; ymm6=(-- -- ... -- -- 32)
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+
+    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT            ; move the odd outputs into the high byte of each word
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop            ; more than one block left
+    test        eax, eax
+    jnz         near .columnloop_last       ; exactly one block left
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+; Vertical:   Int0 = 3*row[0] + row[-1] (-> outptr0), Int1 = 3*row[0] + row[+1] (-> outptr1)
+; Horizontal: out[2i]=(3*Int[i]+Int[i-1]+8)>>4; out[2i+1]=(3*Int[i]+Int[i+1]+7)>>4
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax                   ; keep the original ebp at [aligned ebp]
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_YMMWORD-1   ; is colctr a multiple of SIZEOF_YMMWORD?
+    jz          short .skip
+    push        edx                     ; dl is needed as scratch below
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1   ; wk(0)=upper row's "-1" value for the first block
+    vmovdqa     YMMWORD [wk(1)], ymm2   ; wk(1)=lower row's "-1" value for the first block
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_YMMWORD-1  ; round colctr up to a
+    and         eax, byte -SIZEOF_YMMWORD   ; multiple of SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop           ; more than one block left
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpcmpeqb    xmm1, xmm1, xmm1
+    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm3, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm3, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1   ; wk(2)=upper row's "32" value (first of the next block)
+    vmovdqa     YMMWORD [wk(3)], ymm2   ; wk(3)=lower row's "32" value (first of the next block)
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm0, ymm1, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm1, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm1, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm1, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4   ; becomes the "-1" value of the next block
+
+    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT    ; move the odd outputs into the high byte of each word
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm7, ymm1, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm1, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm1, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm1, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3   ; becomes the "-1" value of the next block
+
+    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT    ; move the odd outputs into the high byte of each word
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop            ; more than one block left
+    test        eax, eax
+    jnz         near .columnloop_last       ; exactly one block left
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD    ; edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
+    jz          short .return           ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return           ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the row-pointer cursors; esi/edi are
+    push        esi                     ; reused as sample pointers inside the row
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+    ; colctr counts the output samples still to be written in this row
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .above_16          ; more than one YMMWORD of output remains
+    ; final block: one XMMWORD of input becomes two XMMWORDs of output
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 input bytes, each duplicated
+    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 input bytes, each duplicated
+
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    ; reorder qwords (0,1,2,3) -> (0,2,1,3) so the per-lane unpacks below yield sequential output
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
+    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes  0..15, each duplicated
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output were written
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD    ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop          ; repeat while rowctr > 0
+
+.return:
+    vzeroupper                          ; zero the upper halves of the YMM registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+    push        ebx                     ; ebx is used as outptr0 below
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD    ; edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
+    jz          near .return            ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the row-pointer cursors; esi/edi are
+    push        esi                     ; reused as sample pointers inside the row
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+    ; colctr counts the output samples still to be written in each output row
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .above_16         ; more than one YMMWORD of output remains
+    ; final block: one XMMWORD of input becomes two XMMWORDs in both output rows
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 input bytes, each duplicated
+    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 input bytes, each duplicated
+
+    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0  ; second output row receives
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1  ; the same data as the first
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    ; reorder qwords (0,1,2,3) -> (0,2,1,3) so the per-lane unpacks below yield sequential output
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
+    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes  0..15, each duplicated
+
+    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; second output row receives
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1  ; the same data as the first
+
+    sub         eax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output were written per row
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr
+    add         ebx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         edi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop           ; each input row produced two output rows
+
+.return:
+    vzeroupper                          ; zero the upper halves of the YMM registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdsample-mmx.asm b/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000..1f810fa
--- /dev/null
+++ b/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,733 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+    ; Each constant is one value replicated into four 16-bit words (8 bytes).
+PW_ONE   times 4 dw 1                   ; h2v1 fancy: rounding term for even outputs
+PW_TWO   times 4 dw 2                   ; h2v1 fancy: rounding term for odd outputs
+PW_THREE times 4 dw 3                   ; weight of the nearer sample (both fancy routines)
+PW_SEVEN times 4 dw 7                   ; h2v2 fancy: rounding term for odd outputs
+PW_EIGHT times 4 dw 8                   ; h2v2 fancy: rounding term for even outputs
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return            ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip             ; width is already a multiple of SIZEOF_MMWORD
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        mm0, mm0                ; mm0=(all 0's)
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mask that keeps only the lowest byte
+    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]  ; mm7=( 0 - - - - - - -): sample 0 stands in for sample -1
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD  ; colctr rounded up to a multiple of SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop       ; more than one block: use the general path
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     mm6, mm6
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mask that keeps only the highest byte
+    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm6=( - - - - - - - 7): sample 7 stands in for sample 8
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm6=( - - - - - - - 8): first sample of the next block
+
+.upsample:
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, mm1
+    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
+    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
+    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
+
+    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
+    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
+
+    movq        mm7, mm1
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
+    ; widen the three byte vectors to 16-bit words (mm0 is all zeros)
+    movq        mm4, mm1
+    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
+    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
+    movq        mm5, mm2
+    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
+    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
+    movq        mm6, mm3
+    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
+    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
+    ; even output = (3*cur + left + 1) >> 2, odd output = (3*cur + right + 2) >> 2
+    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       mm2, mm1
+    paddw       mm5, mm4
+    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
+    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
+    paddw       mm3, mm1
+    paddw       mm6, mm4
+    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
+    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
+    ; interleave: odd results go to the high byte of each word, even results stay low
+    psllw       mm3, BYTE_BIT
+    psllw       mm6, BYTE_BIT
+    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop        ; at least two blocks remain
+    test        eax, eax
+    jnz         near .columnloop_last   ; exactly one block remains
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop           ; repeat while rowctr > 0
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax              ; keep the unaligned frame address for the epilogue
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]            ; reserve the wk[] scratch area below ebp
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return            ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip             ; width is already a multiple of SIZEOF_MMWORD
+    push        edx                     ; dl is borrowed as a byte temporary below
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
+    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
+    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
+    ; vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1]
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mask that keeps only the lowest word
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+    pand        mm1, mm7                ; mm1=( 0 - - -)
+    pand        mm2, mm7                ; mm2=( 0 - - -)
+
+    movq        MMWORD [wk(0)], mm1     ; wk(0)/wk(1): left neighbour (-1) for the upper/lower row;
+    movq        MMWORD [wk(1)], mm2     ; for the first block it is sample 0 itself
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD  ; colctr rounded up to a multiple of SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop       ; more than one block: use the general path
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     mm1, mm1
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mask that keeps only the highest word
+    movq        mm2, mm1
+
+    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
+    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
+
+    movq        MMWORD [wk(2)], mm1     ; wk(2)/wk(3): right neighbour (8) for the upper/lower row;
+    movq        MMWORD [wk(3)], mm2     ; for the last block it is sample 7 itself
+
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
+    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
+    ; vertical pass for the next block, same formula as for the first block
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
+    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
+
+    movq        MMWORD [wk(2)], mm1     ; right neighbour (8) = first word of the
+    movq        MMWORD [wk(3)], mm2     ; next block's intermediate data
+
+.upsample:
+    ; -- process the upper row
+
+    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
+    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
+
+    movq        mm0, mm7
+    movq        mm4, mm3
+    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
+    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
+    movq        mm5, mm7
+    movq        mm6, mm3
+    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
+    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
+
+    por         mm0, mm4                         ; mm0=( 1 2 3 4)
+    por         mm5, mm6                         ; mm5=( 3 4 5 6)
+
+    movq        mm1, mm7
+    movq        mm2, mm3
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
+    movq        mm4, mm3
+    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
+
+    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
+    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
+
+    movq        MMWORD [wk(0)], mm4     ; word 7 becomes the left neighbour of the next block
+    ; horizontal pass: even = (3*Int + left + 8) >> 4, odd = (3*Int + right + 7) >> 4
+    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm7
+    paddw       mm5, mm3
+    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
+    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
+    paddw       mm0, mm7
+    paddw       mm2, mm3
+    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
+    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
+    ; interleave: odd results go to the high byte of each word, even results stay low
+    psllw       mm0, BYTE_BIT
+    psllw       mm2, BYTE_BIT
+    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; final samples overwrite the
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; intermediate data saved earlier
+
+    ; -- process the lower row
+
+    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
+    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
+
+    movq        mm7, mm6
+    movq        mm3, mm4
+    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
+    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
+    movq        mm0, mm6
+    movq        mm2, mm4
+    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
+    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
+
+    por         mm7, mm3                         ; mm7=( 1 2 3 4)
+    por         mm0, mm2                         ; mm0=( 3 4 5 6)
+
+    movq        mm1, mm6
+    movq        mm5, mm4
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
+    movq        mm3, mm4
+    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
+
+    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
+    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
+
+    movq        MMWORD [wk(1)], mm3     ; word 7 becomes the left neighbour of the next block
+    ; horizontal pass for the lower row, same formula as for the upper row
+    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm6
+    paddw       mm0, mm4
+    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
+    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
+    paddw       mm7, mm6
+    paddw       mm5, mm4
+    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
+    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
+
+    psllw       mm7, BYTE_BIT
+    psllw       mm5, BYTE_BIT
+    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop        ; at least two blocks remain
+    test        eax, eax
+    jnz         near .columnloop_last   ; exactly one block remains
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop           ; each input row produced two output rows
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)  ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
+    jz          short .return           ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return           ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the row-pointer cursors; esi/edi are
+    push        esi                     ; reused as sample pointers inside the row
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+    ; loop body handles two input MMWORDs per pass, with an exit check after each
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0                ; mm0 = lower 4 input bytes, each duplicated
+    punpckhbw   mm1, mm1                ; mm1 = upper 4 input bytes, each duplicated
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD  ; two MMWORDs of output were written
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2                ; mm2 = lower 4 input bytes, each duplicated
+    punpckhbw   mm3, mm3                ; mm3 = upper 4 input bytes, each duplicated
+
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD  ; two more MMWORDs of output were written
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop          ; repeat while rowctr > 0
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+    push        ebx                     ; ebx is used as outptr0 below
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)  ; edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
+    jz          near .return            ; zero width: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return           ; zero rows: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the row-pointer cursors; esi/edi are
+    push        esi                     ; reused as sample pointers inside the row
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+    ; loop body handles two input MMWORDs per pass, with an exit check after each
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0                ; mm0 = lower 4 input bytes, each duplicated
+    punpckhbw   mm1, mm1                ; mm1 = upper 4 input bytes, each duplicated
+
+    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0  ; second output row receives
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1  ; the same data as the first
+
+    sub         eax, byte 2*SIZEOF_MMWORD  ; two MMWORDs of output were written per row
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2                ; mm2 = lower 4 input bytes, each duplicated
+    punpckhbw   mm3, mm3                ; mm3 = upper 4 input bytes, each duplicated
+
+    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2  ; second output row receives
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3  ; the same data as the first
+
+    sub         eax, byte 2*SIZEOF_MMWORD  ; two more MMWORDs of output were written per row
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop          ; each input row produced two output rows
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jdsample-sse2.asm b/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000..f0da626
--- /dev/null
+++ b/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,726 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+; Packed 16-bit constants (8 words = one XMM word each) for the fancy upsamplers.
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1                   ; h2v1: rounding bias added to the left-neighbour sum
+PW_TWO   times 8 dw 2                   ; h2v1: rounding bias added to the right-neighbour sum
+PW_THREE times 8 dw 3                   ; weight applied to the centre sample (both filters)
+PW_SEVEN times 8 dw 7                   ; h2v2: rounding bias added to the right-neighbour sum
+PW_EIGHT times 8 dw 8                   ; h2v2: rounding bias added to the left-neighbour sum
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+; Per row: out[2i] = (3*in[i] + in[i-1] + 1) >> 2, out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2.
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return            ; downsampled_width == 0: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; max_v_samp_factor == 0: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi                     ; save the output_data row cursor
+    push        esi                     ; save the input_data row cursor
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_XMMWORD-1   ; is colctr a whole number of XMM blocks?
+    jz          short .skip             ; yes: no padding sample is needed
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample (written at inptr[colctr])
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask that keeps only byte 0
+    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm7=( 0 -- ... --), stands in for sample -1
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD   ; round colctr up to whole XMM blocks
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop       ; more than one block to process
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6              ; xmm6=(all 1's)
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask that keeps only byte 15
+    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm6=(-- ... -- 15), stands in for sample 16
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; look ahead at the next block
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; xmm6=(-- ... -- 16): its first sample, moved to byte 15
+
+.upsample:
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1                ; carry sample 15 over as the next block's sample -1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre samples (low half)
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre samples (high half)
+    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]    ; left neighbours + 1
+    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]    ; right neighbours + 2
+    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT          ; move odd outputs into the high byte of each word
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         eax, byte SIZEOF_XMMWORD    ; one input block consumed
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop        ; at least two blocks remain
+    test        eax, eax
+    jnz         near .columnloop_last   ; exactly one block remains
+
+    pop         esi                     ; restore the input_data row cursor
+    pop         edi                     ; restore the output_data row cursor
+    pop         eax                     ; restore the unrounded colctr
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+; Int = 3*row[0] + row[-1 or +1]; out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4, out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4.
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; keep the unaligned frame pointer at [original_ebp]
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve the WK_NUM xmmword scratch slots below ebp
+    pushpic     eax                     ; make room for the GOT address (the gotptr slot)
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return            ; downsampled_width == 0: nothing to do
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; max_v_samp_factor == 0: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx                     ; rowctr (ecx is reused as a row pointer below)
+    push        edi                     ; save the output_data row cursor
+    push        esi                     ; save the input_data row cursor
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_XMMWORD-1   ; is colctr a whole number of XMM blocks?
+    jz          short .skip             ; yes: no padding samples are needed
+    push        edx                     ; dl is used as a scratch byte below
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample (in each of the three rows)
+    pop         edx                     ; restore outptr0
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pushpic     ebx                     ; save inptr0; ebx is reloaded with the GOT address (NOTE(review): pushpic/movpic come from jsimdext.inc, presumably PIC-only)
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]  ; 3 * row[0] (low half)
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]  ; 3 * row[0] (high half)
+
+    pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask that keeps only word 0
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0) = word standing in for Int0[-1]
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1) = word standing in for Int1[-1]
+
+    poppic      ebx                     ; restore inptr0
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD   ; round colctr up to whole XMM blocks
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop       ; more than one block to process
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx                     ; save inptr0 (balanced by the poppic after .upsample)
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     xmm1, xmm1              ; xmm1=(all 1's)
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask that keeps only word 7
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]  ; last word of the saved Int0H
+    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]  ; last word of the saved Int1H
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pushpic     ebx                     ; save inptr0 (balanced by the poppic after .upsample)
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]  ; 3 * row[0] (low half)
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]  ; 3 * row[0] (high half)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; wk(2) = next block's first Int0 word, used as Int0[16]
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3) = next block's first Int1 word, used as Int1[16]
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; reload the saved Int0 words
+    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4     ; word 15 becomes the next block's Int0[-1]
+
+    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre Int0 words (low half)
+    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre Int0 words (high half)
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]  ; left neighbours + 8
+    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]  ; right neighbours + 7
+    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT          ; move odd outputs into the high byte of each word
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; final samples overwrite the saved Int0 data
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]  ; reload the saved Int1 words
+    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3     ; word 15 becomes the next block's Int1[-1]
+
+    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre Int1 words (low half)
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]  ; 3 * centre Int1 words (high half)
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]  ; left neighbours + 8
+    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]  ; right neighbours + 7
+    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT          ; move odd outputs into the high byte of each word
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1  ; final samples overwrite the saved Int1 data
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+    poppic      ebx                     ; restore inptr0
+
+    sub         eax, byte SIZEOF_XMMWORD    ; one input block consumed
+    add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop        ; at least two blocks remain
+    test        eax, eax
+    jnz         near .columnloop_last   ; exactly one block remains
+
+    pop         esi                     ; restore the input_data row cursor
+    pop         edi                     ; restore the output_data row cursor
+    pop         ecx                     ; restore rowctr
+    pop         eax                     ; restore the unrounded colctr
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+; Per row: every input byte is written twice in a row, so out[2i] = out[2i+1] = in[i].
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)   ; round output_width up to a multiple of two XMM words
+    jz          short .return           ; rounded width is 0: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return           ; max_v_samp_factor == 0: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the output_data row cursor
+    push        esi                     ; save the input_data row cursor
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; 16 input samples (aligned load)
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 bytes
+    punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 bytes
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD  ; two output XMM words written
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; second input block of this iteration
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2              ; duplicate each of the low 8 bytes
+    punpckhbw   xmm3, xmm3              ; duplicate each of the high 8 bytes
+
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD  ; two more output XMM words written
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi                     ; restore the input_data row cursor
+    pop         edi                     ; restore the output_data row cursor
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+; Each input row is byte-doubled horizontally and stored to two consecutive output rows.
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        ebp
+    mov         ebp, esp                ; arguments are addressed relative to ebp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)   ; round output_width up to a multiple of two XMM words
+    jz          near .return            ; rounded width is 0: nothing to do
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return            ; max_v_samp_factor == 0: nothing to do
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi                     ; save the output_data row cursor
+    push        esi                     ; save the input_data row cursor
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; 16 input samples (aligned load)
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 bytes
+    punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 bytes
+
+    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0  ; same data goes to both output rows
+    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD  ; two output XMM words written per row
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; second input block of this iteration
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2              ; duplicate each of the low 8 bytes
+    punpckhbw   xmm3, xmm3              ; duplicate each of the high 8 bytes
+
+    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD  ; two more output XMM words written per row
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi                     ; restore the input_data row cursor
+    pop         edi                     ; restore the output_data row cursor
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctflt-3dn.asm b/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000..1d45865
--- /dev/null
+++ b/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,320 @@
+;
+; jfdctflt-3dn.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):          ; multipliers used by jsimd_fdct_float_3dnow via pfmul
+
+PD_0_382 times 2 dd 0.382683432365089771728460  ; cos(3*pi/8), one copy per dword lane
+PD_0_707 times 2 dd 0.707106781186547524400844  ; cos(pi/4), one copy per dword lane
+PD_0_541 times 2 dd 0.541196100146196984399723  ; cos(pi/8) - cos(3*pi/8), one copy per dword lane
+PD_1_306 times 2 dd 1.306562964876376527856643  ; cos(pi/8) + cos(3*pi/8), one copy per dword lane
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+; data: block transformed in place (pass 1 = rows, pass 2 = columns).
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data (b = address of the saved ebp)
+
+%define original_ebp  ebp + 0           ; slot holding the pre-alignment frame base
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2                 ; number of mmword scratch slots below ebp
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+    push        ebp                         ; save caller's ebp
+    mov         eax, esp                    ; eax = address of saved ebp (unaligned frame base)
+    sub         esp, byte 4                 ; room for the frame-base pointer stored below
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] = unaligned frame base
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     ebx                         ; macro defined elsewhere; presumably saves ebx for PIC builds
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; edx = data (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2            ; loop count: two rows per iteration
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
+    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
+    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
+
+    movq        mm6, mm4                ; mm6=data1
+    movq        mm7, mm0                ; mm7=data0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
+    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
+    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
+
+    movq        mm3, mm4                ; mm3=data3
+    movq        mm5, mm1                ; mm5=data2
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7                ; mm2=tmp0
+    movq        mm0, mm6                ; mm0=tmp1
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7                     ; mm6=tmp12+tmp13
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2                ; mm4=tmp10
+    movq        mm1, mm7                ; mm1=tmp13
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2  ; store data4
+    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data6
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data0
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data2
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6                ; mm7=tmp7
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6                ; mm4=z13
+    movq        mm1, mm7                ; mm1=z11
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6  ; store data3
+    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data7
+    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data5
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data1
+
+    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance to the next two rows
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; edx = data again; relies on eax from the prologue
+    mov         ecx, DCTSIZE/2            ; loop count: two columns per iteration
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
+    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
+    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
+
+    movq        mm6, mm4                ; mm6=data1
+    movq        mm7, mm0                ; mm7=data0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
+    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
+    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
+
+    movq        mm3, mm4                ; mm3=data3
+    movq        mm5, mm1                ; mm5=data2
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7                ; mm2=tmp0
+    movq        mm0, mm6                ; mm0=tmp1
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7                     ; mm6=tmp12+tmp13
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2                ; mm4=tmp10
+    movq        mm1, mm7                ; mm1=tmp13
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2  ; store data4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data6
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data0
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data2
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6                ; mm7=tmp7
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6                ; mm4=z13
+    movq        mm1, mm7                ; mm1=z11
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6  ; store data3
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7  ; store data7
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4  ; store data5
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1  ; store data1
+
+    add         edx, byte 2*SIZEOF_FAST_FLOAT  ; advance to the next two columns
+    dec         ecx
+    jnz         near .columnloop
+
+    femms                               ; empty MMX/3DNow! state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx                     ; counterpart of pushpic in the prologue
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- saved frame base (address of the saved ebp)
+    pop         ebp                     ; restore caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctflt-sse.asm b/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000..1faf835
--- /dev/null
+++ b/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,371 @@
+;
+; jfdctflt-sse.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44            ; low two floats of %1, then low two floats of %2
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE            ; high two floats of %1, then high two floats of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):            ; multipliers used by jsimd_fdct_float_sse via mulps
+
+PD_0_382 times 4 dd 0.382683432365089771728460  ; cos(3*pi/8), one copy per dword lane
+PD_0_707 times 4 dd 0.707106781186547524400844  ; cos(pi/4), one copy per dword lane
+PD_0_541 times 4 dd 0.541196100146196984399723  ; cos(pi/8) - cos(3*pi/8), one copy per dword lane
+PD_1_306 times 4 dd 1.306562964876376527856643  ; cos(pi/8) + cos(3*pi/8), one copy per dword lane
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+; data: block transformed in place (pass 1 = rows, pass 2 = columns).
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data (b = address of the saved ebp)
+
+%define original_ebp  ebp + 0           ; slot holding the pre-alignment frame base
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2                 ; number of xmmword scratch slots below ebp
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        ebp                          ; save caller's ebp
+    mov         eax, esp                     ; eax = address of saved ebp (unaligned frame base)
+    sub         esp, byte 4                  ; room for the frame-base pointer stored below
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [original_ebp] = unaligned frame base
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     ebx                          ; macro defined elsewhere; presumably saves ebx for PIC builds
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; edx = data (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4            ; loop count: four rows per iteration
+    alignx      16, 7
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7              ; xmm0=data1
+    movaps      xmm5, xmm6              ; xmm5=data0
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7              ; xmm2=data3
+    movaps      xmm3, xmm4              ; xmm3=data2
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5              ; xmm1=tmp0
+    movaps      xmm6, xmm0              ; xmm6=tmp1
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5                    ; xmm0=tmp12+tmp13
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1              ; xmm7=tmp10
+    movaps      xmm4, xmm5              ; xmm4=tmp13
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1  ; store data4
+    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5  ; store data6
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7  ; store data0
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4  ; store data2
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6                    ; xmm2=tmp10-tmp12
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0              ; xmm5=tmp7
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0              ; xmm7=z13
+    movaps      xmm4, xmm5              ; xmm4=z11
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0  ; store data3
+    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5  ; store data7
+    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7  ; store data5
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4  ; store data1
+
+    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance to the next four rows
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; edx = data again; relies on eax from the prologue
+    mov         ecx, DCTSIZE/4            ; loop count: four columns per iteration
+    alignx      16, 7
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7              ; xmm0=data1
+    movaps      xmm5, xmm6              ; xmm5=data0
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7              ; xmm2=data3
+    movaps      xmm3, xmm4              ; xmm3=data2
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5              ; xmm1=tmp0
+    movaps      xmm6, xmm0              ; xmm6=tmp1
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5                    ; xmm0=tmp12+tmp13
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1              ; xmm7=tmp10
+    movaps      xmm4, xmm5              ; xmm4=tmp13
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1  ; store data4
+    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5  ; store data6
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7  ; store data0
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4  ; store data2
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6                    ; xmm2=tmp10-tmp12
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0              ; xmm5=tmp7
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0              ; xmm7=z13
+    movaps      xmm4, xmm5              ; xmm4=z11
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0  ; store data3
+    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5  ; store data7
+    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7  ; store data5
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4  ; store data1
+
+    add         edx, byte 4*SIZEOF_FAST_FLOAT  ; advance to the next four columns
+    dec         ecx
+    jnz         near .columnloop
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx                     ; counterpart of pushpic in the prologue
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- saved frame base (address of the saved ebp)
+    pop         ebp                     ; restore caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctfst-mmx.asm b/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000..0271901
--- /dev/null
+++ b/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,397 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see jfdctfst.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fraction bits of the F_* fixed-point constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541 equ 139  ; FIX(0.541196100) = round(0.541196100 * 2^8)
+F_0_707 equ 181  ; FIX(0.707106781) = round(0.707106781 * 2^8)
+F_1_306 equ 334  ; FIX(1.306562965) = round(1.306562965 * 2^8)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each constant below is written pre-scaled by 2^30 and DESCALE() reduces it to CONST_BITS fraction bits.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; right shift by n bits with rounding
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST               ; constant table read by jsimd_fdct_ifast_mmx
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps only the high 16 bits of each 32-bit product)
+
+%define PRE_MULTIPLY_SCALE_BITS  2  ; left shift applied to a multiplicand before pmulhw
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)  ; left shift applied to the F_* constants below
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT  ; 4 words (one mmword) of pre-shifted FIX(0.707106781)
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT  ; 4 words (one mmword) of pre-shifted FIX(0.382683433)
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT  ; 4 words (one mmword) of pre-shifted FIX(0.541196100)
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT  ; 4 words (one mmword) of pre-shifted FIX(1.306562965)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples (MMX version).
+; The block is transformed in place: results overwrite the input at data.
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data (b = address of saved ebp; +4 is the return address)
+
+%define original_ebp  ebp + 0           ; slot holding the pre-alignment stack pointer (written in the prologue)
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2                 ; number of mmword scratch slots below the aligned ebp
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = address of the saved (original) ebp
+    sub         esp, byte 4                 ; make room for the pointer stored below
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] = pre-alignment stack pointer
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     ebx                         ; macro from jsimdext.inc; presumably saves ebx for PIC builds
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4            ; loop count; each iteration handles 4 rows
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7                ; mm0=data1 (copy)
+    movq        mm5, mm6                ; mm5=data0 (copy)
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7                ; mm2=data3 (copy)
+    movq        mm3, mm4                ; mm3=data2 (copy)
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5                ; mm1=tmp0 (copy)
+    movq        mm6, mm0                ; mm6=tmp1 (copy)
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5                ; mm0=tmp12+tmp13
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS  ; pre-scale so pmulhw's high word is the scaled product
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1                ; mm7=tmp10 (copy)
+    movq        mm4, mm5                ; mm4=tmp13 (copy)
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1  ; store data4
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5  ; store data6
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7  ; store data0
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4  ; store data2
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp10 for pmulhw
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp12 for pmulhw
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp11 for pmulhw
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6                     ; mm2=tmp10-tmp12
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0                ; mm5=tmp7 (copy)
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0                ; mm7=z13 (copy)
+    movq        mm4, mm5                ; mm4=z11 (copy)
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0  ; store data3
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5  ; store data7
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7  ; store data5
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4  ; store data1
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; advance to the next 4 rows
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4            ; loop count; each iteration handles 4 columns
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7                ; mm0=data1 (copy)
+    movq        mm5, mm6                ; mm5=data0 (copy)
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7                ; mm2=data3 (copy)
+    movq        mm3, mm4                ; mm3=data2 (copy)
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5                ; mm1=tmp0 (copy)
+    movq        mm6, mm0                ; mm6=tmp1 (copy)
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5                ; mm0=tmp12+tmp13
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS  ; pre-scale so pmulhw's high word is the scaled product
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1                ; mm7=tmp10 (copy)
+    movq        mm4, mm5                ; mm4=tmp13 (copy)
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1  ; store data4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5  ; store data6
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7  ; store data0
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4  ; store data2
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp10 for pmulhw
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp12 for pmulhw
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp11 for pmulhw
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6                     ; mm2=tmp10-tmp12
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0                ; mm5=tmp7 (copy)
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0                ; mm7=z13 (copy)
+    movq        mm4, mm5                ; mm4=z11 (copy)
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0  ; store data3
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5  ; store data7
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7  ; store data5
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4  ; store data1
+
+    add         edx, byte 4*SIZEOF_DCTELEM  ; advance to the next 4 columns
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx                     ; counterpart of pushpic in the prologue
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- address of the saved (original) ebp
+    pop         ebp                     ; restore the caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctfst-sse2.asm b/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000..f09dadd
--- /dev/null
+++ b/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,405 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, less accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see jfdctfst.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fraction bits of the F_* fixed-point constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541 equ 139  ; FIX(0.541196100) = round(0.541196100 * 2^8)
+F_0_707 equ 181  ; FIX(0.707106781) = round(0.707106781 * 2^8)
+F_1_306 equ 334  ; FIX(1.306562965) = round(1.306562965 * 2^8)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each constant below is written pre-scaled by 2^30 and DESCALE() reduces it to CONST_BITS fraction bits.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; right shift by n bits with rounding
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST               ; constant table read by jsimd_fdct_ifast_sse2
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (pmulhw keeps only the high 16 bits of each 32-bit product)
+
+%define PRE_MULTIPLY_SCALE_BITS  2  ; left shift applied to a multiplicand before pmulhw
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)  ; left shift applied to the F_* constants below
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT  ; 8 words (one xmmword) of pre-shifted FIX(0.707106781)
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT  ; 8 words (one xmmword) of pre-shifted FIX(0.382683433)
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT  ; 8 words (one xmmword) of pre-shifted FIX(0.541196100)
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT  ; 8 words (one xmmword) of pre-shifted FIX(1.306562965)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples (SSE2 version).
+; The block is transformed in place: results overwrite the input at data.
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data (b = address of saved ebp; +4 is the return address)
+
+%define original_ebp  ebp + 0           ; slot holding the pre-alignment stack pointer (written in the prologue)
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2                 ; number of xmmword scratch slots below the aligned ebp
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = address of the saved (original) ebp
+    sub         esp, byte 4                  ; make room for the pointer stored below
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [original_ebp] = pre-alignment stack pointer
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below the aligned ebp
+    pushpic     ebx                          ; macro from jsimdext.inc; presumably saves ebx for PIC builds
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1              ; xmm6=data1 (copy)
+    movdqa      xmm3, xmm0              ; xmm3=data0 (copy)
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1              ; xmm2=data3 (copy)
+    movdqa      xmm5, xmm7              ; xmm5=data2 (copy)
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3              ; xmm4=tmp0 (copy)
+    movdqa      xmm0, xmm6              ; xmm0=tmp1 (copy)
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3              ; xmm6=tmp12+tmp13
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS  ; pre-scale so pmulhw's high word is the scaled product
+    pmulhw      xmm6, [GOTOFF(ebx,PW_F0707)]  ; xmm6=z1
+
+    movdqa      xmm1, xmm4              ; xmm1=tmp10 (copy)
+    movdqa      xmm7, xmm3              ; xmm7=tmp13 (copy)
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp10 for pmulhw
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp12 for pmulhw
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp11 for pmulhw
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z3
+
+    movdqa      xmm4, xmm2                    ; xmm4=tmp10
+    psubw       xmm2, xmm0                    ; xmm2=tmp10-tmp12
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F0382)]  ; xmm2=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1306)]  ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2                    ; xmm4=z2
+    paddw       xmm0, xmm2                    ; xmm0=z4
+
+    movdqa      xmm3, xmm6              ; xmm3=tmp7 (copy)
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6              ; xmm2=z13 (copy)
+    movdqa      xmm5, xmm3              ; xmm5=z11 (copy)
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *) -- reload not needed: edx is not modified by pass 1
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6              ; xmm5=data1 (copy)
+    movdqa      xmm3, xmm1              ; xmm3=data0 (copy)
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6              ; xmm7=data3 (copy)
+    movdqa      xmm0, xmm2              ; xmm0=data2 (copy)
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3              ; xmm4=tmp0 (copy)
+    movdqa      xmm1, xmm5              ; xmm1=tmp1 (copy)
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3              ; xmm5=tmp12+tmp13
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS  ; pre-scale so pmulhw's high word is the scaled product
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z1
+
+    movdqa      xmm6, xmm4              ; xmm6=tmp10 (copy)
+    movdqa      xmm2, xmm3              ; xmm2=tmp13 (copy)
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4  ; store data4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3  ; store data6
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6  ; store data0
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2  ; store data2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp10 for pmulhw
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp12 for pmulhw
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS  ; pre-scale tmp11 for pmulhw
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F0707)]  ; xmm0=z3
+
+    movdqa      xmm4, xmm7                    ; xmm4=tmp10
+    psubw       xmm7, xmm1                    ; xmm7=tmp10-tmp12
+    pmulhw      xmm7, [GOTOFF(ebx,PW_F0382)]  ; xmm7=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1306)]  ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7                    ; xmm4=z2
+    paddw       xmm1, xmm7                    ; xmm1=z4
+
+    movdqa      xmm3, xmm5              ; xmm3=tmp7 (copy)
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5              ; xmm6=z13 (copy)
+    movdqa      xmm2, xmm3              ; xmm2=z11 (copy)
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5  ; store data3
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3  ; store data7
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6  ; store data5
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2  ; store data1
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx                     ; counterpart of pushpic in the prologue
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- address of the saved (original) ebp
+    pop         ebp                     ; restore the caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000..ae258ee
--- /dev/null
+++ b/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,333 @@
+;
+; jfdctint-avx2.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                  ; F_x_xxx = round(x * 2^CONST_BITS)
+%define PASS1_BITS  2                   ; pass 1 results are kept scaled up by 2^PASS1_BITS
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)  ; right shift applied to pass 1 products
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)  ; right shift applied to pass 2 products
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; right shift by n with rounding
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers (two 8-word rows per register; layouts below)
+; %5-%8: Temp registers (overwritten)
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2              ; interleave words 0-3 of each lane
+    vpunpckhwd  %6, %1, %2              ; interleave words 4-7 of each lane
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7              ; interleave dwords 0-1 of each lane
+    vpunpckhdq  %2, %5, %7              ; interleave dwords 2-3 of each lane
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D            ; new qword order = old (1,3,0,2)
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8            ; new qword order = old (0,2,1,3)
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3): each 128-bit lane now holds one column
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
+; %1-%4: In: data1_0, data3_2, data4_5, data6_7 / Out: data0_4, data3_1, data2_6, data7_5
+; %5-%8: Temp registers ("x_y" means x in the low 128-bit lane, y in the high lane)
+; %9:    Pass (1 or 2); selects the data0_4 scaling and PD_DESCALE_P<n>/DESCALE_P<n>
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [GOTOFF(ebx, PW_1_NEG1)]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4 (scaled up by PASS1_BITS)
+%else
+    vpaddw      %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]  ; add rounding term
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4 (shifted right by PASS1_BITS)
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7              ; pair tmp13_12 with tmp12_13 (words 0-3)
+    vpunpckhwd  %6, %6, %7              ; pair tmp13_12 with tmp12_13 (words 4-7)
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]  ; add rounding term
+    vpaddd      %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9  ; descale the 32-bit products
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %3=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2              ; pair z3_4 with z4_3 (words 0-3)
+    vpunpckhwd  %7, %7, %2              ; pair z3_4 with z4_3 (words 4-7)
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4              ; pair tmp4_5 with tmp7_6 (words 0-3)
+    vpunpckhwd  %4, %8, %4              ; pair tmp4_5 with tmp7_6 (words 4-7)
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]  ; add rounding term
+    vpaddd      %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9  ; descale the 32-bit sums
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2              ; pair tmp6_7 with tmp5_4 (words 0-3)
+    vpunpckhwd  %5, %5, %2              ; pair tmp6_7 with tmp5_4 (words 4-7)
+    vpmaddwd    %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]  ; add rounding term
+    vpaddd      %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %8, %8, DESCALE_P %+ %9  ; descale the 32-bit sums
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; low lane:  data2 coefficients
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541  ; high lane: data6 coefficients
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; low lane:  z3 coefficients
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175  ; high lane: z4 coefficients
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; low lane:  tmp4 coefficients
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; high lane: tmp5 coefficients
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562  ; low lane:  tmp6 coefficients
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899  ; high lane: tmp7 coefficients
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)  ; rounding term for the pass 1 shift
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)  ; rounding term for the pass 2 shift
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)  ; rounding term for data0_4 in pass 2
+PW_1_NEG1                  times 8  dw  1   ; low lane:  vpsignw leaves the words unchanged
+                           times 8  dw -1   ; high lane: vpsignw negates the words
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+; The 8x8 block at *data is loaded, transformed in two passes, and stored back.
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data (first stack argument)
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(ebp)]  ; (DCTELEM *)
+    ; Unaligned loads (vmovdqu): two 8-word rows per ymm register.
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+    ; Regroup lanes so that each register holds rows n and n+4.
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data1_0, ymm1=data3_2, ymm2=data4_5, ymm3=data6_7
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+    ; ymm0/ymm1/ymm2/ymm4 now again hold rows n and n+4, as dotranspose expects.
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+    ; Recombine lanes into consecutive row pairs for the stores below.
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper                          ; zero the upper halves of all ymm registers before returning
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000..c6bd959
--- /dev/null
+++ b/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,622 @@
+;
+; jfdctint-mmx.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                  ; F_x_xxx = round(x * 2^CONST_BITS)
+%define PASS1_BITS  2                   ; pass 1 results are kept scaled up by 2^PASS1_BITS
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)  ; right shift applied to pass 1 products
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)  ; right shift applied to pass 2 products
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; right shift by n with rounding
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541  ; (tmp13,tmp12) pairs -> data2
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)   ; (tmp13,tmp12) pairs -> data6
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175  ; (z3,z4) pairs -> z3
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)   ; (z3,z4) pairs -> z4
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899  ; (tmp4,tmp7) pairs -> tmp4
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)   ; (tmp4,tmp7) pairs -> tmp7
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562  ; (tmp5,tmp6) pairs -> tmp5
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)   ; (tmp5,tmp6) pairs -> tmp6
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)  ; rounding term for the pass 1 shift
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)  ; rounding term for the pass 2 shift
+PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)  ; rounding term for data0/data4 in pass 2
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    psllw       mm5, PASS1_BITS         ; mm5=data0
+    psllw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm1, DESCALE_P1
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm2, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm3, DESCALE_P1
+    psrad       mm5, DESCALE_P1
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       mm5, PASS1_BITS         ; mm5=data0
+    psraw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm6, DESCALE_P2
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm2, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm3, DESCALE_P2
+    psrad       mm5, DESCALE_P2
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000..d67dcc1
--- /dev/null
+++ b/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,635 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                  ; fraction bits of the F_* constants
+%define PASS1_BITS  2                   ; extra scale bits kept after pass 1
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)  ; psrad count used in pass 1
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)  ; psrad count used in pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time float arithmetic; values are scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounding shift
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):           ; PW_*: multiplier pairs for pmaddwd
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)  ; rounding term, pass 1
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)  ; rounding term, pass 2
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)  ; rounding term for psraw
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+; data: block transformed in place; movdqa requires 16-byte alignment.
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4                  ; make room to save eax
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; saved for the epilogue
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; allocate wk[] below ebp
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part (data0, data4, data2, data6 are parked in wk(2)..wk(5))
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F130_F054)]   ; xmm7=data2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=data2H
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm4=data6L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm0=data6H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part (data7/data1 end in xmm7/xmm2, data5/data3 in xmm4/xmm5)
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3H
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F117_F078)]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *) -- edx is still valid
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part (stores final rows 0, 4, 2 and 6 of the block)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=data2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F130_F054)]   ; xmm2=data2H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=data6L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm6=data6H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part (stores final rows 7, 1, 5 and 3 of the block)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm1=z3H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F117_F078)]   ; xmm2=z4L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp                     ; undo the push ebp at entry
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctflt-3dn.asm b/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000..73aa18d
--- /dev/null
+++ b/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,453 @@
+;
+; jidctflt-3dn.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414        times 2 dd 1.414213562373095048801689  ; sqrt(2)
+PD_1_847        times 2 dd 1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082        times 2 dd 1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_2_613        times 2 dd 2.613125929752753055713286  ; 2*(cos(pi/8)+cos(3*pi/8))
+PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 8 db CENTERJSAMPLE  ; bias added to packed bytes in pass 2
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+; Pass 1: columns -> float workspace on the stack.  Pass 2: rows -> samples.
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot holding the unaligned frame base
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2                ; wk(0)/wk(1) spill tmp2/tmp3
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for the saved pointer
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] = eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]            ; reserve wk[] and workspace[]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/2                   ; ctr
+    alignx      16, 7
+.columnloop:                            ; two columns per iteration
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT        ; a term in blocks 1-2 is nonzero
+
+    pushpic     ebx                     ; save GOT address
+    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    or          eax, ebx                ; eax = OR of blocks 3..7
+    poppic      ebx                     ; restore GOT address
+    jnz         short .columnDCT        ; relies on poppic leaving flags intact
+
+    ; -- AC terms all zero
+
+    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0                ; put each 16-bit coef in a dword's high half
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; arithmetic shift sign-extends it
+    pi2fd       mm0, mm0                ; packed int32 -> packed float
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(dc0 dc0), first column's value
+    punpckhdq   mm1, mm1                ; mm1=(dc1 dc1), second column's value
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    punpcklwd   mm1, mm1
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+    pi2fd       mm1, mm1
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3                ; mm1=(in2-in6), both already dequantized
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm5, mm5
+    punpcklwd   mm1, mm1
+    psrad       mm5, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm5, mm5
+    pi2fd       mm1, mm1
+
+    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5                ; mm2=(z11-z13)
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4                ; mm0=(z10+z12)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 01)
+    pfadd       mm7, mm3                ; mm7=data1=(10 11)
+    pfsub       mm5, mm1                ; mm5=data7=(70 71)
+    pfsub       mm0, mm3                ; mm0=data6=(60 61)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, mm6                ; transpose coefficients
+    punpckldq   mm6, mm7                ; mm6=(00 10)
+    punpckhdq   mm1, mm7                ; mm1=(01 11)
+    movq        mm3, mm0                ; transpose coefficients
+    punpckldq   mm0, mm5                ; mm0=(60 70)
+    punpckhdq   mm3, mm5                ; mm3=(61 71)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm6, mm7
+    movq        mm1, mm5
+    pfadd       mm7, mm2                ; mm7=data2=(20 21)
+    pfadd       mm5, mm4                ; mm5=data4=(40 41)
+    pfsub       mm6, mm2                ; mm6=data5=(50 51)
+    pfsub       mm1, mm4                ; mm1=data3=(30 31)
+
+    movq        mm0, mm7                ; transpose coefficients
+    punpckldq   mm7, mm1                ; mm7=(20 30)
+    punpckhdq   mm0, mm1                ; mm0=(21 31)
+    movq        mm3, mm5                ; transpose coefficients
+    punpckldq   mm5, mm6                ; mm5=(40 50)
+    punpckhdq   mm3, mm6                ; mm3=(41 51)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/2                     ; ctr
+    alignx      16, 7
+.rowloop:                               ; two output rows per iteration
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3                ; mm1=(block 2 - block 6)
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5                ; mm2=(z11-z13)
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4                ; mm0=(z10+z12)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 10)
+    pfadd       mm7, mm3                ; mm7=data1=(01 11)
+    pfsub       mm5, mm1                ; mm5=data7=(07 17)
+    pfsub       mm0, mm3                ; mm0=data6=(06 16)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm3, mm3
+    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
+    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
+    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
+    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+    pand        mm6, mm3                ; mm6=(00 -- 10 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
+    pand        mm0, mm3                ; mm0=(06 -- 16 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
+    por         mm6, mm7                ; mm6=(00 01 10 11)
+    por         mm0, mm5                ; mm0=(06 07 16 17)
+
+    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
+    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm7, mm1
+    movq        mm5, mm3
+    pfadd       mm1, mm2                ; mm1=data2=(02 12)
+    pfadd       mm3, mm4                ; mm3=data4=(04 14)
+    pfsub       mm7, mm2                ; mm7=data5=(05 15)
+    pfsub       mm5, mm4                ; mm5=data3=(03 13)
+
+    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm4, mm4
+    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
+    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
+    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
+    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+    pand        mm3, mm4                ; mm3=(04 -- 14 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
+    pand        mm1, mm4                ; mm1=(02 -- 12 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
+    por         mm3, mm7                ; mm3=(04 05 14 15)
+    por         mm1, mm5                ; mm1=(02 03 12 13)
+
+    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]
+
+    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
+    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
+    paddb       mm6, mm2                ; add CENTERJSAMPLE to every byte
+    paddb       mm1, mm2                ; add CENTERJSAMPLE to every byte
+
+    movq        mm4, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; first row of the pair
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; second row of the pair
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6   ; eax = output_col
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 2*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctflt-sse.asm b/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000..386650f
--- /dev/null
+++ b/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,573 @@
+;
+; jidctflt-sse.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44            ; low two dwords of dest, then low two of source
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE            ; high two dwords of dest, then high two of source
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414       times 4 dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847       times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082       times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613      times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8))
+PD_0_125       times 4 dd  0.125        ; 1/8
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm1, mm0                   ; mm1=(** 02 ** 03)
+    punpcklwd   mm0, mm0                   ; mm0=(00 00 01 01)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm3, mm1                  ; xmm3=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    movlhps     xmm0, xmm3                 ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm4, mm0                ; mm4=(** 02 ** 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm5, mm1                ; mm5=(** 22 ** 23)
+    punpcklwd   mm1, mm1                ; mm1=(20 20 21 21)
+
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm4, mm4                  ; xmm4=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in2H=(22 23)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in2L=(20 21)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(22 23 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(20 21 ** **)
+
+    punpckhwd   mm6, mm2                ; mm6=(** 42 ** 43)
+    punpcklwd   mm2, mm2                ; mm2=(40 40 41 41)
+    punpckhwd   mm7, mm3                ; mm7=(** 62 ** 63)
+    punpcklwd   mm3, mm3                ; mm3=(60 60 61 61)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in4H=(42 43)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in4L=(40 41)
+    cvtpi2ps    xmm6, mm6                  ; xmm6=(42 43 ** **)
+    cvtpi2ps    xmm2, mm2                  ; xmm2=(40 41 ** **)
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in6H=(62 63)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in6L=(60 61)
+    cvtpi2ps    xmm7, mm7                  ; xmm7=(62 63 ** **)
+    cvtpi2ps    xmm3, mm3                  ; xmm3=(60 61 ** **)
+
+    movlhps     xmm0, xmm4              ; xmm0=in0=(00 01 02 03)
+    movlhps     xmm1, xmm5              ; xmm1=in2=(20 21 22 23)
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm2, xmm6              ; xmm2=in4=(40 41 42 43)
+    movlhps     xmm3, xmm7              ; xmm3=in6=(60 61 62 63)
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm6, mm4                ; mm6=(** 12 ** 13)
+    punpcklwd   mm4, mm4                ; mm4=(10 10 11 11)
+    punpckhwd   mm2, mm0                ; mm2=(** 32 ** 33)
+    punpcklwd   mm0, mm0                ; mm0=(30 30 31 31)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in1H=(12 13)
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in1L=(10 11)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(12 13 ** **)
+    cvtpi2ps    xmm2, mm4                  ; xmm2=(10 11 ** **)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in3H=(32 33)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in3L=(30 31)
+    cvtpi2ps    xmm0, mm2                  ; xmm0=(32 33 ** **)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(30 31 ** **)
+
+    punpckhwd   mm7, mm5                ; mm7=(** 52 ** 53)
+    punpcklwd   mm5, mm5                ; mm5=(50 50 51 51)
+    punpckhwd   mm3, mm1                ; mm3=(** 72 ** 73)
+    punpcklwd   mm1, mm1                ; mm1=(70 70 71 71)
+
+    movlhps     xmm2, xmm4              ; xmm2=in1=(10 11 12 13)
+    movlhps     xmm3, xmm0              ; xmm3=in3=(30 31 32 33)
+
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in5H=(52 53)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in5L=(50 51)
+    cvtpi2ps    xmm4, mm7                  ; xmm4=(52 53 ** **)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(50 51 ** **)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in7H=(72 73)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in7L=(70 71)
+    cvtpi2ps    xmm0, mm3                  ; xmm0=(72 73 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(70 71 ** **)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm5, xmm4              ; xmm5=in5=(50 51 52 53)
+    movlhps     xmm1, xmm0              ; xmm1=in7=(70 71 72 73)
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_0_125)]  ; xmm1=[PD_0_125]
+
+    mulps       xmm6, xmm1              ; descale(1/8)
+    mulps       xmm7, xmm1              ; descale(1/8)
+    mulps       xmm5, xmm1              ; descale(1/8)
+    mulps       xmm0, xmm1              ; descale(1/8)
+
+    movhlps     xmm3, xmm6
+    movhlps     xmm1, xmm7
+    cvtps2pi    mm0, xmm6               ; round to int32, mm0=data0L=(00 10)
+    cvtps2pi    mm1, xmm7               ; round to int32, mm1=data1L=(01 11)
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data0H=(20 30)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data1H=(21 31)
+    packssdw    mm0, mm2                ; mm0=data0=(00 10 20 30)
+    packssdw    mm1, mm3                ; mm1=data1=(01 11 21 31)
+
+    movhlps     xmm6, xmm5
+    movhlps     xmm7, xmm0
+    cvtps2pi    mm4, xmm5               ; round to int32, mm4=data7L=(07 17)
+    cvtps2pi    mm5, xmm0               ; round to int32, mm5=data6L=(06 16)
+    cvtps2pi    mm6, xmm6               ; round to int32, mm6=data7H=(27 37)
+    cvtps2pi    mm7, xmm7               ; round to int32, mm7=data6H=(26 36)
+    packssdw    mm4, mm6                ; mm4=data7=(07 17 27 37)
+    packssdw    mm5, mm7                ; mm5=data6=(06 16 26 36)
+
+    packsswb    mm0, mm5                ; mm0=(00 10 20 30 06 16 26 36)
+    packsswb    mm1, mm4                ; mm1=(01 11 21 31 07 17 27 37)
+
+    movaps      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
+    movaps      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movaps      xmm6, [GOTOFF(ebx,PD_0_125)]  ; xmm6=[PD_0_125]
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm5, xmm3
+    movaps      xmm0, xmm1
+    addps       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+    addps       xmm1, xmm4              ; xmm1=data4=(04 14 24 34)
+    subps       xmm5, xmm2              ; xmm5=data5=(05 15 25 35)
+    subps       xmm0, xmm4              ; xmm0=data3=(03 13 23 33)
+
+    mulps       xmm3, xmm6              ; descale(1/8)
+    mulps       xmm1, xmm6              ; descale(1/8)
+    mulps       xmm5, xmm6              ; descale(1/8)
+    mulps       xmm0, xmm6              ; descale(1/8)
+
+    movhlps     xmm7, xmm3
+    movhlps     xmm2, xmm1
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data2L=(02 12)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data4L=(04 14)
+    cvtps2pi    mm6, xmm7               ; round to int32, mm6=data2H=(22 32)
+    cvtps2pi    mm7, xmm2               ; round to int32, mm7=data4H=(24 34)
+    packssdw    mm2, mm6                ; mm2=data2=(02 12 22 32)
+    packssdw    mm3, mm7                ; mm3=data4=(04 14 24 34)
+
+    movhlps     xmm4, xmm5
+    movhlps     xmm6, xmm0
+    cvtps2pi    mm5, xmm5               ; round to int32, mm5=data5L=(05 15)
+    cvtps2pi    mm4, xmm0               ; round to int32, mm4=data3L=(03 13)
+    cvtps2pi    mm6, xmm4               ; round to int32, mm6=data5H=(25 35)
+    cvtps2pi    mm7, xmm6               ; round to int32, mm7=data3H=(23 33)
+    packssdw    mm5, mm6                ; mm5=data5=(05 15 25 35)
+    packssdw    mm4, mm7                ; mm4=data3=(03 13 23 33)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm2, mm3                ; mm2=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm5                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm6
+    paddb       mm1, mm6
+    paddb       mm2, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm1                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm1                ; mm7=(06 07 16 17 26 27 36 37)
+    movq        mm3, mm2                ; transpose coefficients(phase 1)
+    punpcklbw   mm2, mm4                ; mm2=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm3, mm4                ; mm3=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm2                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm6, mm3                ; transpose coefficients(phase 2)
+    punpcklwd   mm3, mm7                ; mm3=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm6, mm7                ; mm6=(24 25 26 27 34 35 36 37)
+
+    movq        mm1, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm3                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm1, mm3                ; mm1=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctflt-sse2.asm b/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000..9de7139
--- /dev/null
+++ b/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,499 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44            ; selector 0x44: elements 0,1 of the destination, then 0,1 of the source
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE            ; selector 0xEE: elements 2,3 of the destination, then 2,3 of the source
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689  ; scales tmp12 (even part) and tmp11 (odd part)
+PD_1_847        times 4  dd  1.847759065022573512256366  ; scales (z10 + z12) to form z5
+PD_1_082        times 4  dd  1.082392200292393968799446  ; scales z12 on the way to tmp10
+PD_M2_613       times 4  dd -2.613125929752753055713286  ; scales z10 on the way to tmp12
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3); added in pass 2 so the low 16 bits hold roundint(x/8)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE               ; byte-wise bias added to the packed samples in pass 2
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot where the prologue stores the unaligned frame pointer (base for the arguments above)
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2                ; number of xmmword scratch slots directly below the aligned ebp
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        ebp                          ; save caller's frame pointer
+    mov         eax, esp                     ; eax = original ebp (unaligned frame base; arguments start at eax+8)
+    sub         esp, byte 4                  ; make room for the original-ebp slot
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [aligned ebp + 0] = original ebp
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]             ; reserve wk[] and workspace[] below the aligned ebp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address (ebx is the base for every GOTOFF constant access)
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]              ; disabled: eax is expected to still hold it from the prologue
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr (each iteration handles 4 columns)
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; cheap first test: one dword from row 1 ...
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ... OR-ed with one dword from row 2
+    jnz         near .columnDCT                             ; nonzero AC term found -> full IDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; load rows 1..7 of these 4 columns
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2              ; OR all seven AC rows together into xmm1
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1              ; saturating pack: a nonzero word stays a nonzero byte
+    movd        eax, xmm1               ; eax = the 4 packed bytes, one per column
+    test        eax, eax
+    jnz         short .columnDCT        ; any AC term nonzero -> full IDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize the DC terms
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0  ; each column's output is its DC value repeated 8 times
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in0
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in2
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in4
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in6
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3              ; xmm1=in2-in6
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]  ; xmm1=(in2-in6)*1.414213562
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in1
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in3
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in5
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in7
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5              ; xmm2=z11-z13
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4              ; xmm0=z10+z12
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6  ; store (00 10 20 30)
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3  ; store (01 11 21 31)
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1  ; store (02 12 22 32)
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0  ; store (03 13 23 33)
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5  ; store (40 50 60 70)
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6  ; store (41 51 61 71)
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4  ; store (42 52 62 72)
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3  ; store (43 53 63 73)
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop                       ; repeat for the next group of 4 columns
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]  ; four prefetches at a 32-byte stride
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]                ; eax = unaligned frame base, used to reach the arguments
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col (the frame base is not needed again)
+    mov         ecx, DCTSIZE/4                     ; ctr (each iteration produces 4 output rows)
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3              ; xmm1=(block 2 - block 6)
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]  ; scale the difference by 1.414213562
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5              ; xmm2=z11-z13
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4              ; xmm0=z10+z12
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3              ; xmm3=all ones
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4              ; xmm4=all ones
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2        ; add the CENTERJSAMPLE bias to every byte
+    paddb       xmm1, xmm2        ; add the CENTERJSAMPLE bias to every byte
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    pushpic     ebx                     ; save GOT address (ebx is borrowed as a second row pointer below)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6  ; output row 0 = (00 .. 07)
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7  ; output row 2 = (20 .. 27)
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5  ; output row 1 = (10 .. 17)
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3  ; output row 3 = (30 .. 37)
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW    ; advance to the next 4 output row pointers
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    pop         edi                     ; restore the registers pushed in the prologue, in reverse order
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp                     ; restore caller's frame pointer
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctfst-mmx.asm b/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000..d3e8a5d
--- /dev/null
+++ b/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst-mmx.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fraction bits of the F_* constants; 14 is also OK.
+%define PASS1_BITS  2  ; pass 2 descales its results by (PASS1_BITS+3)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so they are given pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounding right shift by n
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; (pmulhw keeps the high 16 bits of each signed 16x16 product, i.e. >> 16.)
+%define PRE_MULTIPLY_SCALE_BITS  2  ; psllw count applied to the data operand before pmulhw
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT  ; 4 words = one mmword
+PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT  ; negated F_1_613
+PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE           ; 8 bytes = one mmword
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot holding the pre-alignment frame pointer
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+; Two passes, four 16-bit lanes per MMX register: pass 1 = columns (dequantize) -> workspace, pass 2 = rows -> output.
+EXTN(jsimd_idct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for the frame-pointer slot
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] = eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]            ; reserve wk[] and workspace[] below ebp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr (4 columns per iteration)
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT        ; cheap pre-check before the full scan
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0                ; mm1=OR of rows 1..7 for these 4 columns
+    packsswb    mm1, mm1                ; saturating pack: nonzero words stay nonzero
+    movd        eax, mm1                ; low 4 bytes cover all 4 lanes
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]  ; dequantize row 0
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0  ; each workspace row = one lane repeated 8x
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]  ; dequantize
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]  ; dequantize
+    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
+    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
+    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
+    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    movq        mm3, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
+    movq        mm0, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
+    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm7
+    movq        mm0, mm1
+    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
+    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
+    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
+    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)
+
+    movq        mm4, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
+    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
+    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)
+
+    movq        mm0, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
+    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
+    movq        mm5, mm3                ; transpose coefficients(phase 2)
+    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
+    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
+    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
+    movq        mm0, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
+    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]                ; eax = base for the argument macros
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here on
+    mov         ecx, DCTSIZE/4                     ; ctr (4 output rows per iteration)
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
+    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
+    psraw       mm6, (PASS1_BITS+3)     ; descale
+    psraw       mm7, (PASS1_BITS+3)     ; descale
+    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
+    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
+    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm3
+    movq        mm1, mm0
+    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
+    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
+    psraw       mm3, (PASS1_BITS+3)     ; descale
+    psraw       mm0, (PASS1_BITS+3)     ; descale
+    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
+    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+
+    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]
+
+    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
+    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)
+
+    paddb       mm6, mm4                ; bias every packed byte by CENTERJSAMPLE
+    paddb       mm7, mm4
+    paddb       mm3, mm4
+    paddb       mm1, mm4
+
+    movq        mm2, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm0, mm3                ; transpose coefficients(phase 1)
+    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm4, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
+    movq        mm1, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; ebx reused as a row pointer
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6   ; 8 samples at column output_col
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctfst-sse2.asm b/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000..83bc414
--- /dev/null
+++ b/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jidctfst-sse2.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fraction bits of the F_* constants; 14 is also OK.
+%define PASS1_BITS  2  ; must match IFAST_SCALE_BITS (checked just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so they are given pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounding right shift by n
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; (pmulhw keeps the high 16 bits of each signed 16x16 product, i.e. >> 16.)
+%define PRE_MULTIPLY_SCALE_BITS  2  ; psllw count applied to the data operand before pmulhw
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT  ; 8 words = one xmmword
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT  ; negated F_1_613
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE           ; 16 bytes = one xmmword
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot holding the pre-alignment frame pointer
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1847)]   ; xmm5=z5
+    pmulhw      xmm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1847)]   ; xmm4=z5
+    pmulhw      xmm6, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctint-avx2.asm b/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000..b3b7b14
--- /dev/null
+++ b/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,455 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                 ; fractional bits of the F_x_xxx fixed-point constants
+%define PASS1_BITS  2                  ; pass 1 output stays scaled up by 2^PASS1_BITS
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)      ; right shift applied at the end of pass 1
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)  ; right shift applied at the end of pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is pre-scaled by 2^30 and rounded down to CONST_BITS.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers (in: rows 0_1, 3_2, 4_5, 7_6; out: rows 0_4, 1_5, 2_6, 3_7)
+; %5-%8: Temp registers (overwritten)
+
+%macro dotranspose 8
+    ; %1=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers (in: in0_4, in3_1, in2_6, in7_5; out: data0_1, data3_2, data4_5, data7_6)
+; %5-%12: Temp registers (%5-%8) and temp storage operands (%9-%12, accessed with vmovdqu)
+; %13:    Pass (1 or 2); selects PD_DESCALE_P1/DESCALE_P1 or PD_DESCALE_P2/DESCALE_P2
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [GOTOFF(ebx,PW_1_NEG1)]  ; %1=in0_(-in4)
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %3, %8, %5
+    vmovdqu     %11, %3                 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %3, %8, %5
+    vmovdqu     %9, %3                  ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %3, %1, %6
+    vmovdqu     %12, %3                 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %3, %1, %6
+    vmovdqu     %10, %3                 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vmovdqu     %3, %9                  ; %3=tmp10_11L
+    vmovdqu     %4, %10                 ; %4=tmp10_11H
+
+    vpaddd      %1, %3, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %4, %8              ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %3, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %4, %8              ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vmovdqu     %7, %11                 ; %7=tmp13_12L
+    vmovdqu     %8, %12                 ; %8=tmp13_12H
+
+    vpaddd      %2, %7, %5              ; %2=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %3, %8, %6              ; %3=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpackssdw   %2, %2, %3              ; %2=data3_2
+
+    vpsubd      %3, %7, %5              ; %3=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %6, %8, %6              ; %6=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %6, %6, DESCALE_P %+ %13
+    vpackssdw   %3, %3, %6              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; low lane: tmp3 weights
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541  ; high lane: tmp2 weights
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; low lane: z3 weights
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175  ; high lane: z4 weights
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; low lane: tmp0 weights
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; high lane: tmp1 weights
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)  ; low lane: tmp3 weights
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)  ; high lane: tmp2 weights
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)  ; rounding bias added before the pass 1 shift
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)  ; rounding bias added before the pass 2 shift
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE  ; added to the packed output bytes
+PW_1_NEG1                  times 8  dw  1              ; vpsignw mask: keep low lane,
+                           times 8  dw -1              ; negate high lane
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits (wk[] is only accessed via unaligned vmovdqu)
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT         ; quick reject: rows 1-2 already have nonzero terms
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0        ; xmm1=OR of rows 1-7
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm4=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+    vpaddb      ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm6=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm4=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm2=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper                          ; clear upper YMM halves before the non-VEX movq stores below
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    mov         edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctint-mmx.asm b/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000..6ca6d06
--- /dev/null
+++ b/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,853 @@
+;
+; jidctint-mmx.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                   ; fractional bits in the F_x_xxx constants
+%define PASS1_BITS  2                    ; extra scaling bits kept in pass-1 output
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)      ; right shift ending pass 1
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)  ; right shift ending pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541   ; (z2,z3) pairs -> tmp3
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)    ; (z2,z3) pairs -> tmp2
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175   ; (z3,z4) pairs -> z3
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)    ; (z3,z4) pairs -> z4
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899   ; (tmp0,tmp3) pairs -> tmp0
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)    ; (tmp0,tmp3) pairs -> tmp3
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562   ; (tmp1,tmp2) pairs -> tmp1
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)    ; (tmp1,tmp2) pairs -> tmp2
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)           ; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)           ; rounding bias added before psrad DESCALE_P2
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE                   ; added (paddb) to every packed output byte
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot holding the pre-alignment frame pointer
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         12               ; number of mmword scratch slots (wk(0)..wk(11))
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+    push        ebp                         ; save caller's ebp
+    mov         eax, esp                    ; eax = unaligned frame pointer (-> saved ebp)
+    sub         esp, byte 4                 ; make room to store eax
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; [original_ebp] = unaligned frame pointer
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]            ; reserve wk[] and workspace[] below ebp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]     ; skipped: code below relies on eax from the prologue
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr (4 columns per iteration)
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; cheap scalar pre-check (clobbers eax)
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT                            ; nonzero bits -> full IDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0                ; mm1 = OR of in1..in7 for these 4 columns
+    packsswb    mm1, mm1                ; 4 words -> 4 bytes; a nonzero word stays nonzero
+    movd        eax, mm1                ; bring the 4 bytes into eax for a scalar test
+    test        eax, eax
+    jnz         short .columnDCT        ; some AC term is nonzero -> full IDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]  ; dequantize in0
+
+    psllw       mm0, PASS1_BITS         ; scale up by PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0  ; each workspace row gets its
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0  ; scaled DC value in all 8 slots
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm3=[PD_DESCALE_P1]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 01 02 03)
+    packssdw    mm2, mm0                ; mm2=data7=(70 71 72 73)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm1=[PD_DESCALE_P1]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P1
+    psrad       mm3, DESCALE_P1
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm4, mm3                ; mm4=data1=(10 11 12 13)
+    packssdw    mm7, mm0                ; mm7=data6=(60 61 62 63)
+
+    movq        mm6, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm4                ; mm5=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm1, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm2                ; mm7=(60 70 61 71)
+    punpckhwd   mm1, mm2                ; mm1=(62 72 63 73)
+
+    movq        mm3, MMWORD [wk(6)]     ; mm3=tmp12L
+    movq        mm0, MMWORD [wk(7)]     ; mm0=tmp12H
+    movq        mm4, MMWORD [wk(10)]    ; mm4=tmp1L
+    movq        mm2, MMWORD [wk(11)]    ; mm2=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
+    movq        MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
+    movq        MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
+
+    movq        mm5, mm3
+    movq        mm6, mm0
+    paddd       mm3, mm4                ; mm3=data2L
+    paddd       mm0, mm2                ; mm0=data2H
+    psubd       mm5, mm4                ; mm5=data5L
+    psubd       mm6, mm2                ; mm6=data5H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm7=[PD_DESCALE_P1]
+
+    paddd       mm3, mm7
+    paddd       mm0, mm7
+    psrad       mm3, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm5, mm7
+    paddd       mm6, mm7
+    psrad       mm5, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm3, mm0                ; mm3=data2=(20 21 22 23)
+    packssdw    mm5, mm6                ; mm5=data5=(50 51 52 53)
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=tmp13L
+    movq        mm4, MMWORD [wk(3)]     ; mm4=tmp13H
+    movq        mm2, MMWORD [wk(8)]     ; mm2=tmp0L
+    movq        mm7, MMWORD [wk(9)]     ; mm7=tmp0H
+
+    movq        mm0, mm1
+    movq        mm6, mm4
+    paddd       mm1, mm2                ; mm1=data3L
+    paddd       mm4, mm7                ; mm4=data3H
+    psubd       mm0, mm2                ; mm0=data4L
+    psubd       mm6, mm7                ; mm6=data4H
+
+    movq        mm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm2=[PD_DESCALE_P1]
+
+    paddd       mm1, mm2
+    paddd       mm4, mm2
+    psrad       mm1, DESCALE_P1
+    psrad       mm4, DESCALE_P1
+    paddd       mm0, mm2
+    paddd       mm6, mm2
+    psrad       mm0, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm1, mm4                ; mm1=data3=(30 31 32 33)
+    packssdw    mm0, mm6                ; mm0=data4=(40 41 42 43)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
+    movq        mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
+
+    movq        mm4, mm3                ; transpose coefficients(phase 1)
+    punpcklwd   mm3, mm1                ; mm3=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm6, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm5                ; mm0=(40 50 41 51)
+    punpckhwd   mm6, mm5                ; mm6=(42 52 43 53)
+
+    movq        mm1, mm7                ; transpose coefficients(phase 2)
+    punpckldq   mm7, mm3                ; mm7=(00 10 20 30)
+    punpckhdq   mm1, mm3                ; mm1=(01 11 21 31)
+    movq        mm5, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
+    movq        mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpckldq   mm0, mm3                ; mm0=(40 50 60 70)
+    punpckhdq   mm7, mm3                ; mm7=(41 51 61 71)
+    movq        mm1, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm4                ; mm6=(42 52 62 72)
+    punpckhdq   mm1, mm4                ; mm1=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block (next 4 columns)
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr (next 4 workspace rows)
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]                ; reload frame pointer (eax is scratch in pass 1)
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here on
+    mov         ecx, DCTSIZE/4                     ; ctr (4 output rows per iteration)
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm3=[PD_DESCALE_P2]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 10 20 30)
+    packssdw    mm2, mm0                ; mm2=data7=(07 17 27 37)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm1=[PD_DESCALE_P2]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm4, mm3                ; mm4=data1=(01 11 21 31)
+    packssdw    mm7, mm0                ; mm7=data6=(06 16 26 36)
+
+    packsswb    mm5, mm7                ; mm5=(00 10 20 30 06 16 26 36)
+    packsswb    mm4, mm2                ; mm4=(01 11 21 31 07 17 27 37)
+
+    movq        mm6, MMWORD [wk(6)]     ; mm6=tmp12L
+    movq        mm1, MMWORD [wk(7)]     ; mm1=tmp12H
+    movq        mm3, MMWORD [wk(10)]    ; mm3=tmp1L
+    movq        mm0, MMWORD [wk(11)]    ; mm0=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
+
+    movq        mm7, mm6
+    movq        mm2, mm1
+    paddd       mm6, mm3                ; mm6=data2L
+    paddd       mm1, mm0                ; mm1=data2H
+    psubd       mm7, mm3                ; mm7=data5L
+    psubd       mm2, mm0                ; mm2=data5H
+
+    movq        mm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm5=[PD_DESCALE_P2]
+
+    paddd       mm6, mm5
+    paddd       mm1, mm5
+    psrad       mm6, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm7, mm5
+    paddd       mm2, mm5
+    psrad       mm7, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    packssdw    mm6, mm1                ; mm6=data2=(02 12 22 32)
+    packssdw    mm7, mm2                ; mm7=data5=(05 15 25 35)
+
+    movq        mm4, MMWORD [wk(2)]     ; mm4=tmp13L
+    movq        mm3, MMWORD [wk(3)]     ; mm3=tmp13H
+    movq        mm0, MMWORD [wk(8)]     ; mm0=tmp0L
+    movq        mm5, MMWORD [wk(9)]     ; mm5=tmp0H
+
+    movq        mm1, mm4
+    movq        mm2, mm3
+    paddd       mm4, mm0                ; mm4=data3L
+    paddd       mm3, mm5                ; mm3=data3H
+    psubd       mm1, mm0                ; mm1=data4L
+    psubd       mm2, mm5                ; mm2=data4H
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm0=[PD_DESCALE_P2]
+
+    paddd       mm4, mm0
+    paddd       mm3, mm0
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm1, mm0
+    paddd       mm2, mm0
+    psrad       mm1, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    movq        mm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm5=[PB_CENTERJSAMP]
+
+    packssdw    mm4, mm3                ; mm4=data3=(03 13 23 33)
+    packssdw    mm1, mm2                ; mm1=data4=(04 14 24 34)
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=(00 10 20 30 06 16 26 36)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(01 11 21 31 07 17 27 37)
+
+    packsswb    mm6, mm1                ; mm6=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm7                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm5
+    paddb       mm3, mm5
+    paddb       mm6, mm5
+    paddb       mm4, mm5
+
+    movq        mm2, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm3                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm3                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm1, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm4                ; mm6=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm1, mm4                ; mm1=(04 05 14 15 24 25 34 35)
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm6                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm7, mm6                ; mm7=(20 21 22 23 30 31 32 33)
+    movq        mm5, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm2                ; mm1=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm5, mm2                ; mm5=(24 25 26 27 34 35 36 37)
+
+    movq        mm3, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm1                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm3, mm1                ; mm3=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm7                ; transpose coefficients(phase 3)
+    punpckldq   mm7, mm5                ; mm7=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm5                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = output row 0 of this group
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; ebx doubles as a row pointer here
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0   ; store (00..07) at output_col
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3   ; store (10..17) at output_col
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7   ; store (20..27) at output_col
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4   ; store (30..37) at output_col
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW  ; next 4 output row pointers
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp                     ; restore caller's ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctint-sse2.asm b/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000..a6bd00a
--- /dev/null
+++ b/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,860 @@
+;
+; jidctint-sse2.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13                  ; fractional bits of the F_* constants
+%define PASS1_BITS  2                   ; pass 1 output stays scaled by 2^PASS1_BITS
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)      ; pass 1 right-shift count
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)  ; pass 2 right-shift count
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time FP arithmetic, so these are pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounded x >> n
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541   ; even part: tmp3
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)    ; even part: tmp2
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175   ; odd part: z3
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)    ; odd part: z4
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899   ; odd part: tmp0
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)    ; odd part: tmp3
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562   ; odd part: tmp1
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)    ; odd part: tmp2
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)  ; rounding bias, pass 1 shift
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)  ; rounding bias, pass 2 shift
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE          ; added to each output byte
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot where the prologue stores eax
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         12               ; XMMWORD scratch slots below ebp
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm2=z3L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F117_F078)]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm3=[PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm1=[PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm7=[PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm2=[PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F117_F078)]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm1=[PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm2=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm5=[PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm7=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm5=[PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctred-mmx.asm b/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000..336ee3b
--- /dev/null
+++ b/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,706 @@
+;
+; jidctred-mmx.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13               ; fraction bits of the F_x_xxx constants
+%define PASS1_BITS    2                ; extra fraction bits kept after pass 1
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)      ; 4x4 pass-1 shift
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)  ; 4x4 pass-2 shift
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)      ; 2x2 pass-1 shift
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)  ; 2x2 pass-2 shift
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM has no compile-time float arithmetic; operands are FIX() with 30 bits.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounded x >> n
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):              ; every entry below is one 8-byte qword
+
+PW_F184_MF076   times 2 dw  F_1_847, -F_0_765   ; x(in2,in6): 4x4 even tmp2
+PW_F256_F089    times 2 dw  F_2_562,  F_0_899   ; x(in1,in3): 4x4 odd tmp2 part
+PW_F106_MF217   times 2 dw  F_1_061, -F_2_172   ; x(in1,in3): 4x4 odd tmp0 part
+PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509   ; x(in5,in7): 4x4 odd tmp2 part
+PW_F145_MF021   times 2 dw  F_1_451, -F_0_211   ; x(in5,in7): 4x4 odd tmp0 part
+PW_F362_MF127   times 2 dw  F_3_624, -F_1_272   ; x(in1,in3): 2x2 tmp0 part
+PW_F085_MF072   times 2 dw  F_0_850, -F_0_720   ; x(in5,in7): 2x2 tmp0 part
+PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)  ; rounding bias, 4x4 pass 1
+PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)  ; rounding bias, 4x4 pass 2
+PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)  ; rounding bias, 2x2 pass 1
+PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)  ; rounding bias, 2x2 pass 2
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE       ; added (paddb) before storing
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+; Note: coefficient row 4 is never read; emms is executed before returning.
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0          ; slot holding the unaligned frame base
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2                ; wk(0)/wk(1) hold tmp0L/tmp0H
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4                 ; room for the [original_ebp] slot
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax                  ; becomes [original_ebp]
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]            ; reserve wk[] and workspace[]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]     ; not reloaded: eax kept from prologue
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr (4 columns per iteration)
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT        ; nonzero bits in either dword -> full IDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm0, mm1                ; mm0=OR of rows 1,2,3,5,6,7 (row 4 unused)
+    packsswb    mm0, mm0                ; saturating pack keeps nonzero words nonzero
+    movd        eax, mm0                ; eax=one byte per column
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero (rows 1,2,3,5,6,7 of these 4 columns)
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS         ; scale DC to pass-1 output precision
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
+    punpckhwd   mm5, mm1                ; mm5=(12 32 13 33)
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3                ; mm6=(50 70 51 71)
+    punpckhwd   mm7, mm3                ; mm7=(52 72 53 73)
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P1_4
+    psrad       mm2, DESCALE_P1_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
+    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P1_4
+    psrad       mm0, DESCALE_P1_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
+    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)
+
+    movq        mm6, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm7, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
+    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
+    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
+    movq        mm3, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
+    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]                ; eax = unaligned frame base
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col
+
+    ; -- Odd part (workspace rows 1,3,5,7)
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part (workspace rows 0,2,6; row 4 is not read)
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P2_4
+    psrad       mm2, DESCALE_P2_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
+    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P2_4
+    psrad       mm0, DESCALE_P2_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
+    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
+    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
+    paddb       mm1, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm1                ; transpose coefficients(phase 1)
+    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; output row 0
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; output row 2
+    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1    ; row 0 <- (00 01 02 03)
+    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0    ; row 2 <- (20 21 22 23)
+
+    psrlq       mm1, 4*BYTE_BIT         ; mm1=(10 11 12 13 -- -- -- --)
+    psrlq       mm0, 4*BYTE_BIT         ; mm0=(30 31 32 33 -- -- -- --)
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; output row 1
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; output row 3
+    movd        DWORD [edx+eax*SIZEOF_JSAMPLE], mm1    ; row 1 <- (10 11 12 13)
+    movd        DWORD [esi+eax*SIZEOF_JSAMPLE], mm0    ; row 3 <- (30 31 32 33)
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+; Note: both passes stay in registers; no stack workspace is allocated.
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx                     ; GOT base; reused as scratch at the end
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+    pcmpeqd     mm7, mm7                ; mm7=all ones
+    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+    movq        mm4, mm0                ; mm4=(10 11 ** 13)
+    movq        mm5, mm2                ; mm5=(50 51 ** 53)
+    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
+    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]  ; in1,in3 part (col0 col1)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]  ; in5,in7 part (col0 col1)
+
+    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
+    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
+    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
+    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
+    por         mm0, mm1                ; mm0=(11 31 13 33)
+    por         mm2, mm3                ; mm2=(51 71 53 73)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]  ; in1,in3 part (col1 col3)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]  ; in5,in7 part (col1 col3)
+
+    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
+
+    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
+    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
+    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
+    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
+    por         mm6, mm1                ; mm6=(15 35 17 37)
+    por         mm3, mm5                ; mm3=(55 75 57 77)
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]  ; in1,in3 part (col5 col7)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]  ; in5,in7 part (col5 col7)
+
+    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
+    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
+
+    ; -- Even part
+
+    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
+    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
+    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
+
+    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
+    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
+    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
+    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
+
+    ; -- Final output stage
+
+    movq        mm3, mm1
+    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
+    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
+    punpckldq   mm1, mm3                ; mm1=(A0 B0)
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2]
+
+    movq        mm4, mm2
+    movq        mm3, mm5
+    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
+    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
+    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
+    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
+
+    paddd       mm1, mm7
+    psrad       mm1, DESCALE_P1_2
+
+    paddd       mm2, mm7
+    paddd       mm5, mm7
+    psrad       mm2, DESCALE_P1_2
+    psrad       mm5, DESCALE_P1_2
+    paddd       mm4, mm7
+    paddd       mm3, mm7
+    psrad       mm4, DESCALE_P1_2
+    psrad       mm3, DESCALE_P1_2
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
+    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
+
+    ; -- Even part
+
+    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
+
+    ; -- Final output stage
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
+
+    movq        mm6, mm1
+    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
+    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
+
+    paddd       mm1, mm0
+    paddd       mm6, mm0
+    psrad       mm1, DESCALE_P2_2
+    psrad       mm6, DESCALE_P2_2
+
+    movq        mm7, mm1                ; transpose coefficients
+    punpckldq   mm1, mm6                ; mm1=(C0 D0)
+    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
+
+    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
+    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; add CENTERJSAMPLE per byte
+
+    movd        ecx, mm1                ; ecx=(C0 D0 C1 D1)
+    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1); GOT base is dead now
+    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; output row 0
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; output row 1
+    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx      ; row 0 <- (C0 D0)
+    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx      ; row 1 <- (C1 D1)
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jidctred-sse2.asm b/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000..97838ba
--- /dev/null
+++ b/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,594 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)      ; 4x4 pass 1 shift
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)  ; 4x4 pass 2 shift
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)      ; 2x2 pass 1 shift
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)  ; 2x2 pass 2 shift
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounding >> n
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765  ; 4x4 even: (in2,in6)->tmp2
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899  ; 4x4 odd:  (in1,in3)->tmp2
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172  ; 4x4 odd:  (in1,in3)->tmp0
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509  ; 4x4 odd:  (in5,in7)->tmp2
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211  ; 4x4 odd:  (in5,in7)->tmp0
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272  ; 2x2 odd:  (in1,in3)->tmp0
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720  ; 2x2 odd:  (in5,in7)->tmp0
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)  ; rounding term, 4x4 pass 1
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)  ; rounding term, 4x4 pass 2
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)  ; rounding term, 2x2 pass 1
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)  ; rounding term, 2x2 pass 2
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE      ; added to the packed output bytes
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; Input row 4 is never loaded; pass 2 ignores column 4 of the pass-1 result.
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4                  ; room for the saved pointer
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax                   ; [original_ebp] = eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT        ; quick reject before the full test
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0              ; squeeze the OR of rows 1-3,5-7 ...
+    packsswb    xmm0, xmm0              ; ... down to 32 bits for the test
+    movd        eax, xmm0
+    test        eax, eax
+    jnz         short .columnDCT        ; some AC term in those rows is nonzero
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F256_F089)]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F256_F089)]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0              ; xmm1=[col1 col3] interleaved
+    punpckhwd   xmm6, xmm3              ; xmm6=[col5 col7] interleaved
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F256_F089)]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F106_MF217)]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3              ; xmm0=[col2 col6] interleaved
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Only input rows 0,1,3,5,7 are loaded, and only columns 0,1,3,5,7 are used.
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --); GOT ptr now dead
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         WORD [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         WORD [esi+eax*SIZEOF_JSAMPLE], cx
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquant-3dn.asm b/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000..1767f44
--- /dev/null
+++ b/simd/i386/jquant-3dn.asm
@@ -0,0 +1,232 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+;
+; Each sample has 0x80 subtracted, is sign-extended, then converted by pi2fd.
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2                 ; two sample rows per iteration
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    pi2fd       mm4, mm4
+    pi2fd       mm2, mm2
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    pi2fd       mm5, mm5
+    pi2fd       mm0, mm0
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    pi2fd       mm6, mm6
+    pi2fd       mm3, mm3
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    pi2fd       mm4, mm4
+    pi2fd       mm1, mm1
+
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+;
+; workspace[i]*divisors[i] + rndint_magic; the low word of each sum is stored.
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
+    movd        mm7, eax
+    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16        ; 16 coefficients per iteration
+    alignx      16, 7
+.quantloop:
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
+    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
+    pfadd       mm2, mm7                ; mm2=(04 ** 05 **)
+    pfadd       mm3, mm7                ; mm3=(06 ** 07 **)
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
+    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
+    movq        mm5, mm2
+    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
+    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)
+
+    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
+    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)
+
+    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm6, mm7                ; mm6=(10 ** 11 **)
+    pfadd       mm1, mm7                ; mm1=(12 ** 13 **)
+    pfadd       mm3, mm7                ; mm3=(14 ** 15 **)
+    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)
+
+    movq        mm5, mm6
+    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
+    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
+    movq        mm1, mm3
+    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
+    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)
+
+    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
+    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquant-mmx.asm b/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000..98932db
--- /dev/null
+++ b/simd/i386/jquant-mmx.asm
@@ -0,0 +1,278 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+    push        ebx                     ; preserved; used as a row pointer
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        mm6, mm6                ; mm6=(all 0's)
+    pcmpeqw     mm7, mm7                ; mm7=(all 1's)
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]  ; column offset within each row
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4          ; loop count (4 rows per iteration)
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
+    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)
+
+    movq        mm4, mm0
+    punpcklbw   mm0, mm6                ; mm0=(0123)
+    punpckhbw   mm4, mm6                ; mm4=(4567)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm6                ; mm1=(89AB)
+    punpckhbw   mm5, mm6                ; mm5=(CDEF)
+
+    paddw       mm0, mm7                ; add 0xFF80 (-128) to each word
+    paddw       mm4, mm7
+    paddw       mm1, mm7
+    paddw       mm5, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+    movq        mm0, mm2
+    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
+    punpckhbw   mm0, mm6                ; mm0=(KLMN)
+    movq        mm4, mm3
+    punpcklbw   mm3, mm6                ; mm3=(OPQR)
+    punpckhbw   mm4, mm6                ; mm4=(STUV)
+
+    paddw       mm2, mm7                ; add 0xFF80 (-128) to each word
+    paddw       mm0, mm7
+    paddw       mm3, mm7
+    paddw       mm4, mm7
+
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+    add         esi, byte 4*SIZEOF_JSAMPROW  ; next 4 row pointers
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; skip 4*DCTSIZE outputs
+    dec         ecx
+    jnz         short .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+;                    DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]  ; input coefficients
+    mov         edx, POINTER [divisors]  ; base of the divisor tables
+    mov         edi, JCOEFPTR [coef_block]  ; output block
+    mov         ah, 2                   ; outer loop count
+    alignx      16, 7
+.quantloop1:
+    mov         al, DCTSIZE2/8/2        ; inner loop count (8 words per pass)
+    alignx      16, 7
+.quantloop2:
+    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+    movq        mm0, mm2                ; mm0/mm1 = working copies
+    movq        mm1, mm3
+
+    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
+    psraw       mm3, (WORD_BIT-1)
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2                ; (val ^ mask) - mask = abs(val)
+    psubw       mm1, mm3
+
+    ;
+    ; MMX is an annoyingly crappy instruction set. It has two
+    ; misfeatures that are causing problems here:
+    ;
+    ; - All multiplications are signed.
+    ;
+    ; - The second operand for the shifts is not treated as packed.
+    ;
+    ;
+    ; We work around the first problem by implementing this algorithm:
+    ;
+    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+    ; {
+    ;   enum { SHORT_BIT = 16 };
+    ;   signed short sx = (signed short)x;
+    ;   signed short sy = (signed short)y;
+    ;   signed long sz;
+    ;
+    ;   sz = (long)sx * (long)sy;    /* signed multiply */
+    ;
+    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
+    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
+    ;
+    ;   return (unsigned long)sz;
+    ; }
+    ;
+    ; (note that a negative sx adds _sy_ and vice versa)
+    ;
+    ; For the second problem, we replace the shift by a multiplication.
+    ; Unfortunately that means we have to deal with the signed issue again.
+    ;
+
+    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]
+
+    movq        mm4, mm0                ; store current value for later
+    movq        mm5, mm1
+    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
+    paddw       mm0, mm4  ; reciprocal is always negative (MSB=1),
+    paddw       mm1, mm5  ; so we always need to add the initial value
+                          ; (input value is never negative as we
+                          ; inverted it at the start of this routine)
+
+    ; here it gets a bit tricky as both scale
+    ; and mm0/mm1 can be negative
+    movq        mm6, MMWORD [SCALE(0,0,edx)]  ; scale
+    movq        mm7, MMWORD [SCALE(0,1,edx)]
+    movq        mm4, mm0                ; keep pre-scale value for fix-ups
+    movq        mm5, mm1
+    pmulhw      mm0, mm6                ; signed high word of value * scale
+    pmulhw      mm1, mm7
+
+    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
+    psraw       mm7, (WORD_BIT-1)
+
+    pand        mm6, mm4                ; and add input if it is
+    pand        mm7, mm5
+    paddw       mm0, mm6
+    paddw       mm1, mm7
+
+    psraw       mm4, (WORD_BIT-1)       ; then check if negative input
+    psraw       mm5, (WORD_BIT-1)
+
+    pand        mm4, MMWORD [SCALE(0,0,edx)]  ; and add scale if it is
+    pand        mm5, MMWORD [SCALE(0,1,edx)]
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2                ; reapply the sign saved in mm2/mm3
+    psubw       mm1, mm3
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+    add         esi, byte 8*SIZEOF_DCTELEM  ; next 8 input words
+    add         edx, byte 8*SIZEOF_DCTELEM  ; same step in each divisor table
+    add         edi, byte 8*SIZEOF_JCOEF  ; next 8 output coefficients
+    dec         al
+    jnz         near .quantloop2
+    dec         ah
+    jnz         near .quantloop1        ; to avoid branch misprediction
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquant-sse.asm b/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000..cc244c4
--- /dev/null
+++ b/simd/i386/jquant-sse.asm
@@ -0,0 +1,210 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+    push        ebx                     ; preserved; used as a row pointer
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7                ; mm7=(all 1's)
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]  ; column offset within each row
+    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2          ; loop count (2 rows per iteration)
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3), *=dropped by psrad
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
+    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
+    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
+    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
+
+    movlhps     xmm0, xmm1              ; xmm0=(0123)
+    movlhps     xmm2, xmm3              ; xmm2=(4567)
+    movlhps     xmm4, xmm5              ; xmm4=(89AB)
+    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; next 2 row pointers
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; skip 2*DCTSIZE outputs
+    dec         ecx
+    jnz         near .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                          FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]  ; input values (FAST_FLOAT *)
+    mov         edx, POINTER [divisors]  ; multiplier table (FAST_FLOAT *)
+    mov         edi, JCOEFPTR [coef_block]  ; output block
+    mov         eax, DCTSIZE2/16        ; loop count (16 values per pass)
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    movhlps     xmm4, xmm0              ; xmm4 low half = xmm0 high half
+    movhlps     xmm5, xmm1
+
+    cvtps2pi    mm0, xmm0               ; low 2 floats -> 2 dwords
+    cvtps2pi    mm1, xmm1
+    cvtps2pi    mm4, xmm4
+    cvtps2pi    mm5, xmm5
+
+    movhlps     xmm6, xmm2              ; xmm6 low half = xmm2 high half
+    movhlps     xmm7, xmm3
+
+    cvtps2pi    mm2, xmm2               ; low 2 floats -> 2 dwords
+    cvtps2pi    mm3, xmm3
+    cvtps2pi    mm6, xmm6
+    cvtps2pi    mm7, xmm7
+
+    packssdw    mm0, mm4                ; 4 dwords -> 4 words (saturating)
+    packssdw    mm1, mm5
+    packssdw    mm2, mm6
+    packssdw    mm3, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT  ; next 16 inputs
+    add         edx, byte 16*SIZEOF_FAST_FLOAT  ; next 16 multipliers
+    add         edi, byte 16*SIZEOF_JCOEF  ; next 16 outputs
+    dec         eax
+    jnz         short .quantloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquantf-sse2.asm b/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000..8d1201c
--- /dev/null
+++ b/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,170 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+    push        ebx                     ; preserved; used as a row pointer
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]  ; column offset within each row
+    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2          ; loop count (2 rows per iteration)
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; next 2 row pointers
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; skip 2*DCTSIZE outputs
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]  ; input values (FAST_FLOAT *)
+    mov         edx, POINTER [divisors]  ; multiplier table (FAST_FLOAT *)
+    mov         edi, JCOEFPTR [coef_block]  ; output block
+    mov         eax, DCTSIZE2/16        ; loop count (16 values per pass)
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0              ; 4 floats -> 4 dwords
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1              ; 8 dwords -> 8 words (saturating)
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT  ; next 16 inputs
+    add         edx, byte 16*SIZEOF_FAST_FLOAT  ; next 16 multipliers
+    add         edi, byte 16*SIZEOF_JCOEF  ; next 16 outputs
+    dec         eax
+    jnz         short .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquanti-avx2.asm b/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000..ea8e1a1
--- /dev/null
+++ b/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,190 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+    push        ebx                     ; preserved; used as a row pointer
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]  ; column offset within each row
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0=(row 0 | row 1)
+    vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2=(row 2 | row 3)
+    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(row 4 | row 5)
+    vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6=(row 6 | row 7)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1        ; bytes -> words (zero-extended)
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7=(all 1's)
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7        ; add 0xFF80 (-128) to each word
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper                          ; clear upper halves of ymm registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        ebp                     ; save caller's frame pointer
+    mov         ebp, esp                ; arguments are addressed via ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]  ; input coefficients
+    mov         edx, POINTER [divisors]  ; base of the divisor tables
+    mov         edi, JCOEFPTR [coef_block]  ; output block
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4              ; ymm0=abs(ymm4); ymm4 keeps the sign
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
+
+    vpsignw     ymm0, ymm0, ymm4        ; apply the sign of the input word
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper                          ; clear upper halves of ymm registers
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jquanti-sse2.asm b/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000..2a69494
--- /dev/null
+++ b/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,203 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can *not*
+; be assembled with Microsoft's MASM or any compatible assembler (including
+; Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+; Load data into workspace, applying unsigned->signed conversion: each 8-bit
+; sample is zero-extended to a 16-bit word and 0xFF80 (-128) is added to it.
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        ebp
+    mov         ebp, esp                ; args are addressed relative to ebp
+    push        ebx                     ; ebx is used as a row pointer below
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4          ; loop counter: 4 rows per pass
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7              ; add 0xFF80 to every word
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7              ; add 0xFF80 to every word
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0  ; aligned
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1  ; stores
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 4*SIZEOF_JSAMPROW         ; next 4 row pointers
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; next 4 workspace rows
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+; Quantize/descale the coefficients, and store into coef_block.  Per word:
+;   q = sign(x) * (((((|x| + corr) * recip) >> 16) * scale) >> 16)
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        ebp
+    mov         ebp, esp                ; args are addressed relative to ebp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/32        ; loop counter: 32 words per pass
+    alignx      16, 7
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4              ; xmm0-3 = working copies
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)      ; xmm4-7 = per-word sign masks
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4              ; (x ^ mask) - mask negates words
+    pxor        xmm1, xmm5              ; whose mask is all 1's
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
+
+    pxor        xmm0, xmm4              ; re-apply the saved sign masks:
+    pxor        xmm1, xmm5              ; (q ^ mask) - mask
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 32*SIZEOF_DCTELEM  ; advance input by 32 words
+    add         edx, byte 32*SIZEOF_DCTELEM  ; advance all three tables too
+    add         edi, byte 32*SIZEOF_JCOEF    ; advance output by 32 coefs
+    dec         eax
+    jnz         near .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
new file mode 100644
index 0000000..1e22384
--- /dev/null
+++ b/simd/i386/jsimd.c
@@ -0,0 +1,1195 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2015, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep their
+ * alignment.  IS_ALIGNED(ptr, order) is nonzero iff ptr is 2^order-aligned.
+ */
+#define IS_ALIGNED(ptr, order) \
+  (((size_t)(ptr) & (((size_t)1 << (order)) - 1)) == 0)
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+/*
+ * Report whether environment variable `name` is set to exactly "1".
+ */
+LOCAL(int)
+env_flag_is_set(const char *name)
+{
+  const char *value = getenv(name);
+
+  return (value != NULL) && (strcmp(value, "1") == 0);
+}
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  if (simd_support != ~0U)
+    return;                     /* already probed on an earlier call */
+  simd_support = jpeg_simd_cpu_support();
+  /* Force different settings through environment variables */
+  if (env_flag_is_set("JSIMD_FORCEMMX"))
+    simd_support &= JSIMD_MMX;
+  if (env_flag_is_set("JSIMD_FORCE3DNOW"))
+    simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+  if (env_flag_is_set("JSIMD_FORCESSE"))
+    simd_support &= JSIMD_SSE | JSIMD_MMX;
+  if (env_flag_is_set("JSIMD_FORCESSE2"))
+    simd_support &= JSIMD_SSE2;
+  if (env_flag_is_set("JSIMD_FORCEAVX2"))
+    simd_support &= JSIMD_AVX2;
+  if (env_flag_is_set("JSIMD_FORCENONE"))
+    simd_support = 0;
+  if (env_flag_is_set("JSIMD_NOHUFFENC"))
+    simd_huffman = 0;
+}
+
+/*
+ * Report whether SIMD rgb_ycc conversion is usable: 1 if AVX2 or SSE2 is
+ * enabled with suitably aligned constants, or if MMX is enabled; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* Only 8-bit samples, 4-byte JDIMENSION and RGB_PIXELSIZE 3 or 4 qualify */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    usable = IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))
+    usable = IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))
+    usable = 1;
+  return usable;
+}
+
+/*
+ * Report whether SIMD rgb_gray conversion is usable: 1 if AVX2 or SSE2 is
+ * enabled with suitably aligned constants, or if MMX is enabled; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* Only 8-bit samples, 4-byte JDIMENSION and RGB_PIXELSIZE 3 or 4 qualify */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    usable = IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))
+    usable = IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))
+    usable = 1;
+  return usable;
+}
+
+/*
+ * Report whether SIMD ycc_rgb conversion is usable: 1 if AVX2 or SSE2 is
+ * enabled with suitably aligned constants, or if MMX is enabled; otherwise 0.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* Only 8-bit samples, 4-byte JDIMENSION and RGB_PIXELSIZE 3 or 4 qualify */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4 ||
+      (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    usable = IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))
+    usable = IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))
+    usable = 1;
+  return usable;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;  /* always "not available"; simd_support is never consulted */
+}
+
+/* Run the rgb_ycc conversion with the fastest SIMD routine that is both
+ * enabled in simd_support and has suitably aligned constants (the same tier
+ * selection that jsimd_can_rgb_ycc() performs). */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extrgb_ycc_convert_mmx;
+    break;
+  case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    mmxfct = jsimd_rgb_ycc_convert_mmx;
+  }
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Run the rgb_gray conversion with the fastest SIMD routine that is both
+ * enabled in simd_support and has suitably aligned constants (the same tier
+ * selection that jsimd_can_rgb_gray() performs). */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    mmxfct = jsimd_extrgb_gray_convert_mmx;
+    break;
+  case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    mmxfct = jsimd_extrgbx_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    mmxfct = jsimd_extbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    mmxfct = jsimd_extbgrx_gray_convert_mmx;
+    break;
+  case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    mmxfct = jsimd_extxbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    mmxfct = jsimd_extxrgb_gray_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    mmxfct = jsimd_rgb_gray_convert_mmx;
+  }
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Run the ycc_rgb conversion with the fastest SIMD routine that is both
+ * enabled in simd_support and has suitably aligned constants (the same tier
+ * selection that jsimd_can_ycc_rgb() performs). */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extrgb_convert_mmx;
+    break;
+  case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extbgr_convert_mmx;
+    break;
+  case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+    break;
+  case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+    break;
+  case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    mmxfct = jsimd_ycc_rgb_convert_mmx;
+  }
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{ /* Empty stub: jsimd_can_ycc_rgb565() in this file always returns 0. */
+}
+
+/*
+ * Report whether a SIMD h2v2 downsampler can be used.
+ *
+ * Returns 1 when the build-time sample/dimension sizes match what the SIMD
+ * code expects and simd_support includes AVX2, SSE2 or MMX; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  const unsigned int usable_tiers = JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/*
+ * Report whether a SIMD h2v1 downsampler can be used.
+ *
+ * Returns 1 when the build-time sample/dimension sizes match what the SIMD
+ * code expects and simd_support includes AVX2, SSE2 or MMX; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  const unsigned int usable_tiers = JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/* Downsample (h2v2) with the first enabled tier: AVX2, SSE2, then MMX. */
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const JDIMENSION width = cinfo->image_width;
+  const JDIMENSION blocks = compptr->width_in_blocks;
+  const int max_v = cinfo->max_v_samp_factor, v = compptr->v_samp_factor;
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(width, max_v, v, blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(width, max_v, v, blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_mmx(width, max_v, v, blocks, input_data,
+                              output_data);
+}
+
+/* Downsample (h2v1) with the first enabled tier: AVX2, SSE2, then MMX. */
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  const JDIMENSION width = cinfo->image_width;
+  const JDIMENSION blocks = compptr->width_in_blocks;
+  const int max_v = cinfo->max_v_samp_factor, v = compptr->v_samp_factor;
+
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(width, max_v, v, blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(width, max_v, v, blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_mmx(width, max_v, v, blocks, input_data,
+                              output_data);
+}
+
+/*
+ * Report whether a SIMD h2v2 upsampler can be used.
+ *
+ * Returns 1 when the build-time sample/dimension sizes match what the SIMD
+ * code expects and simd_support includes AVX2, SSE2 or MMX; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  const unsigned int usable_tiers = JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/*
+ * Report whether a SIMD h2v1 upsampler can be used.
+ *
+ * Returns 1 when the build-time sample/dimension sizes match what the SIMD
+ * code expects and simd_support includes AVX2, SSE2 or MMX; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  const unsigned int usable_tiers = JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/* Upsample (h2v2) with the first enabled tier: AVX2, SSE2, then MMX. */
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  const int max_v = cinfo->max_v_samp_factor;
+  const JDIMENSION width = cinfo->output_width;
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(max_v, width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(max_v, width, input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_mmx(max_v, width, input_data, output_data_ptr);
+}
+
+/* Upsample (h2v1) with the first enabled tier: AVX2, SSE2, then MMX. */
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  const int max_v = cinfo->max_v_samp_factor;
+  const JDIMENSION width = cinfo->output_width;
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(max_v, width, input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(max_v, width, input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_mmx(max_v, width, input_data, output_data_ptr);
+}
+
+/*
+ * Report whether a SIMD h2v2 fancy upsampler is usable (1) or not (0).
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)        /* AVX2 needs 32-byte aligned consts */
+    usable = IS_ALIGNED_AVX(jconst_fancy_upsample_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))   /* SSE2 needs 16-byte */
+    usable = IS_ALIGNED_SSE(jconst_fancy_upsample_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))    /* MMX: no alignment test */
+    usable = 1;
+  return usable;
+}
+
+/*
+ * Report whether a SIMD h2v1 fancy upsampler is usable (1) or not (0).
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)        /* AVX2 needs 32-byte aligned consts */
+    usable = IS_ALIGNED_AVX(jconst_fancy_upsample_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))   /* SSE2 needs 16-byte */
+    usable = IS_ALIGNED_SSE(jconst_fancy_upsample_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))    /* MMX: no alignment test */
+    usable = 1;
+  return usable;
+}
+
+/* Dispatch with the same tier checks as jsimd_can_h2v2_fancy_upsample(). */
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  const int max_v = cinfo->max_v_samp_factor;
+  const JDIMENSION width = compptr->downsampled_width;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    jsimd_h2v2_fancy_upsample_avx2(max_v, width, input_data, output_data_ptr);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v2_fancy_upsample_sse2(max_v, width, input_data, output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_mmx(max_v, width, input_data, output_data_ptr);
+}
+
+/* Dispatch with the same tier checks as jsimd_can_h2v1_fancy_upsample(). */
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  const int max_v = cinfo->max_v_samp_factor;
+  const JDIMENSION width = compptr->downsampled_width;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    jsimd_h2v1_fancy_upsample_avx2(max_v, width, input_data, output_data_ptr);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    jsimd_h2v1_fancy_upsample_sse2(max_v, width, input_data, output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_mmx(max_v, width, input_data, output_data_ptr);
+}
+
+/*
+ * Report whether a SIMD h2v2 merged upsampler is usable (1) or not (0).
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)        /* AVX2 needs 32-byte aligned consts */
+    usable = IS_ALIGNED_AVX(jconst_merged_upsample_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))   /* SSE2 needs 16-byte */
+    usable = IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))    /* MMX: no alignment test */
+    usable = 1;
+  return usable;
+}
+
+/*
+ * Report whether a SIMD h2v1 merged upsampler is usable (1) or not (0).
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for 8-bit samples and 4-byte JDIMENSION only */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)        /* AVX2 needs 32-byte aligned consts */
+    usable = IS_ALIGNED_AVX(jconst_merged_upsample_avx2);
+  if (!usable && (simd_support & JSIMD_SSE2))   /* SSE2 needs 16-byte */
+    usable = IS_ALIGNED_SSE(jconst_merged_upsample_sse2);
+  if (!usable && (simd_support & JSIMD_MMX))    /* MMX: no alignment test */
+    usable = 1;
+  return usable;
+}
+
+/* Run h2v2 merged upsampling with the fastest SIMD routine that is both
+ * enabled in simd_support and has suitably aligned constants (the same tier
+ * selection that jsimd_can_h2v2_merged_upsample() performs). */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_merged_upsample_mmx;
+  }
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/* Run h2v1 merged upsampling with the fastest SIMD routine that is both
+ * enabled in simd_support and has suitably aligned constants (the same tier
+ * selection that jsimd_can_h2v1_merged_upsample() performs). */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX: case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX: case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR: case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB: case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_merged_upsample_mmx;
+  }
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+/*
+ * Report whether a SIMD convsamp routine can be used.
+ *
+ * Returns 1 when the build-time constants (DCTSIZE, BITS_IN_JSAMPLE and the
+ * sizes of JDIMENSION and DCTELEM) match what the SIMD code expects and
+ * simd_support includes AVX2, SSE2 or MMX; returns 0 otherwise.
+ * init_simd() is called first so that simd_support is populated.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  const unsigned int usable_tiers = JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/*
+ * Report whether a SIMD convsamp_float routine can be used.
+ *
+ * Returns 1 when the build-time constants (DCTSIZE, BITS_IN_JSAMPLE and the
+ * sizes of JDIMENSION and FAST_FLOAT) match what the SIMD code expects and
+ * simd_support includes SSE2, SSE or 3DNow!; returns 0 otherwise.
+ * init_simd() is called first so that simd_support is populated.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  const unsigned int usable_tiers = JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  /* Any one of the listed instruction sets is sufficient */
+  return (simd_support & usable_tiers) ? 1 : 0;
+}
+
+/* Convert samples with the first enabled tier: AVX2, then SSE2, then MMX. */
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  void (*convsamp) (JSAMPARRAY, JDIMENSION, DCTELEM *) =
+    (simd_support & JSIMD_AVX2) ? jsimd_convsamp_avx2 :
+    (simd_support & JSIMD_SSE2) ? jsimd_convsamp_sse2 : jsimd_convsamp_mmx;
+
+  convsamp(sample_data, start_col, workspace);
+}
+
+/*
+ * Run the floating-point sample conversion.  SSE2 is preferred, then SSE;
+ * the 3DNow! routine is used when neither of those bits is set in
+ * simd_support.  init_simd() is not called here.
+ */
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  if (!(simd_support & (JSIMD_SSE2 | JSIMD_SSE)))
+    /* Neither SSE flavour is flagged: fall back to 3DNow!. */
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+}
+
+/*
+ * Report whether an accelerated slow-integer forward DCT is usable.
+ *
+ * After init_simd(), requires DCTSIZE of 8 and a 2-byte DCTELEM.  The AVX2
+ * and SSE2 tiers additionally require their constant tables to pass
+ * IS_ALIGNED_AVX() / IS_ALIGNED_SSE(); the MMX tier has no alignment test.
+ * Returns 1 if any tier qualifies, else 0.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_fdct_islow_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether an accelerated fast-integer forward DCT is usable.
+ *
+ * After init_simd(), returns 1 when DCTSIZE is 8, DCTELEM is 2 bytes, and
+ * either SSE2 is flagged with jconst_fdct_ifast_sse2 passing
+ * IS_ALIGNED_SSE(), or MMX is flagged.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE == 8 && sizeof(DCTELEM) == 2) {
+    if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+      usable = 1;
+    else if (simd_support & JSIMD_MMX)
+      usable = 1;
+  }
+
+  return usable;
+}
+
+/*
+ * Report whether an accelerated floating-point forward DCT is usable.
+ *
+ * After init_simd(), returns 1 when DCTSIZE is 8, FAST_FLOAT is 4 bytes, and
+ * either SSE is flagged with jconst_fdct_float_sse passing IS_ALIGNED_SSE(),
+ * or 3DNow! is flagged.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* Bail out unless the layout is the one the SIMD code expects. */
+  if (!(DCTSIZE == 8 && sizeof(FAST_FLOAT) == 4))
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+
+  return (simd_support & JSIMD_3DNOW) ? 1 : 0;
+}
+
+/*
+ * Run the slow-integer forward DCT on `data` with the best usable back end.
+ *
+ * The tier selection mirrors jsimd_can_fdct_islow(): AVX2 and SSE2 are only
+ * chosen when their constant tables pass the same IS_ALIGNED_AVX() /
+ * IS_ALIGNED_SSE() tests that the capability check applies.  Previously the
+ * alignment was ignored here, so a CPU with AVX2 whose AVX2 table was
+ * misaligned would still be sent to the AVX2 routine even though the
+ * capability check had only approved a lower tier.
+ *
+ * simd_support is read as-is (init_simd() is not called here); MMX is the
+ * unconditional fallback.
+ */
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    jsimd_fdct_islow_avx2(data);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_islow_sse2(data);
+  else
+    jsimd_fdct_islow_mmx(data);
+}
+
+/*
+ * Run the fast-integer forward DCT on `data`.
+ *
+ * SSE2 is used only when it is flagged in simd_support and the ifast
+ * constant table, jconst_fdct_ifast_sse2, passes IS_ALIGNED_SSE(); this is
+ * the same table jsimd_can_fdct_ifast() validates.  (The alignment test used
+ * to look at jconst_fdct_islow_sse2, the table of a different algorithm.)
+ * Otherwise the MMX routine is used.  init_simd() is not called here.
+ */
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else
+    jsimd_fdct_ifast_mmx(data);
+}
+
+/*
+ * Run the floating-point forward DCT on `data`.
+ *
+ * SSE is used when flagged and jconst_fdct_float_sse passes
+ * IS_ALIGNED_SSE(); otherwise 3DNow! is used when flagged.  If neither
+ * applies, no routine is called and `data` is left untouched.
+ */
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) {
+    jsimd_fdct_float_sse(data);
+    return;
+  }
+
+  if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+/*
+ * Report whether the accelerated integer quantizer is usable.
+ *
+ * After init_simd(), returns 1 when DCTSIZE is 8, JCOEF and DCTELEM are both
+ * 2 bytes, and simd_support has AVX2, SSE2, or MMX set.  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & (JSIMD_AVX2 | JSIMD_SSE2 | JSIMD_MMX)) != 0;
+}
+
+/*
+ * Report whether the accelerated floating-point quantizer is usable.
+ *
+ * After init_simd(), returns 1 when DCTSIZE is 8, JCOEF is 2 bytes,
+ * FAST_FLOAT is 4 bytes, and simd_support has SSE2, SSE, or 3DNow! set.
+ * Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  int result = 0;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE == 8 && sizeof(JCOEF) == 2 && sizeof(FAST_FLOAT) == 4) {
+    if (simd_support & (JSIMD_SSE2 | JSIMD_SSE | JSIMD_3DNOW))
+      result = 1;
+  }
+
+  return result;
+}
+
+/*
+ * Quantize one block with the first available integer back end, tried in
+ * the order AVX2, SSE2, MMX.  All three arguments are forwarded unchanged.
+ * simd_support is read as-is; init_simd() is not called here.
+ */
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (!(simd_support & (JSIMD_AVX2 | JSIMD_SSE2)))
+    /* Neither wide instruction set is flagged: MMX is the fallback. */
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+/*
+ * Quantize one block with the first available floating-point back end,
+ * tried in the order SSE2, SSE, 3DNow!.  The 3DNow! routine is the
+ * unconditional fallback.  init_simd() is not called here.
+ */
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support & JSIMD_SSE2) {
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+    return;
+  }
+
+  if (simd_support & JSIMD_SSE) {
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+    return;
+  }
+
+  jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+/*
+ * Report whether the accelerated 2x2 reduced-size inverse DCT is usable.
+ *
+ * After init_simd(), requires DCTSIZE of 8, 2-byte JCOEF, 8-bit samples,
+ * 4-byte JDIMENSION and 2-byte ISLOW_MULT_TYPE.  Returns 1 when SSE2 is
+ * flagged and jconst_idct_red_sse2 passes IS_ALIGNED_SSE(), or when MMX is
+ * flagged.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_red_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether the accelerated 4x4 reduced-size inverse DCT is usable.
+ *
+ * The conditions are the same as for the 2x2 variant: after init_simd(),
+ * DCTSIZE of 8, 2-byte JCOEF, 8-bit samples, 4-byte JDIMENSION, 2-byte
+ * ISLOW_MULT_TYPE, and either SSE2 with an aligned jconst_idct_red_sse2 or
+ * MMX.  Returns 1 when they hold, 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  int layout_ok;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  layout_ok = DCTSIZE == 8 && sizeof(JCOEF) == 2 && BITS_IN_JSAMPLE == 8 &&
+              sizeof(JDIMENSION) == 4 && sizeof(ISLOW_MULT_TYPE) == 2;
+  if (!layout_ok)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return (simd_support & JSIMD_MMX) ? 1 : 0;
+}
+
+/*
+ * Run the 2x2 reduced-size inverse DCT for one block.
+ *
+ * The SSE2 routine is used when SSE2 is flagged and jconst_idct_red_sse2
+ * passes IS_ALIGNED_SSE(); otherwise the MMX routine is used.  Both receive
+ * compptr->dct_table, coef_block, output_buf and output_col.  `cinfo` is not
+ * used in this function.
+ */
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) {
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+    return;
+  }
+
+  jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+/*
+ * Run the 4x4 reduced-size inverse DCT for one block.
+ *
+ * The MMX routine is used unless SSE2 is flagged and jconst_idct_red_sse2
+ * passes IS_ALIGNED_SSE(), in which case the SSE2 routine is used.  `cinfo`
+ * is not used in this function.
+ */
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if (!((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)))
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+  else
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+/*
+ * Report whether an accelerated slow-integer inverse DCT is usable.
+ *
+ * After init_simd(), requires DCTSIZE of 8, 2-byte JCOEF, 8-bit samples,
+ * 4-byte JDIMENSION and 2-byte ISLOW_MULT_TYPE.  The AVX2 and SSE2 tiers
+ * also require their constant tables to pass IS_ALIGNED_AVX() /
+ * IS_ALIGNED_SSE(); the MMX tier has no alignment test.  Returns 1 if any
+ * tier qualifies, else 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  return ((simd_support & JSIMD_AVX2) &&
+          IS_ALIGNED_AVX(jconst_idct_islow_avx2)) ||
+         ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_islow_sse2)) ||
+         (simd_support & JSIMD_MMX);
+}
+
+/*
+ * Report whether an accelerated fast-integer inverse DCT is usable.
+ *
+ * After init_simd(), requires DCTSIZE of 8, 2-byte JCOEF, 8-bit samples,
+ * 4-byte JDIMENSION, 2-byte IFAST_MULT_TYPE and IFAST_SCALE_BITS of 2.
+ * Returns 1 when SSE2 is flagged and jconst_idct_ifast_sse2 passes
+ * IS_ALIGNED_SSE(), or when MMX is flagged.  Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  int usable = 0;
+
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8 ||
+      sizeof(JDIMENSION) != 4 || sizeof(IFAST_MULT_TYPE) != 2 ||
+      IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    usable = 1;
+  else if (simd_support & JSIMD_MMX)
+    usable = 1;
+
+  return usable;
+}
+
+/*
+ * Report whether an accelerated floating-point inverse DCT is usable.
+ *
+ * After init_simd(), requires DCTSIZE of 8, 2-byte JCOEF, 8-bit samples,
+ * and 4-byte JDIMENSION, FAST_FLOAT and FLOAT_MULT_TYPE.  The SSE2 and SSE
+ * tiers also require their constant tables to pass IS_ALIGNED_SSE(); the
+ * 3DNow! tier has no alignment test.  Returns 1 if any tier qualifies,
+ * else 0.
+ */
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(FAST_FLOAT) != 4 ||
+      sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  return ((simd_support & JSIMD_SSE2) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse2)) ||
+         ((simd_support & JSIMD_SSE) &&
+          IS_ALIGNED_SSE(jconst_idct_float_sse)) ||
+         (simd_support & JSIMD_3DNOW);
+}
+
+/*
+ * Run the slow-integer inverse DCT for one block with the best usable back
+ * end.
+ *
+ * The tier selection mirrors jsimd_can_idct_islow(): AVX2 and SSE2 are only
+ * chosen when their constant tables pass the same IS_ALIGNED_AVX() /
+ * IS_ALIGNED_SSE() tests that the capability check applies, as the other
+ * IDCT dispatchers in this file already do.  Previously the alignment was
+ * ignored here, so the AVX2 (or SSE2) routine could be entered with a
+ * misaligned table when the capability check had only approved a lower
+ * tier.
+ *
+ * Each routine receives compptr->dct_table, coef_block, output_buf and
+ * output_col.  `cinfo` is not used in this function.  MMX is the
+ * unconditional fallback; init_simd() is not called here.
+ */
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if ((simd_support & JSIMD_SSE2) &&
+           IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+/*
+ * Run the fast-integer inverse DCT for one block.
+ *
+ * The SSE2 routine is used when SSE2 is flagged and jconst_idct_ifast_sse2
+ * passes IS_ALIGNED_SSE(); otherwise the MMX routine is used.  `cinfo` is
+ * not used in this function.
+ */
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) {
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+    return;
+  }
+
+  jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                       output_col);
+}
+
+/*
+ * Run the floating-point inverse DCT for one block.
+ *
+ * Tiers are tried in the order SSE2, SSE, 3DNow!.  The first two are taken
+ * only when the instruction set is flagged and its constant table passes
+ * IS_ALIGNED_SSE(); the 3DNow! routine is the unconditional fallback.
+ * `cinfo` is not used in this function.
+ */
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) {
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+    return;
+  }
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) {
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+    return;
+  }
+
+  jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+/*
+ * Report whether the accelerated Huffman block encoder is usable.
+ *
+ * After init_simd(), returns 1 only when DCTSIZE is 8, JCOEF is 2 bytes,
+ * SSE2 is flagged in simd_support, simd_huffman is non-zero, and
+ * jconst_huff_encode_one_block passes IS_ALIGNED_SSE().  Returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8 || sizeof(JCOEF) != 2)
+    return 0;
+
+  /* Checked in the original order: CPU flag, huffman switch, alignment. */
+  if (!(simd_support & JSIMD_SSE2))
+    return 0;
+  if (!simd_huffman)
+    return 0;
+
+  return IS_ALIGNED_SSE(jconst_huff_encode_one_block) ? 1 : 0;
+}
+
+/*
+ * Huffman-encode one block by forwarding every argument, unchanged, to
+ * jsimd_huff_encode_one_block_sse2() and returning its result.
+ *
+ * No CPU-capability test is made here.
+ * NOTE(review): presumably callers gate on
+ * jsimd_can_huff_encode_one_block() first -- confirm.
+ */
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  JOCTET *next_out;
+
+  next_out = jsimd_huff_encode_one_block_sse2(state, buffer, block,
+                                              last_dc_val, dctbl, actbl);
+  return next_out;
+}
diff --git a/simd/i386/jsimdcpu.asm b/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000..50a0d51
--- /dev/null
+++ b/simd/i386/jsimdcpu.asm
@@ -0,0 +1,131 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+; Returns, in eax, the OR of the JSIMD_MMX, JSIMD_SSE, JSIMD_SSE2,
+; JSIMD_AVX2, and JSIMD_3DNOW flags detected below.  The result is 0 when
+; CPUID is unavailable or reports no standard feature leaf.  ebx and edi are
+; saved and restored; ecx and edx are clobbered.
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+    push        edi
+
+    xor         edi, edi                ; simd support flag
+
+    pushfd
+    pop         eax
+    mov         edx, eax
+    xor         eax, 1<<21              ; flip ID bit in EFLAGS
+    push        eax
+    popfd
+    pushfd
+    pop         eax
+    xor         eax, edx
+    jz          near .return            ; CPUID is not supported
+
+    ; Make sure CPUID leaf 1 exists (leaf 0 returns the highest basic leaf)
+    xor         eax, eax
+    cpuid
+    test        eax, eax
+    jz          near .return
+
+    ; Check for MMX/SSE/SSE2 instruction support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
+    ; Check for AVX2 instruction support
+    mov         eax, 7
+    xor         ecx, ecx
+    cpuid
+    mov         eax, ebx
+    test        eax, 1<<5               ; bit5:AVX2
+    jz          short .no_avx2
+
+    ; Check for AVX2 O/S support
+    mov         eax, 1
+    xor         ecx, ecx
+    cpuid
+    test        ecx, 1<<27
+    jz          short .no_avx2          ; O/S does not support XSAVE
+    test        ecx, 1<<28
+    jz          short .no_avx2          ; CPU does not support AVX
+
+    ; XCR0 bit 1 (XMM state) and bit 2 (YMM state) must BOTH be set.  A plain
+    ; "test eax, 6 / jz" would accept either bit alone, e.g. an O/S that
+    ; saves XMM but not YMM registers.
+    xor         ecx, ecx
+    xgetbv
+    and         eax, 6
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jne         short .no_avx2
+
+    or          edi, JSIMD_AVX2
+.no_avx2:
+
+    ; Check for 3DNow! instruction support
+    mov         eax, 0x80000000
+    cpuid
+    cmp         eax, 0x80000000
+    jbe         short .return
+
+    mov         eax, 0x80000001
+    cpuid
+    mov         eax, edx                ; eax = Extended feature flags
+
+    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
+    jz          short .no_3dnow
+    or          edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+    mov         eax, edi
+
+    pop         edi
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/jccolext-mmx.asm b/simd/jccolext-mmx.asm
deleted file mode 100644
index 96a0372..0000000
--- a/simd/jccolext-mmx.asm
+++ /dev/null
@@ -1,476 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                           JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          8
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_rgb_ycc_convert_mmx)
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edx
-        push    ebx
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        mov     ebx, JSAMPROW [ebx]     ; outptr1
-        mov     edx, JSAMPROW [edx]     ; outptr2
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        xor     eax,eax
-        mov     al, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        xor     edx,edx
-        mov     dx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    mmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    mmG, DWORD [esi+ecx]
-        psllq   mmA, DWORD_BIT
-        por     mmA,mmG
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        movq    mmG,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        mov     ecx, SIZEOF_MMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld16:
-        test    cl, 2*SIZEOF_MMWORD
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_ycc_cnv
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-        ; mmA=(00 10 20 01 11 21 02 12)
-        ; mmG=(22 03 13 23 04 14 24 05)
-        ; mmF=(15 25 06 16 26 07 17 27)
-
-        movq      mmD,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
-        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
-
-        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
-        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
-
-        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
-        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
-
-        movq      mmE,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
-        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
-
-        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
-
-        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
-        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
-
-        pxor      mmH,mmH
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
-
-        movq      mmB,mmE
-        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
-        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
-
-        movq      mmF,mmD
-        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
-        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_MMWORD/8
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_MMWORD/8
-        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_MMWORD/4
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_MMWORD/4
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-        test    cl, SIZEOF_MMWORD/2
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_ycc_cnv
-        movq    mmD,mmA
-        movq    mmC,mmF
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-        ; mmA=(00 10 20 30 01 11 21 31)
-        ; mmF=(02 12 22 32 03 13 23 33)
-        ; mmD=(04 14 24 34 05 15 25 35)
-        ; mmC=(06 16 26 36 07 17 27 37)
-
-        movq      mmB,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
-        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
-
-        movq      mmG,mmD
-        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
-        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
-
-        movq      mmE,mmA
-        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
-
-        movq      mmH,mmB
-        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
-        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
-
-        pxor      mmF,mmF
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
-
-        movq      mmD,mmB
-        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
-        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
-
-        movq      mmG,mmE
-        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
-        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
-
-        punpcklbw mmF,mmH
-        punpckhbw mmH,mmH
-        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
-        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movq      MMWORD [wk(0)], mm0   ; wk(0)=RE
-        movq      MMWORD [wk(1)], mm1   ; wk(1)=RO
-        movq      MMWORD [wk(2)], mm4   ; wk(2)=BE
-        movq      MMWORD [wk(3)], mm5   ; wk(3)=BO
-
-        movq      mm6,mm1
-        punpcklwd mm1,mm3
-        punpckhwd mm6,mm3
-        movq      mm7,mm1
-        movq      mm4,mm6
-        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movq      MMWORD [wk(4)], mm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movq      MMWORD [wk(5)], mm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      mm1,mm1
-        pxor      mm6,mm6
-        punpcklwd mm1,mm5               ; mm1=BOL
-        punpckhwd mm6,mm5               ; mm6=BOH
-        psrld     mm1,1                 ; mm1=BOL*FIX(0.500)
-        psrld     mm6,1                 ; mm6=BOH*FIX(0.500)
-
-        movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
-        paddd     mm7,mm1
-        paddd     mm4,mm6
-        paddd     mm7,mm5
-        paddd     mm4,mm5
-        psrld     mm7,SCALEBITS         ; mm7=CbOL
-        psrld     mm4,SCALEBITS         ; mm4=CbOH
-        packssdw  mm7,mm4               ; mm7=CbO
-
-        movq      mm1, MMWORD [wk(2)]   ; mm1=BE
-
-        movq      mm6,mm0
-        punpcklwd mm0,mm2
-        punpckhwd mm6,mm2
-        movq      mm5,mm0
-        movq      mm4,mm6
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movq      MMWORD [wk(6)], mm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movq      MMWORD [wk(7)], mm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      mm0,mm0
-        pxor      mm6,mm6
-        punpcklwd mm0,mm1               ; mm0=BEL
-        punpckhwd mm6,mm1               ; mm6=BEH
-        psrld     mm0,1                 ; mm0=BEL*FIX(0.500)
-        psrld     mm6,1                 ; mm6=BEH*FIX(0.500)
-
-        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-        paddd     mm5,mm0
-        paddd     mm4,mm6
-        paddd     mm5,mm1
-        paddd     mm4,mm1
-        psrld     mm5,SCALEBITS         ; mm5=CbEL
-        psrld     mm4,SCALEBITS         ; mm4=CbEH
-        packssdw  mm5,mm4               ; mm5=CbE
-
-        psllw     mm7,BYTE_BIT
-        por       mm5,mm7               ; mm5=Cb
-        movq      MMWORD [ebx], mm5     ; Save Cb
-
-        movq      mm0, MMWORD [wk(3)]   ; mm0=BO
-        movq      mm6, MMWORD [wk(2)]   ; mm6=BE
-        movq      mm1, MMWORD [wk(1)]   ; mm1=RO
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm3
-        punpckhwd mm4,mm3
-        movq      mm7,mm0
-        movq      mm5,mm4
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
-
-        paddd     mm0, MMWORD [wk(4)]
-        paddd     mm4, MMWORD [wk(5)]
-        paddd     mm0,mm3
-        paddd     mm4,mm3
-        psrld     mm0,SCALEBITS         ; mm0=YOL
-        psrld     mm4,SCALEBITS         ; mm4=YOH
-        packssdw  mm0,mm4               ; mm0=YO
-
-        pxor      mm3,mm3
-        pxor      mm4,mm4
-        punpcklwd mm3,mm1               ; mm3=ROL
-        punpckhwd mm4,mm1               ; mm4=ROH
-        psrld     mm3,1                 ; mm3=ROL*FIX(0.500)
-        psrld     mm4,1                 ; mm4=ROH*FIX(0.500)
-
-        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-        paddd     mm7,mm3
-        paddd     mm5,mm4
-        paddd     mm7,mm1
-        paddd     mm5,mm1
-        psrld     mm7,SCALEBITS         ; mm7=CrOL
-        psrld     mm5,SCALEBITS         ; mm5=CrOH
-        packssdw  mm7,mm5               ; mm7=CrO
-
-        movq      mm3, MMWORD [wk(0)]   ; mm3=RE
-
-        movq      mm4,mm6
-        punpcklwd mm6,mm2
-        punpckhwd mm4,mm2
-        movq      mm1,mm6
-        movq      mm5,mm4
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
-
-        paddd     mm6, MMWORD [wk(6)]
-        paddd     mm4, MMWORD [wk(7)]
-        paddd     mm6,mm2
-        paddd     mm4,mm2
-        psrld     mm6,SCALEBITS         ; mm6=YEL
-        psrld     mm4,SCALEBITS         ; mm4=YEH
-        packssdw  mm6,mm4               ; mm6=YE
-
-        psllw     mm0,BYTE_BIT
-        por       mm6,mm0               ; mm6=Y
-        movq      MMWORD [edi], mm6     ; Save Y
-
-        pxor      mm2,mm2
-        pxor      mm4,mm4
-        punpcklwd mm2,mm3               ; mm2=REL
-        punpckhwd mm4,mm3               ; mm4=REH
-        psrld     mm2,1                 ; mm2=REL*FIX(0.500)
-        psrld     mm4,1                 ; mm4=REH*FIX(0.500)
-
-        movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
-        paddd     mm1,mm2
-        paddd     mm5,mm4
-        paddd     mm1,mm0
-        paddd     mm5,mm0
-        psrld     mm1,SCALEBITS         ; mm1=CrEL
-        psrld     mm5,SCALEBITS         ; mm5=CrEH
-        packssdw  mm1,mm5               ; mm1=CrE
-
-        psllw     mm7,BYTE_BIT
-        por       mm1,mm7               ; mm1=Cr
-        movq      MMWORD [edx], mm1     ; Save Cr
-
-        sub     ecx, byte SIZEOF_MMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
-        add     edi, byte SIZEOF_MMWORD                 ; outptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; outptr1
-        add     edx, byte SIZEOF_MMWORD                 ; outptr2
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        pop     ebx
-        pop     edx
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jccolext-sse2-64.asm b/simd/jccolext-sse2-64.asm
deleted file mode 100644
index 8e4642d..0000000
--- a/simd/jccolext-sse2-64.asm
+++ /dev/null
@@ -1,486 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          8
-
-        align   16
-
-        global  EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov rsi, r12
-        mov ecx, r13d
-        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
-        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov rsi, r11
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rdx
-        push    rbx
-        push    rdi
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr0
-        mov     rbx, JSAMPROW [rbx]     ; outptr1
-        mov     rdx, JSAMPROW [rdx]     ; outptr2
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    rax
-        push    rdx
-        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_BYTE
-        movzx   rax, BYTE [rsi+rcx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_WORD
-        movzx   rdx, WORD [rsi+rcx]
-        shl     rax, WORD_BIT
-        or      rax,rdx
-.column_ld4:
-        movd    xmmA,eax
-        pop     rdx
-        pop     rax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     rcx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
-        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
-        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
-        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        movdqa    xmm7,xmm1
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      xmm1,xmm1
-        pxor      xmm6,xmm6
-        punpcklwd xmm1,xmm5             ; xmm1=BOL
-        punpckhwd xmm6,xmm5             ; xmm6=BOH
-        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
-
-        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm1
-        paddd     xmm4,xmm6
-        paddd     xmm7,xmm5
-        paddd     xmm4,xmm5
-        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
-        packssdw  xmm7,xmm4             ; xmm7=CbO
-
-        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      xmm0,xmm0
-        pxor      xmm6,xmm6
-        punpcklwd xmm0,xmm1             ; xmm0=BEL
-        punpckhwd xmm6,xmm1             ; xmm6=BEH
-        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
-
-        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm5,xmm0
-        paddd     xmm4,xmm6
-        paddd     xmm5,xmm1
-        paddd     xmm4,xmm1
-        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
-        packssdw  xmm5,xmm4             ; xmm5=CbE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm5,xmm7             ; xmm5=Cb
-        movdqa    XMMWORD [rbx], xmm5   ; Save Cb
-
-        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
-        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        movdqa    xmm7,xmm0
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, XMMWORD [wk(4)]
-        paddd     xmm4, XMMWORD [wk(5)]
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        pxor      xmm3,xmm3
-        pxor      xmm4,xmm4
-        punpcklwd xmm3,xmm1             ; xmm3=ROL
-        punpckhwd xmm4,xmm1             ; xmm4=ROH
-        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
-
-        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm3
-        paddd     xmm5,xmm4
-        paddd     xmm7,xmm1
-        paddd     xmm5,xmm1
-        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
-        packssdw  xmm7,xmm5             ; xmm7=CrO
-
-        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(6)]
-        paddd     xmm4, XMMWORD [wk(7)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [rdi], xmm6   ; Save Y
-
-        pxor      xmm2,xmm2
-        pxor      xmm4,xmm4
-        punpcklwd xmm2,xmm3             ; xmm2=REL
-        punpckhwd xmm4,xmm3             ; xmm4=REH
-        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
-
-        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm1,xmm2
-        paddd     xmm5,xmm4
-        paddd     xmm1,xmm0
-        paddd     xmm5,xmm0
-        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
-        packssdw  xmm1,xmm5             ; xmm1=CrE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm1,xmm7             ; xmm1=Cr
-        movdqa    XMMWORD [rdx], xmm1   ; Save Cr
-
-        sub     rcx, byte SIZEOF_XMMWORD
-        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
-        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
-        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .column_ld1
-
-        pop     rcx                     ; col
-        pop     rsi
-        pop     rdi
-        pop     rbx
-        pop     rdx
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     rdi, byte SIZEOF_JSAMPROW
-        add     rbx, byte SIZEOF_JSAMPROW
-        add     rdx, byte SIZEOF_JSAMPROW
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jccolext-sse2.asm b/simd/jccolext-sse2.asm
deleted file mode 100644
index cc38e98..0000000
--- a/simd/jccolext-sse2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          8
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-
-        global  EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edx
-        push    ebx
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        mov     ebx, JSAMPROW [ebx]     ; outptr1
-        mov     edx, JSAMPROW [edx]     ; outptr2
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        movzx   eax, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        movzx   edx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    xmmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     ecx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
-        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
-        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
-        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        movdqa    xmm7,xmm1
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      xmm1,xmm1
-        pxor      xmm6,xmm6
-        punpcklwd xmm1,xmm5             ; xmm1=BOL
-        punpckhwd xmm6,xmm5             ; xmm6=BOH
-        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
-
-        movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm1
-        paddd     xmm4,xmm6
-        paddd     xmm7,xmm5
-        paddd     xmm4,xmm5
-        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
-        packssdw  xmm7,xmm4             ; xmm7=CbO
-
-        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      xmm0,xmm0
-        pxor      xmm6,xmm6
-        punpcklwd xmm0,xmm1             ; xmm0=BEL
-        punpckhwd xmm6,xmm1             ; xmm6=BEH
-        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
-
-        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm5,xmm0
-        paddd     xmm4,xmm6
-        paddd     xmm5,xmm1
-        paddd     xmm4,xmm1
-        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
-        packssdw  xmm5,xmm4             ; xmm5=CbE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm5,xmm7             ; xmm5=Cb
-        movdqa    XMMWORD [ebx], xmm5   ; Save Cb
-
-        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
-        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        movdqa    xmm7,xmm0
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, XMMWORD [wk(4)]
-        paddd     xmm4, XMMWORD [wk(5)]
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        pxor      xmm3,xmm3
-        pxor      xmm4,xmm4
-        punpcklwd xmm3,xmm1             ; xmm3=ROL
-        punpckhwd xmm4,xmm1             ; xmm4=ROH
-        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
-
-        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm3
-        paddd     xmm5,xmm4
-        paddd     xmm7,xmm1
-        paddd     xmm5,xmm1
-        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
-        packssdw  xmm7,xmm5             ; xmm7=CrO
-
-        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(6)]
-        paddd     xmm4, XMMWORD [wk(7)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [edi], xmm6   ; Save Y
-
-        pxor      xmm2,xmm2
-        pxor      xmm4,xmm4
-        punpcklwd xmm2,xmm3             ; xmm2=REL
-        punpckhwd xmm4,xmm3             ; xmm4=REH
-        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
-
-        movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm1,xmm2
-        paddd     xmm5,xmm4
-        paddd     xmm1,xmm0
-        paddd     xmm5,xmm0
-        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
-        packssdw  xmm1,xmm5             ; xmm1=CrE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm1,xmm7             ; xmm1=Cr
-        movdqa    XMMWORD [edx], xmm1   ; Save Cr
-
-        sub     ecx, byte SIZEOF_XMMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     edi, byte SIZEOF_XMMWORD                ; outptr0
-        add     ebx, byte SIZEOF_XMMWORD                ; outptr1
-        add     edx, byte SIZEOF_XMMWORD                ; outptr2
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        pop     ebx
-        pop     edx
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jccolor-sse2-64.asm b/simd/jccolor-sse2-64.asm
deleted file mode 100644
index bd2188b..0000000
--- a/simd/jccolor-sse2-64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
diff --git a/simd/jcgray-sse2-64.asm b/simd/jcgray-sse2-64.asm
deleted file mode 100644
index bafd302..0000000
--- a/simd/jcgray-sse2-64.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
diff --git a/simd/jcgryext-mmx.asm b/simd/jcgryext-mmx.asm
deleted file mode 100644
index 1c1b8d8..0000000
--- a/simd/jcgryext-mmx.asm
+++ /dev/null
@@ -1,356 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_rgb_gray_convert_mmx)
-
-EXTN(jsimd_rgb_gray_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        xor     eax,eax
-        mov     al, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        xor     edx,edx
-        mov     dx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    mmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    mmG, DWORD [esi+ecx]
-        psllq   mmA, DWORD_BIT
-        por     mmA,mmG
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        movq    mmG,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        mov     ecx, SIZEOF_MMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld16:
-        test    cl, 2*SIZEOF_MMWORD
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_gray_cnv
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-        ; mmA=(00 10 20 01 11 21 02 12)
-        ; mmG=(22 03 13 23 04 14 24 05)
-        ; mmF=(15 25 06 16 26 07 17 27)
-
-        movq      mmD,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
-        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
-
-        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
-        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
-
-        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
-        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
-
-        movq      mmE,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
-        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
-
-        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
-
-        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
-        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
-
-        pxor      mmH,mmH
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
-
-        movq      mmB,mmE
-        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
-        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
-
-        movq      mmF,mmD
-        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
-        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_MMWORD/8
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_MMWORD/8
-        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_MMWORD/4
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_MMWORD/4
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-        test    cl, SIZEOF_MMWORD/2
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_gray_cnv
-        movq    mmD,mmA
-        movq    mmC,mmF
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-        ; mmA=(00 10 20 30 01 11 21 31)
-        ; mmF=(02 12 22 32 03 13 23 33)
-        ; mmD=(04 14 24 34 05 15 25 35)
-        ; mmC=(06 16 26 36 07 17 27 37)
-
-        movq      mmB,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
-        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
-
-        movq      mmG,mmD
-        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
-        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
-
-        movq      mmE,mmA
-        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
-
-        movq      mmH,mmB
-        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
-        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
-
-        pxor      mmF,mmF
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
-
-        movq      mmD,mmB
-        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
-        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
-
-        movq      mmG,mmE
-        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
-        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
-
-        punpcklbw mmF,mmH
-        punpckhbw mmH,mmH
-        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
-        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movq      mm6,mm1
-        punpcklwd mm1,mm3
-        punpckhwd mm6,mm3
-        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movq      mm7, mm6      ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movq      mm6,mm0
-        punpcklwd mm0,mm2
-        punpckhwd mm6,mm2
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movq      mm0, mm5      ; mm0=BO
-        movq      mm6, mm4      ; mm6=BE
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm3
-        punpckhwd mm4,mm3
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
-
-        paddd     mm0, mm1
-        paddd     mm4, mm7
-        paddd     mm0,mm3
-        paddd     mm4,mm3
-        psrld     mm0,SCALEBITS         ; mm0=YOL
-        psrld     mm4,SCALEBITS         ; mm4=YOH
-        packssdw  mm0,mm4               ; mm0=YO
-
-        movq      mm4,mm6
-        punpcklwd mm6,mm2
-        punpckhwd mm4,mm2
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
-
-        paddd     mm6, MMWORD [wk(0)]
-        paddd     mm4, MMWORD [wk(1)]
-        paddd     mm6,mm2
-        paddd     mm4,mm2
-        psrld     mm6,SCALEBITS         ; mm6=YEL
-        psrld     mm4,SCALEBITS         ; mm4=YEH
-        packssdw  mm6,mm4               ; mm6=YE
-
-        psllw     mm0,BYTE_BIT
-        por       mm6,mm0               ; mm6=Y
-        movq      MMWORD [edi], mm6     ; Save Y
-
-        sub     ecx, byte SIZEOF_MMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
-        add     edi, byte SIZEOF_MMWORD                 ; outptr0
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jcgryext-sse2-64.asm b/simd/jcgryext-sse2-64.asm
deleted file mode 100644
index 541355a..0000000
--- a/simd/jcgryext-sse2-64.asm
+++ /dev/null
@@ -1,365 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-
-        global  EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov rsi, r12
-        mov ecx, r13d
-        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov rsi, r11
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rdi
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr0
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    rax
-        push    rdx
-        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_BYTE
-        movzx   rax, BYTE [rsi+rcx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_WORD
-        movzx   rdx, WORD [rsi+rcx]
-        shl     rax, WORD_BIT
-        or      rax,rdx
-.column_ld4:
-        movd    xmmA,eax
-        pop     rdx
-        pop     rax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     rcx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    xmm0, xmm5    ; xmm0=BO
-        movdqa    xmm6, xmm4    ; xmm6=BE
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, xmm1
-        paddd     xmm4, xmm7
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(0)]
-        paddd     xmm4, XMMWORD [wk(1)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [rdi], xmm6   ; Save Y
-
-        sub     rcx, byte SIZEOF_XMMWORD
-        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .column_ld1
-
-        pop     rcx                     ; col
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     rdi, byte SIZEOF_JSAMPROW
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jcgryext-sse2.asm b/simd/jcgryext-sse2.asm
deleted file mode 100644
index cd16dd1..0000000
--- a/simd/jcgryext-sse2.asm
+++ /dev/null
@@ -1,384 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-
-        global  EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        movzx   eax, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        movzx   edx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    xmmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     ecx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    xmm0, xmm5    ; xmm0=BO
-        movdqa    xmm6, xmm4    ; xmm6=BE
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, xmm1
-        paddd     xmm4, xmm7
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(0)]
-        paddd     xmm4, XMMWORD [wk(1)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [edi], xmm6   ; Save Y
-
-        sub     ecx, byte SIZEOF_XMMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     edi, byte SIZEOF_XMMWORD                ; outptr0
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm
deleted file mode 100644
index 0deb3e6..0000000
--- a/simd/jchuff-sse2-64.asm
+++ /dev/null
@@ -1,360 +0,0 @@
-;
-; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-        sub put_bits, 8  ; put_bits -= 8;
-        mov rdx, put_buffer
-        mov ecx, put_bits
-        shr rdx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-        mov byte [buffer], dl  ; *buffer++ = c;
-        add buffer, 1
-        cmp dl, 0xFF  ; need to stuff a zero byte?
-        jne %%.EMIT_BYTE_END
-        mov byte [buffer], 0  ; *buffer++ = 0;
-        add buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
-        add put_bits, ecx  ; put_bits += size;
-        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
-        or  put_buffer, %1
-%endmacro
-
-%macro CHECKBUF31 0
-        cmp put_bits, 32  ; if (put_bits > 31) {
-        jl %%.CHECKBUF31_END
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
-        cmp put_bits, 48  ; if (put_bits > 47) {
-        jl %%.CHECKBUF47_END
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
-
-%macro EMIT_BITS 2
-        CHECKBUF47
-        mov ecx, %2
-        PUT_BITS %1
-%endmacro
-
-%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor xmm8, xmm8  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm9, xmm9  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm10, xmm10  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm11, xmm11  ; __m128i neg = _mm_setzero_si128();
-    pinsrw %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw %37, ebx, 7  ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw xmm8, %34  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm9, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm10, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm11, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw %34, xmm8   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %35, xmm9   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %36, xmm10  ; x1 = _mm_add_epi16(x1, neg);
-    paddw %37, xmm11  ; x1 = _mm_add_epi16(x1, neg);
-    pxor %34, xmm8    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %35, xmm9    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %36, xmm10   ; x1 = _mm_xor_si128(x1, neg);
-    pxor %37, xmm11   ; x1 = _mm_xor_si128(x1, neg);
-    pxor xmm8, %34    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm9, %35    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm10, %36   ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm11, %37   ; neg = _mm_xor_si128(neg, x1);
-    movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-;                                   JCOEFPTR block, int last_dc_val,
-;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13 = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1              rbp-(DCTSIZE2*SIZEOF_WORD)
-%define t2              t1-(DCTSIZE2*SIZEOF_WORD)
-%define put_buffer      r8
-%define put_bits        r9d
-%define buffer          rax
-
-        align   16
-        global  EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [t2]
-        collect_args
-%ifdef WIN64
-        sub     rsp, 4*SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
-        movaps  XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
-        movaps  XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
-        movaps  XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
-%endif
-        push rbx
-
-        mov buffer, r11  ; r11 is now sratch
-
-        mov put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-        mov put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
-        push r10  ; r10 is now scratch
-
-        ; Encode the DC coefficient difference per section F.1.2.1
-        movsx edi, word [r12]  ; temp = temp2 = block[0] - last_dc_val;
-        sub   edi, r13d  ; r13 is not used anymore
-        mov   ebx, edi
-
-        ; This is a well-known technique for obtaining the absolute value
-        ; without a branch.  It is derived from an assembly language technique
-        ; presented in "How to Optimize for the Pentium Processors",
-        ; Copyright (c) 1996, 1997 by Agner Fog.
-        mov esi, edi
-        sar esi, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-        xor edi, esi  ; temp ^= temp3;
-        sub edi, esi  ; temp -= temp3;
-
-        ; For a negative input, want temp2 = bitwise complement of abs(input)
-        ; This code assumes we are on a two's complement machine
-        add ebx, esi  ; temp2 += temp3;
-
-        ; Find the number of bits needed for the magnitude of the coefficient
-        lea   r11, [rel jpeg_nbits_table]
-        movzx rdi, byte [r11 + rdi]  ; nbits = JPEG_NBITS(temp);
-        ; Emit the Huffman-coded symbol for the number of bits
-        mov   r11d,  INT [r14 + rdi * 4]  ; code = dctbl->ehufco[nbits];
-        movzx  esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
-        EMIT_BITS r11, esi  ; EMIT_BITS(code, size)
-
-        ; Mask off any extra bits in code
-        mov esi, 1
-        mov ecx, edi
-        shl esi, cl
-        dec esi
-        and ebx, esi  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
-        ; Emit that number of bits of the value, if positive,
-        ; or the complement of its magnitude, if negative.
-        EMIT_BITS rbx, edi  ; EMIT_BITS(temp2, nbits)
-
-        ; Prepare data
-        xor ebx, ebx
-        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                       xmm0, xmm1, xmm2, xmm3
-        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                       xmm4, xmm5, xmm6, xmm7
-
-        pxor xmm8, xmm8
-        pcmpeqw xmm0, xmm8  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm8  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm8  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm8  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        pcmpeqw xmm4, xmm8  ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
-        pcmpeqw xmm5, xmm8  ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
-        pcmpeqw xmm6, xmm8  ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
-        pcmpeqw xmm7, xmm8  ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        packsswb xmm4, xmm5  ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
-        packsswb xmm6, xmm7  ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
-        pmovmskb r11d, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb r12d, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        pmovmskb r13d, xmm4  ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
-        pmovmskb r14d, xmm6  ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
-        shl r12, 16
-        shl r14, 16
-        or  r11, r12
-        or  r13, r14
-        shl r13, 32
-        or  r11, r13
-        not r11  ; index = ~index;
-
-        ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
-        ;jmp .EFN
-
-        mov   r13d,  INT [r15 + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        lea rsi, [t1]
-.BLOOP:
-        bsf r12, r11  ; r = __builtin_ctzl(index);
-        jz .ELOOP
-        mov rcx, r12
-        lea rsi, [rsi+r12*2]  ; k += r;
-        shr r11, cl  ; index >>= r;
-        movzx rdi, word [rsi]  ; temp = t1[k];
-        lea   rbx, [rel jpeg_nbits_table]
-        movzx rdi, byte [rbx + rdi]  ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
-        cmp r12, 16  ; while (r > 15) {
-        jl .ERLOOP
-        EMIT_BITS r13, r14d  ; EMIT_BITS(code_0xf0, size_0xf0)
-        sub r12, 16  ; r -= 16;
-        jmp .BRLOOP
-.ERLOOP:
-        ; Emit Huffman symbol for run length / number of bits
-        CHECKBUF31  ; uses rcx, rdx
-
-        shl r12, 4  ; temp3 = (r << 4) + nbits;
-        add r12, rdi
-        mov   ebx,  INT [r15 + r12 * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
-        PUT_BITS rbx
-
-        ;EMIT_CODE(code, size)
-
-        movsx ebx, word [rsi-DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov rcx, rdi
-        mov rdx, 1
-        shl rdx, cl
-        dec rdx
-        and rbx, rdx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        PUT_BITS rbx  ; PUT_BITS(temp2, nbits)
-
-        shr r11, 1  ; index >>= 1;
-        add rsi, 2  ; ++k;
-        jmp .BLOOP
-.ELOOP:
-        ; If the last coef(s) were zero, emit an end-of-block code
-        lea rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-        cmp rdi, rsi  ; if (r > 0) {
-        je .EFN
-        mov   ebx,  INT [r15]  ; code = actbl->ehufco[0];
-        movzx r12d, byte [r15 + 1024]  ; size = actbl->ehufsi[0];
-        EMIT_BITS rbx, r12d
-.EFN:
-        pop r10
-        ; Save put_buffer & put_bits
-        mov MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-        mov DWORD  [r10+24], put_bits  ; state->cur.put_bits = put_bits;
-
-        pop rbx
-%ifdef WIN64
-        movaps  xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
-        movaps  xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
-        movaps  xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
-        movaps  xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
-        add     rsp, 4*SIZEOF_XMMWORD
-%endif
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jchuff-sse2.asm b/simd/jchuff-sse2.asm
deleted file mode 100644
index b81db75..0000000
--- a/simd/jchuff-sse2.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-;
-; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-        sub put_bits, 8  ; put_bits -= 8;
-        mov edx, put_buffer
-        mov ecx, put_bits
-        shr edx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-        mov byte [eax], dl  ; *buffer++ = c;
-        add eax, 1
-        cmp dl, 0xFF  ; need to stuff a zero byte?
-        jne %%.EMIT_BYTE_END
-        mov byte [eax], 0  ; *buffer++ = 0;
-        add eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
-        add put_bits, ecx  ; put_bits += size;
-        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
-        or  put_buffer, %1
-%endmacro
-
-%macro CHECKBUF15 0
-        cmp put_bits, 16  ; if (put_bits > 31) {
-        jl %%.CHECKBUF15_END
-        mov eax, POINTER [esp+buffer]
-        EMIT_BYTE
-        EMIT_BYTE
-        mov POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
-        PUT_BITS %1
-        CHECKBUF15
-%endmacro
-
-%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor xmm4, xmm4  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm5, xmm5  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm6, xmm6  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm7, xmm7  ; __m128i neg = _mm_setzero_si128();
-    pinsrw %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw %37, ecx, 7  ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw xmm4, %34  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm5, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm6, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm7, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw %34, xmm4   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %35, xmm5   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %36, xmm6  ; x1 = _mm_add_epi16(x1, neg);
-    paddw %37, xmm7  ; x1 = _mm_add_epi16(x1, neg);
-    pxor %34, xmm4    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %35, xmm5    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %36, xmm6   ; x1 = _mm_xor_si128(x1, neg);
-    pxor %37, xmm7   ; x1 = _mm_xor_si128(x1, neg);
-    pxor xmm4, %34    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm5, %35    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm6, %36   ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm7, %37   ; neg = _mm_xor_si128(neg, x1);
-    movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-;                                   JCOEFPTR block, int last_dc_val,
-;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad             6*SIZEOF_DWORD  ; Align to 16 bytes
-%define t1              pad
-%define t2              t1+(DCTSIZE2*SIZEOF_WORD)
-%define block           t2+(DCTSIZE2*SIZEOF_WORD)
-%define actbl           block+SIZEOF_DWORD
-%define buffer          actbl+SIZEOF_DWORD
-%define temp            buffer+SIZEOF_DWORD
-%define temp2           temp+SIZEOF_DWORD
-%define temp3           temp2+SIZEOF_DWORD
-%define temp4           temp3+SIZEOF_DWORD
-%define temp5           temp4+SIZEOF_DWORD
-%define gotptr          temp5+SIZEOF_DWORD  ; void *gotptr
-%define put_buffer      ebx
-%define put_bits        edi
-
-        align   16
-        global  EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        sub     esp, temp5+9*SIZEOF_DWORD-pad
-        push    ebx
-        push    ecx
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-        push    ebp
-
-        mov esi, POINTER [eax+8]        ; (working_state *state)
-        mov put_buffer,  DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
-        mov put_bits,    DWORD [esi+12]  ; put_bits = state->cur.put_bits;
-        push esi  ; esi is now scratch
-
-        get_GOT edx                       ; get GOT address
-        movpic POINTER [esp+gotptr], edx  ; save GOT address
-
-        mov ecx, POINTER [eax+28]
-        mov edx, POINTER [eax+16]
-        mov esi, POINTER [eax+12]
-        mov POINTER [esp+actbl],  ecx
-        mov POINTER [esp+block],  edx
-        mov POINTER [esp+buffer], esi
-
-        ; Encode the DC coefficient difference per section F.1.2.1
-        mov esi, POINTER [esp+block]        ; block
-        movsx ecx, word [esi]  ; temp = temp2 = block[0] - last_dc_val;
-        sub   ecx, DWORD [eax+20]
-        mov   esi, ecx
-
-        ; This is a well-known technique for obtaining the absolute value
-        ; without a branch.  It is derived from an assembly language technique
-        ; presented in "How to Optimize for the Pentium Processors",
-        ; Copyright (c) 1996, 1997 by Agner Fog.
-        mov edx, ecx
-        sar edx, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-        xor ecx, edx ; temp ^= temp3;
-        sub ecx, edx ; temp -= temp3;
-
-        ; For a negative input, want temp2 = bitwise complement of abs(input)
-        ; This code assumes we are on a two's complement machine
-        add esi, edx  ; temp2 += temp3;
-        mov DWORD [esp+temp], esi  ; backup temp2 in temp
-
-        ; Find the number of bits needed for the magnitude of the coefficient
-        movpic ebp, POINTER [esp+gotptr]   ; load GOT address (ebp)
-        movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-        mov DWORD [esp+temp2], edx  ; backup nbits in temp2
-
-        ; Emit the Huffman-coded symbol for the number of bits
-        mov    ebp, POINTER [eax+24]  ; After this point, arguments are not accessible anymore
-        mov    eax,  INT [ebp + edx * 4]  ; code = dctbl->ehufco[nbits];
-        movzx  ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
-        EMIT_BITS eax  ; EMIT_BITS(code, size)
-
-        mov ecx, DWORD [esp+temp2]  ; restore nbits
-
-        ; Mask off any extra bits in code
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, DWORD [esp+temp]  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
-        ; Emit that number of bits of the value, if positive,
-        ; or the complement of its magnitude, if negative.
-        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
-
-        ; Prepare data
-        xor ecx, ecx
-        mov esi, POINTER [esp+block]
-        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                       xmm0, xmm1, xmm2, xmm3
-        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                       xmm0, xmm1, xmm2, xmm3
-
-        pxor xmm7, xmm7
-        movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-        movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-        movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-        movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        shl ecx, 16
-        or  edx, ecx
-        not edx  ; index = ~index;
-
-        lea esi, [esp+t1]
-        mov ebp, POINTER [esp+actbl]  ; ebp = actbl
-
-.BLOOP:
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz near .ELOOP
-        lea esi, [esi+ecx*2]  ; k += r;
-        shr edx, cl  ; index >>= r;
-        mov DWORD [esp+temp3], edx
-.BRLOOP:
-        cmp ecx, 16  ; while (r > 15) {
-        jl near .ERLOOP
-        sub ecx, 16 ; r -= 16;
-        mov DWORD [esp+temp], ecx
-        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
-        mov ecx, DWORD [esp+temp]
-        jmp .BRLOOP
-.ERLOOP:
-        movsx eax, word [esi]  ; temp = t1[k];
-        movpic edx, POINTER [esp+gotptr]   ; load GOT address (edx)
-        movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-        mov DWORD [esp+temp2], eax
-        ; Emit Huffman symbol for run length / number of bits
-        shl ecx, 4  ; temp3 = (r << 4) + nbits;
-        add ecx, eax
-        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-        EMIT_BITS eax
-
-        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov ecx, DWORD [esp+temp2]
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
-        mov edx, DWORD [esp+temp3]
-        add esi, 2  ; ++k;
-        shr edx, 1  ; index >>= 1;
-
-        jmp .BLOOP
-.ELOOP:
-        movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-        movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-        movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-        movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        shl ecx, 16
-        or  edx, ecx
-        not edx  ; index = ~index;
-
-        lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
-        sub eax, esi
-        shr eax, 1
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz near .ELOOP2
-        shr edx, cl  ; index >>= r;
-        add ecx, eax
-        lea esi, [esi+ecx*2]  ; k += r;
-        mov DWORD [esp+temp3], edx
-        jmp .BRLOOP2
-.BLOOP2:
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz near .ELOOP2
-        lea esi, [esi+ecx*2]  ; k += r;
-        shr edx, cl  ; index >>= r;
-        mov DWORD [esp+temp3], edx
-.BRLOOP2:
-        cmp ecx, 16  ; while (r > 15) {
-        jl near .ERLOOP2
-        sub ecx, 16  ; r -= 16;
-        mov DWORD [esp+temp], ecx
-        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
-        mov ecx, DWORD [esp+temp]
-        jmp .BRLOOP2
-.ERLOOP2:
-        movsx eax, word [esi]  ; temp = t1[k];
-        bsr eax, eax  ; nbits = 32 - __builtin_clz(temp);
-        inc eax
-        mov DWORD [esp+temp2], eax
-        ; Emit Huffman symbol for run length / number of bits
-        shl ecx, 4  ; temp3 = (r << 4) + nbits;
-        add ecx, eax
-        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-        EMIT_BITS eax
-
-        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov ecx, DWORD [esp+temp2]
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
-        mov edx, DWORD [esp+temp3]
-        add esi, 2  ; ++k;
-        shr edx, 1  ; index >>= 1;
-
-        jmp .BLOOP2
-.ELOOP2:
-        ; If the last coef(s) were zero, emit an end-of-block code
-        lea edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-        cmp edx, esi  ; if (r > 0) {
-        je .EFN
-        mov   eax,  INT [ebp]  ; code = actbl->ehufco[0];
-        movzx ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
-        EMIT_BITS eax
-.EFN:
-        mov eax, [esp+buffer]
-        pop esi
-        ; Save put_buffer & put_bits
-        mov DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-        mov DWORD [esi+12], put_bits  ; state->cur.put_bits = put_bits;
-
-        pop     ebp
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-        pop     ecx
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jcsample-mmx.asm b/simd/jcsample-mmx.asm
deleted file mode 100644
index 6cd544e..0000000
--- a/simd/jcsample-mmx.asm
+++ /dev/null
@@ -1,323 +0,0 @@
-;
-; jcsample.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_mmx)
-
-EXTN(jsimd_h2v1_downsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov       edx, 0x00010000       ; bias pattern
-        movd      mm7,edx
-        pcmpeqw   mm6,mm6
-        punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
-        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mm2,mm0
-        movq    mm3,mm1
-
-        pand    mm0,mm6
-        psrlw   mm2,BYTE_BIT
-        pand    mm1,mm6
-        psrlw   mm3,BYTE_BIT
-
-        paddw   mm0,mm2
-        paddw   mm1,mm3
-        paddw   mm0,mm7
-        paddw   mm1,mm7
-        psrlw   mm0,1
-        psrlw   mm1,1
-
-        packuswb mm0,mm1
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
-        sub     ecx, byte SIZEOF_MMWORD         ; outcol
-        jnz     short .columnloop
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     eax                             ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_mmx)
-
-EXTN(jsimd_h2v2_downsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov       edx, 0x00020001       ; bias pattern
-        movd      mm7,edx
-        pcmpeqw   mm6,mm6
-        punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
-        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     edi, JSAMPROW [edi]                     ; outptr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
-        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
-        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pand    mm0,mm6
-        psrlw   mm4,BYTE_BIT
-        pand    mm1,mm6
-        psrlw   mm5,BYTE_BIT
-        paddw   mm0,mm4
-        paddw   mm1,mm5
-
-        movq    mm4,mm2
-        movq    mm5,mm3
-        pand    mm2,mm6
-        psrlw   mm4,BYTE_BIT
-        pand    mm3,mm6
-        psrlw   mm5,BYTE_BIT
-        paddw   mm2,mm4
-        paddw   mm3,mm5
-
-        paddw   mm0,mm1
-        paddw   mm2,mm3
-        paddw   mm0,mm7
-        paddw   mm2,mm7
-        psrlw   mm0,2
-        psrlw   mm2,2
-
-        packuswb mm0,mm2
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
-        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
-        sub     ecx, byte SIZEOF_MMWORD         ; outcol
-        jnz     near .columnloop
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jcsample-sse2-64.asm b/simd/jcsample-sse2-64.asm
deleted file mode 100644
index 40ee15f..0000000
--- a/simd/jcsample-sse2-64.asm
+++ /dev/null
@@ -1,329 +0,0 @@
-;
-; jcsample.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov ecx, r13d
-        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
-        jz      near .return
-
-        mov edx, r10d
-
-        ; -- expand_right_edge
-
-        push    rcx
-        shl     rcx,1                           ; output_cols * 2
-        sub     rcx,rdx
-        jle     short .expand_end
-
-        mov     rax, r11
-        test    rax,rax
-        jle     short .expand_end
-
-        cld
-        mov     rsi, r14        ; input_data
-.expandloop:
-        push    rax
-        push    rcx
-
-        mov     rdi, JSAMPROW [rsi]
-        add     rdi,rdx
-        mov     al, JSAMPLE [rdi-1]
-
-        rep stosb
-
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        dec     rax
-        jg      short .expandloop
-
-.expand_end:
-        pop     rcx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, r12d        ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     rdx, 0x00010000         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     rsi, r14        ; input_data
-        mov     rdi, r15        ; output_data
-.rowloop:
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]             ; inptr
-        mov rdi, JSAMPROW [rdi]         ; outptr
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        pxor    xmm1,xmm1
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .downsample
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm2,xmm0
-        movdqa  xmm3,xmm1
-
-        pand    xmm0,xmm6
-        psrlw   xmm2,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm3,BYTE_BIT
-
-        paddw   xmm0,xmm2
-        paddw   xmm1,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-        psrlw   xmm0,1
-        psrlw   xmm1,1
-
-        packuswb xmm0,xmm1
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        test    rcx,rcx
-        jnz     short .columnloop_r8
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     ecx, r13d
-        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
-        jz      near .return
-
-        mov     edx, r10d
-
-        ; -- expand_right_edge
-
-        push    rcx
-        shl     rcx,1                           ; output_cols * 2
-        sub     rcx,rdx
-        jle     short .expand_end
-
-        mov     rax, r11
-        test    rax,rax
-        jle     short .expand_end
-
-        cld
-        mov     rsi, r14        ; input_data
-.expandloop:
-        push    rax
-        push    rcx
-
-        mov     rdi, JSAMPROW [rsi]
-        add     rdi,rdx
-        mov     al, JSAMPLE [rdi-1]
-
-        rep stosb
-
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        dec     rax
-        jg      short .expandloop
-
-.expand_end:
-        pop     rcx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, r12d        ; rowctr
-        test    rax,rax
-        jle     near .return
-
-        mov     rdx, 0x00020001         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     rsi, r14        ; input_data
-        mov     rdi, r15        ; output_data
-.rowloop:
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     rdi, JSAMPROW [rdi]                     ; outptr
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        pxor    xmm2,xmm2
-        pxor    xmm3,xmm3
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .downsample
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        pand    xmm0,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm0,xmm4
-        paddw   xmm1,xmm5
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm5,xmm3
-        pand    xmm2,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm3,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm2,xmm4
-        paddw   xmm3,xmm5
-
-        paddw   xmm0,xmm1
-        paddw   xmm2,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm7
-        psrlw   xmm0,2
-        psrlw   xmm2,2
-
-        packuswb xmm0,xmm2
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
-        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
-        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .columnloop_r8
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-
-        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     rax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jcsample-sse2.asm b/simd/jcsample-sse2.asm
deleted file mode 100644
index 83c9d15..0000000
--- a/simd/jcsample-sse2.asm
+++ /dev/null
@@ -1,350 +0,0 @@
-;
-; jcsample.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     edx, 0x00010000         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        pxor    xmm1,xmm1
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .downsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm2,xmm0
-        movdqa  xmm3,xmm1
-
-        pand    xmm0,xmm6
-        psrlw   xmm2,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm3,BYTE_BIT
-
-        paddw   xmm0,xmm2
-        paddw   xmm1,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-        psrlw   xmm0,1
-        psrlw   xmm1,1
-
-        packuswb xmm0,xmm1
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        test    ecx,ecx
-        jnz     short .columnloop_r8
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     edx, 0x00020001         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     edi, JSAMPROW [edi]                     ; outptr
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        pxor    xmm2,xmm2
-        pxor    xmm3,xmm3
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .downsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        pand    xmm0,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm0,xmm4
-        paddw   xmm1,xmm5
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm5,xmm3
-        pand    xmm2,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm3,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm2,xmm4
-        paddw   xmm3,xmm5
-
-        paddw   xmm0,xmm1
-        paddw   xmm2,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm7
-        psrlw   xmm0,2
-        psrlw   xmm2,2
-
-        packuswb xmm0,xmm2
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
-        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
-        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .columnloop_r8
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdcolext-mmx.asm b/simd/jdcolext-mmx.asm
deleted file mode 100644
index 21e34f6..0000000
--- a/simd/jdcolext-mmx.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                            JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)    (b)+8           ; JDIMENSION out_width
-%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
-%define input_row(b)    (b)+16          ; JDIMENSION input_row
-%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_mmx)
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [input_row(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        push    eax
-        push    edi
-        push    edx
-        push    ebx
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr0
-        mov     ebx, JSAMPROW [ebx]     ; inptr1
-        mov     edx, JSAMPROW [edx]     ; inptr2
-        mov     edi, JSAMPROW [edi]     ; outptr
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-        alignx  16,7
-.columnloop:
-
-        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
-        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)
-
-        pcmpeqw mm4,mm4
-        pcmpeqw mm7,mm7
-        psrlw   mm4,BYTE_BIT
-        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
-        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
-        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
-        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO
-
-        paddw   mm4,mm7
-        paddw   mm5,mm7
-        paddw   mm0,mm7
-        paddw   mm1,mm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movq    mm2,mm4                 ; mm2=CbE
-        movq    mm3,mm5                 ; mm3=CbO
-        paddw   mm4,mm4                 ; mm4=2*CbE
-        paddw   mm5,mm5                 ; mm5=2*CbO
-        movq    mm6,mm0                 ; mm6=CrE
-        movq    mm7,mm1                 ; mm7=CrO
-        paddw   mm0,mm0                 ; mm0=2*CrE
-        paddw   mm1,mm1                 ; mm1=2*CrO
-
-        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
-        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
-        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
-        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))
-
-        paddw   mm4,[GOTOFF(eax,PW_ONE)]
-        paddw   mm5,[GOTOFF(eax,PW_ONE)]
-        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
-        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
-        paddw   mm0,[GOTOFF(eax,PW_ONE)]
-        paddw   mm1,[GOTOFF(eax,PW_ONE)]
-        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
-        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))
-
-        paddw   mm4,mm2
-        paddw   mm5,mm3
-        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
-        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
-
-        movq      mm4,mm2
-        movq      mm5,mm3
-        punpcklwd mm2,mm6
-        punpckhwd mm4,mm6
-        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd mm3,mm7
-        punpckhwd mm5,mm7
-        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm2,SCALEBITS
-        psrad     mm4,SCALEBITS
-        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm3,SCALEBITS
-        psrad     mm5,SCALEBITS
-
-        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)
-
-        pcmpeqw   mm4,mm4
-        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      mm4,mm5               ; mm4=Y(0246)=YE
-        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO
-
-        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
-        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
-        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
-        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
-        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
-        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
-        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
-        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
-        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
-        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
-        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
-
-        movq      mmG,mmA
-        movq      mmH,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
-        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
-
-        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
-        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
-
-        movq      mmC,mmD
-        movq      mmB,mmD
-        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
-        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
-
-        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
-
-        movq      mmF,mmE
-        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
-        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
-
-        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
-        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
-        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_MMWORD
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    mmA,mmC
-        sub     ecx, byte 2*SIZEOF_MMWORD
-        add     edi, byte 2*SIZEOF_MMWORD
-        jmp     short .column_st4
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmE
-        sub     ecx, byte SIZEOF_MMWORD
-        add     edi, byte SIZEOF_MMWORD
-.column_st4:
-        movd    eax,mmA
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st2
-        mov     DWORD [edi+0*SIZEOF_DWORD], eax
-        psrlq   mmA,DWORD_BIT
-        movd    eax,mmA
-        sub     ecx, byte SIZEOF_DWORD
-        add     edi, byte SIZEOF_DWORD
-.column_st2:
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi+0*SIZEOF_WORD], ax
-        shr     eax,WORD_BIT
-        sub     ecx, byte SIZEOF_WORD
-        add     edi, byte SIZEOF_WORD
-.column_st1:
-        cmp     ecx, byte SIZEOF_BYTE
-        jb      short .nextrow
-        mov     BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
-        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
-        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
-
-        movq      mmC,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
-        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
-        movq      mmG,mmB
-        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
-        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
-
-        movq      mmD,mmA
-        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
-        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
-        movq      mmH,mmC
-        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
-        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        cmp     ecx, byte SIZEOF_MMWORD/2
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    mmA,mmC
-        movq    mmD,mmH
-        sub     ecx, byte SIZEOF_MMWORD/2
-        add     edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD/4
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmD
-        sub     ecx, byte SIZEOF_MMWORD/4
-        add     edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-        cmp     ecx, byte SIZEOF_MMWORD/8
-        jb      short .nextrow
-        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        alignx  16,7
-
-.nextrow:
-        pop     ecx
-        pop     esi
-        pop     ebx
-        pop     edx
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdcolext-sse2-64.asm b/simd/jdcolext-sse2-64.asm
deleted file mode 100644
index 4634066..0000000
--- a/simd/jdcolext-sse2-64.asm
+++ /dev/null
@@ -1,440 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d        ; num_cols
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov     rdi, r13
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rax
-        push    rdi
-        push    rdx
-        push    rbx
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr0
-        mov     rbx, JSAMPROW [rbx]     ; inptr1
-        mov     rdx, JSAMPROW [rdx]     ; inptr2
-        mov     rdi, JSAMPROW [rdi]     ; outptr
-.columnloop:
-
-        movdqa  xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
-        movdqa  xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
-
-        pcmpeqw xmm4,xmm4
-        pcmpeqw xmm7,xmm7
-        psrlw   xmm4,BYTE_BIT
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
-        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
-        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
-        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
-
-        paddw   xmm4,xmm7
-        paddw   xmm5,xmm7
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm2,xmm4               ; xmm2=CbE
-        movdqa  xmm3,xmm5               ; xmm3=CbO
-        paddw   xmm4,xmm4               ; xmm4=2*CbE
-        paddw   xmm5,xmm5               ; xmm5=2*CbO
-        movdqa  xmm6,xmm0               ; xmm6=CrE
-        movdqa  xmm7,xmm1               ; xmm7=CrO
-        paddw   xmm0,xmm0               ; xmm0=2*CrE
-        paddw   xmm1,xmm1               ; xmm1=2*CrO
-
-        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbE * -FIX(0.22800))
-        pmulhw  xmm5,[rel PW_MF0228]    ; xmm5=(2*CbO * -FIX(0.22800))
-        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrE * FIX(0.40200))
-        pmulhw  xmm1,[rel PW_F0402]     ; xmm1=(2*CrO * FIX(0.40200))
-
-        paddw   xmm4,[rel PW_ONE]
-        paddw   xmm5,[rel PW_ONE]
-        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
-        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
-        paddw   xmm0,[rel PW_ONE]
-        paddw   xmm1,[rel PW_ONE]
-        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
-        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
-
-        paddw   xmm4,xmm2
-        paddw   xmm5,xmm3
-        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm5,xmm3
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm4,xmm6
-        pmaddwd   xmm2,[rel PW_MF0344_F0285]
-        pmaddwd   xmm4,[rel PW_MF0344_F0285]
-        punpcklwd xmm3,xmm7
-        punpckhwd xmm5,xmm7
-        pmaddwd   xmm3,[rel PW_MF0344_F0285]
-        pmaddwd   xmm5,[rel PW_MF0344_F0285]
-
-        paddd     xmm2,[rel PD_ONEHALF]
-        paddd     xmm4,[rel PD_ONEHALF]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm4,SCALEBITS
-        paddd     xmm3,[rel PD_ONEHALF]
-        paddd     xmm5,[rel PD_ONEHALF]
-        psrad     xmm3,SCALEBITS
-        psrad     xmm5,SCALEBITS
-
-        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movdqa    xmm5, XMMWORD [rsi]   ; xmm5=Y(0123456789ABCDEF)
-
-        pcmpeqw   xmm4,xmm4
-        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
-        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
-
-        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     rcx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     rcx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_MMWORD
-        sub     rcx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_DWORD
-        sub     rcx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of rax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     rcx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [rdi], ax
-        add     rdi, byte SIZEOF_WORD
-        sub     rcx, byte SIZEOF_WORD
-        shr     rax, 16
-.column_st1:
-        ; Store the lower 1 byte of rax to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .nextrow
-        mov     BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        cmp     rcx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_XMMWORD/8*4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .nextrow
-        movd    XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
-        pop     rcx
-        pop     rsi
-        pop     rbx
-        pop     rdx
-        pop     rdi
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        add     rbx, byte SIZEOF_JSAMPROW
-        add     rdx, byte SIZEOF_JSAMPROW
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-        sfence          ; flush the write buffer
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdcolext-sse2.asm b/simd/jdcolext-sse2.asm
deleted file mode 100644
index 682aef3..0000000
--- a/simd/jdcolext-sse2.asm
+++ /dev/null
@@ -1,459 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)    (b)+8           ; JDIMENSION out_width
-%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
-%define input_row(b)    (b)+16          ; JDIMENSION input_row
-%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [input_row(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        push    eax
-        push    edi
-        push    edx
-        push    ebx
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr0
-        mov     ebx, JSAMPROW [ebx]     ; inptr1
-        mov     edx, JSAMPROW [edx]     ; inptr2
-        mov     edi, JSAMPROW [edi]     ; outptr
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
-        movdqa  xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
-
-        pcmpeqw xmm4,xmm4
-        pcmpeqw xmm7,xmm7
-        psrlw   xmm4,BYTE_BIT
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
-        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
-        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
-        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
-
-        paddw   xmm4,xmm7
-        paddw   xmm5,xmm7
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm2,xmm4               ; xmm2=CbE
-        movdqa  xmm3,xmm5               ; xmm3=CbO
-        paddw   xmm4,xmm4               ; xmm4=2*CbE
-        paddw   xmm5,xmm5               ; xmm5=2*CbO
-        movdqa  xmm6,xmm0               ; xmm6=CrE
-        movdqa  xmm7,xmm1               ; xmm7=CrO
-        paddw   xmm0,xmm0               ; xmm0=2*CrE
-        paddw   xmm1,xmm1               ; xmm1=2*CrO
-
-        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbE * -FIX(0.22800))
-        pmulhw  xmm5,[GOTOFF(eax,PW_MF0228)]    ; xmm5=(2*CbO * -FIX(0.22800))
-        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrE * FIX(0.40200))
-        pmulhw  xmm1,[GOTOFF(eax,PW_F0402)]     ; xmm1=(2*CrO * FIX(0.40200))
-
-        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm5,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
-        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
-        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm1,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
-        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
-
-        paddw   xmm4,xmm2
-        paddw   xmm5,xmm3
-        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm5,xmm3
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm4,xmm6
-        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd xmm3,xmm7
-        punpckhwd xmm5,xmm7
-        pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm4,SCALEBITS
-        paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm3,SCALEBITS
-        psrad     xmm5,SCALEBITS
-
-        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movdqa    xmm5, XMMWORD [esi]   ; xmm5=Y(0123456789ABCDEF)
-
-        pcmpeqw   xmm4,xmm4
-        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
-        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
-
-        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     ecx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_MMWORD
-        sub     ecx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [edi], xmmA
-        add     edi, byte SIZEOF_DWORD
-        sub     ecx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi], ax
-        add     edi, byte SIZEOF_WORD
-        sub     ecx, byte SIZEOF_WORD
-        shr     eax, 16
-.column_st1:
-        ; Store the lower 1 byte of eax to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .nextrow
-        mov     BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        cmp     ecx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_XMMWORD/8*4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .nextrow
-        movd    XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        alignx  16,7
-
-.nextrow:
-        pop     ecx
-        pop     esi
-        pop     ebx
-        pop     edx
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        sfence          ; flush the write buffer
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdcolor-sse2-64.asm b/simd/jdcolor-sse2-64.asm
deleted file mode 100644
index d2bf210..0000000
--- a/simd/jdcolor-sse2-64.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
diff --git a/simd/jdmerge-sse2-64.asm b/simd/jdmerge-sse2-64.asm
deleted file mode 100644
index 244bd40..0000000
--- a/simd/jdmerge-sse2-64.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
diff --git a/simd/jdmrgext-mmx.asm b/simd/jdmrgext-mmx.asm
deleted file mode 100644
index 63f45cf..0000000
--- a/simd/jdmrgext-mmx.asm
+++ /dev/null
@@ -1,463 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          3
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [output_width(eax)]     ; col
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     edi, JSAMPROW [edi]                             ; outptr
-
-        pop     ecx                     ; col
-
-        alignx  16,7
-.columnloop:
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
-        movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)
-
-        pxor      mm1,mm1               ; mm1=(all 0's)
-        pcmpeqw   mm3,mm3
-        psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
-        movq      mm4,mm6
-        punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
-        punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
-        movq      mm0,mm7
-        punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
-        punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL
-
-        paddw     mm6,mm3
-        paddw     mm4,mm3
-        paddw     mm7,mm3
-        paddw     mm0,mm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movq    mm5,mm6                 ; mm5=CbH
-        movq    mm2,mm4                 ; mm2=CbL
-        paddw   mm6,mm6                 ; mm6=2*CbH
-        paddw   mm4,mm4                 ; mm4=2*CbL
-        movq    mm1,mm7                 ; mm1=CrH
-        movq    mm3,mm0                 ; mm3=CrL
-        paddw   mm7,mm7                 ; mm7=2*CrH
-        paddw   mm0,mm0                 ; mm0=2*CrL
-
-        pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
-        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
-        pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
-        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))
-
-        paddw   mm6,[GOTOFF(eax,PW_ONE)]
-        paddw   mm4,[GOTOFF(eax,PW_ONE)]
-        psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
-        psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
-        paddw   mm7,[GOTOFF(eax,PW_ONE)]
-        paddw   mm0,[GOTOFF(eax,PW_ONE)]
-        psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
-        psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))
-
-        paddw   mm6,mm5
-        paddw   mm4,mm2
-        paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
-        movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
-
-        movq      mm6,mm5
-        movq      mm7,mm2
-        punpcklwd mm5,mm1
-        punpckhwd mm6,mm1
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd mm2,mm3
-        punpckhwd mm7,mm3
-        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm5,SCALEBITS
-        psrad     mm6,SCALEBITS
-        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm2,SCALEBITS
-        psrad     mm7,SCALEBITS
-
-        packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-        alignx  16,7
-
-.Yloop_2nd:
-        movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
-        movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
-        movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
-        alignx  16,7
-
-.Yloop_1st:
-        movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)
-
-        pcmpeqw mm6,mm6
-        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    mm6,mm7                 ; mm6=Y(0246)=YE
-        psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO
-
-        movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
-        movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
-        movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)
-
-        paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
-        paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
-        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
-        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-        paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
-        paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
-        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
-        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-        paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
-        paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
-        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
-        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
-        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
-
-        movq      mmG,mmA
-        movq      mmH,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
-        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
-
-        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
-        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
-
-        movq      mmC,mmD
-        movq      mmB,mmD
-        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
-        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
-
-        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
-
-        movq      mmF,mmE
-        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
-        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
-
-        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
-        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
-        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      near .endcolumn
-
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_MMWORD
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    mmA,mmC
-        sub     ecx, byte 2*SIZEOF_MMWORD
-        add     edi, byte 2*SIZEOF_MMWORD
-        jmp     short .column_st4
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmE
-        sub     ecx, byte SIZEOF_MMWORD
-        add     edi, byte SIZEOF_MMWORD
-.column_st4:
-        movd    eax,mmA
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st2
-        mov     DWORD [edi+0*SIZEOF_DWORD], eax
-        psrlq   mmA,DWORD_BIT
-        movd    eax,mmA
-        sub     ecx, byte SIZEOF_DWORD
-        add     edi, byte SIZEOF_DWORD
-.column_st2:
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi+0*SIZEOF_WORD], ax
-        shr     eax,WORD_BIT
-        sub     ecx, byte SIZEOF_WORD
-        add     edi, byte SIZEOF_WORD
-.column_st1:
-        cmp     ecx, byte SIZEOF_BYTE
-        jb      short .endcolumn
-        mov     BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
-        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
-        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
-
-        movq      mmC,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
-        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
-        movq      mmG,mmB
-        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
-        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
-
-        movq      mmD,mmA
-        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
-        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
-        movq      mmH,mmC
-        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
-        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .endcolumn
-
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        cmp     ecx, byte SIZEOF_MMWORD/2
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    mmA,mmC
-        movq    mmD,mmH
-        sub     ecx, byte SIZEOF_MMWORD/2
-        add     edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD/4
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmD
-        sub     ecx, byte SIZEOF_MMWORD/4
-        add     edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-        cmp     ecx, byte SIZEOF_MMWORD/8
-        jb      short .endcolumn
-        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_mmx)
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     eax, JDIMENSION [output_width(ebp)]
-
-        mov     edi, JSAMPIMAGE [input_buf(ebp)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(ebp)]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-        push    edx                     ; inptr2
-        push    ebx                     ; inptr1
-        push    esi                     ; inptr00
-        mov     ebx,esp
-
-        push    edi                     ; output_buf (outptr0)
-        push    ecx                     ; in_row_group_ctr
-        push    ebx                     ; input_buf
-        push    eax                     ; output_width
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-        mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-        add     esp, byte 7*SIZEOF_DWORD
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdmrgext-sse2-64.asm b/simd/jdmrgext-sse2-64.asm
deleted file mode 100644
index ad74c5f..0000000
--- a/simd/jdmrgext-sse2-64.asm
+++ /dev/null
@@ -1,537 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          3
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d        ; col
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        mov     rdi, r13
-        mov     rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     rdi, JSAMPROW [rdi]                             ; outptr
-
-        pop     rcx                     ; col
-
-.columnloop:
-
-        movdqa    xmm6, XMMWORD [rbx]   ; xmm6=Cb(0123456789ABCDEF)
-        movdqa    xmm7, XMMWORD [rdx]   ; xmm7=Cr(0123456789ABCDEF)
-
-        pxor      xmm1,xmm1             ; xmm1=(all 0's)
-        pcmpeqw   xmm3,xmm3
-        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        movdqa    xmm4,xmm6
-        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
-        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
-        movdqa    xmm0,xmm7
-        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
-        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
-
-        paddw     xmm6,xmm3
-        paddw     xmm4,xmm3
-        paddw     xmm7,xmm3
-        paddw     xmm0,xmm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm5,xmm6               ; xmm5=CbH
-        movdqa  xmm2,xmm4               ; xmm2=CbL
-        paddw   xmm6,xmm6               ; xmm6=2*CbH
-        paddw   xmm4,xmm4               ; xmm4=2*CbL
-        movdqa  xmm1,xmm7               ; xmm1=CrH
-        movdqa  xmm3,xmm0               ; xmm3=CrL
-        paddw   xmm7,xmm7               ; xmm7=2*CrH
-        paddw   xmm0,xmm0               ; xmm0=2*CrL
-
-        pmulhw  xmm6,[rel PW_MF0228]    ; xmm6=(2*CbH * -FIX(0.22800))
-        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbL * -FIX(0.22800))
-        pmulhw  xmm7,[rel PW_F0402]     ; xmm7=(2*CrH * FIX(0.40200))
-        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrL * FIX(0.40200))
-
-        paddw   xmm6,[rel PW_ONE]
-        paddw   xmm4,[rel PW_ONE]
-        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
-        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
-        paddw   xmm7,[rel PW_ONE]
-        paddw   xmm0,[rel PW_ONE]
-        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
-        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
-
-        paddw   xmm6,xmm5
-        paddw   xmm4,xmm2
-        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
-
-        movdqa    xmm6,xmm5
-        movdqa    xmm7,xmm2
-        punpcklwd xmm5,xmm1
-        punpckhwd xmm6,xmm1
-        pmaddwd   xmm5,[rel PW_MF0344_F0285]
-        pmaddwd   xmm6,[rel PW_MF0344_F0285]
-        punpcklwd xmm2,xmm3
-        punpckhwd xmm7,xmm3
-        pmaddwd   xmm2,[rel PW_MF0344_F0285]
-        pmaddwd   xmm7,[rel PW_MF0344_F0285]
-
-        paddd     xmm5,[rel PD_ONEHALF]
-        paddd     xmm6,[rel PD_ONEHALF]
-        psrad     xmm5,SCALEBITS
-        psrad     xmm6,SCALEBITS
-        paddd     xmm2,[rel PD_ONEHALF]
-        paddd     xmm7,[rel PD_ONEHALF]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm7,SCALEBITS
-
-        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-
-.Yloop_2nd:
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-
-.Yloop_1st:
-        movdqa  xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
-
-        pcmpeqw xmm6,xmm6
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
-        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
-
-        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
-        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
-        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
-
-        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     rcx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     rcx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_MMWORD
-        sub     rcx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_DWORD
-        sub     rcx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of rax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     rcx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [rdi], ax
-        add     rdi, byte SIZEOF_WORD
-        sub     rcx, byte SIZEOF_WORD
-        shr     rax, 16
-.column_st1:
-        ; Store the lower 1 byte of rax to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .endcolumn
-        mov     BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        cmp     rcx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_XMMWORD/8*4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .endcolumn
-        movd    XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        sfence          ; flush the write buffer
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        mov     eax, r10d
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        mov     rdi, r13
-        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
-        push    rdx                     ; inptr2
-        push    rbx                     ; inptr1
-        push    rsi                     ; inptr00
-        mov     rbx,rsp
-
-        push    rdi
-        push    rcx
-        push    rax
-
-        %ifdef WIN64
-        mov r8, rcx
-        mov r9, rdi
-        mov rcx, rax
-        mov rdx, rbx
-        %else
-        mov rdx, rcx
-        mov rcx, rdi
-        mov     rdi, rax
-        mov rsi, rbx
-        %endif
-
-        call    EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        pop rax
-        pop rcx
-        pop rdi
-        pop rsi
-        pop rbx
-        pop rdx
-
-        add     rdi, byte SIZEOF_JSAMPROW       ; outptr1
-        add     rsi, byte SIZEOF_JSAMPROW       ; inptr01
-
-        push    rdx                     ; inptr2
-        push    rbx                     ; inptr1
-        push    rsi                     ; inptr00
-        mov     rbx,rsp
-
-        push    rdi
-        push    rcx
-        push    rax
-
-        %ifdef WIN64
-        mov r8, rcx
-        mov r9, rdi
-        mov rcx, rax
-        mov rdx, rbx
-        %else
-        mov rdx, rcx
-        mov rcx, rdi
-        mov     rdi, rax
-        mov rsi, rbx
-        %endif
-
-        call    EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        pop rax
-        pop rcx
-        pop rdi
-        pop rsi
-        pop rbx
-        pop rdx
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdmrgext-sse2.asm b/simd/jdmrgext-sse2.asm
deleted file mode 100644
index b50f698..0000000
--- a/simd/jdmrgext-sse2.asm
+++ /dev/null
@@ -1,518 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          3
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [output_width(eax)]     ; col
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     edi, JSAMPROW [edi]                             ; outptr
-
-        pop     ecx                     ; col
-
-        alignx  16,7
-.columnloop:
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        movdqa    xmm6, XMMWORD [ebx]   ; xmm6=Cb(0123456789ABCDEF)
-        movdqa    xmm7, XMMWORD [edx]   ; xmm7=Cr(0123456789ABCDEF)
-
-        pxor      xmm1,xmm1             ; xmm1=(all 0's)
-        pcmpeqw   xmm3,xmm3
-        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        movdqa    xmm4,xmm6
-        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
-        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
-        movdqa    xmm0,xmm7
-        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
-        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
-
-        paddw     xmm6,xmm3
-        paddw     xmm4,xmm3
-        paddw     xmm7,xmm3
-        paddw     xmm0,xmm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm5,xmm6               ; xmm5=CbH
-        movdqa  xmm2,xmm4               ; xmm2=CbL
-        paddw   xmm6,xmm6               ; xmm6=2*CbH
-        paddw   xmm4,xmm4               ; xmm4=2*CbL
-        movdqa  xmm1,xmm7               ; xmm1=CrH
-        movdqa  xmm3,xmm0               ; xmm3=CrL
-        paddw   xmm7,xmm7               ; xmm7=2*CrH
-        paddw   xmm0,xmm0               ; xmm0=2*CrL
-
-        pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
-        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
-        pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
-        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))
-
-        paddw   xmm6,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
-        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
-        paddw   xmm7,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
-        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
-
-        paddw   xmm6,xmm5
-        paddw   xmm4,xmm2
-        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
-
-        movdqa    xmm6,xmm5
-        movdqa    xmm7,xmm2
-        punpcklwd xmm5,xmm1
-        punpckhwd xmm6,xmm1
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd xmm2,xmm3
-        punpckhwd xmm7,xmm3
-        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm5,SCALEBITS
-        psrad     xmm6,SCALEBITS
-        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm7,SCALEBITS
-
-        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-        alignx  16,7
-
-.Yloop_2nd:
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-        alignx  16,7
-
-.Yloop_1st:
-        movdqa  xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
-
-        pcmpeqw xmm6,xmm6
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
-        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
-
-        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
-        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
-        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
-
-        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     ecx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_MMWORD
-        sub     ecx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [edi], xmmA
-        add     edi, byte SIZEOF_DWORD
-        sub     ecx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi], ax
-        add     edi, byte SIZEOF_WORD
-        sub     ecx, byte SIZEOF_WORD
-        shr     eax, 16
-.column_st1:
-        ; Store the lower 1 byte of eax to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .endcolumn
-        mov     BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        cmp     ecx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_XMMWORD/8*4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .endcolumn
-        movd    XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        sfence          ; flush the write buffer
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     eax, POINTER [output_width(ebp)]
-
-        mov     edi, JSAMPIMAGE [input_buf(ebp)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(ebp)]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-        push    edx                     ; inptr2
-        push    ebx                     ; inptr1
-        push    esi                     ; inptr00
-        mov     ebx,esp
-
-        push    edi                     ; output_buf (outptr0)
-        push    ecx                     ; in_row_group_ctr
-        push    ebx                     ; input_buf
-        push    eax                     ; output_width
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-        mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        add     esp, byte 7*SIZEOF_DWORD
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdsample-mmx.asm b/simd/jdsample-mmx.asm
deleted file mode 100644
index 5e4fa7a..0000000
--- a/simd/jdsample-mmx.asm
+++ /dev/null
@@ -1,736 +0,0 @@
-;
-; jdsample.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_mmx)
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE          times 4 dw  1
-PW_TWO          times 4 dw  2
-PW_THREE        times 4 dw  3
-PW_SEVEN        times 4 dw  7
-PW_EIGHT        times 4 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                     ; colctr
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr
-
-        test    eax, SIZEOF_MMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    mm0,mm0                 ; mm0=(all 0's)
-        pcmpeqb mm7,mm7
-        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
-        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        add     eax, byte SIZEOF_MMWORD-1
-        and     eax, byte -SIZEOF_MMWORD
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        pcmpeqb mm6,mm6
-        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
-        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
-        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm2,mm1
-        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
-        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
-        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)
-
-        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
-        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)
-
-        movq    mm7,mm1
-        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
-
-        movq      mm4,mm1
-        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
-        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
-        movq      mm5,mm2
-        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
-        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
-        movq      mm6,mm3
-        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
-        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)
-
-        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm2,[GOTOFF(ebx,PW_ONE)]
-        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
-        paddw   mm3,[GOTOFF(ebx,PW_TWO)]
-        paddw   mm6,[GOTOFF(ebx,PW_TWO)]
-
-        paddw   mm2,mm1
-        paddw   mm5,mm4
-        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
-        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
-        paddw   mm3,mm1
-        paddw   mm6,mm4
-        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
-        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)
-
-        psllw   mm3,BYTE_BIT
-        psllw   mm6,BYTE_BIT
-        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
-        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
-        sub     eax, byte SIZEOF_MMWORD
-        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     edx,eax                         ; edx = original ebp
-        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(edx)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                                     ; colctr
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    eax, SIZEOF_MMWORD-1
-        jz      short .skip
-        push    edx
-        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     edx
-.skip:
-        ; -- process the first column block
-
-        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
-        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
-        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      mm3,mm3               ; mm3=(all 0's)
-        movq      mm4,mm0
-        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
-        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
-        movq      mm5,mm1
-        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
-        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
-        movq      mm6,mm2
-        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
-        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)
-
-        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-
-        pcmpeqb mm7,mm7
-        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
-        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
-        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
-        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
-        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
-
-        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
-        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
-        pand    mm1,mm7                 ; mm1=( 0 - - -)
-        pand    mm2,mm7                 ; mm2=( 0 - - -)
-
-        movq    MMWORD [wk(0)], mm1
-        movq    MMWORD [wk(1)], mm2
-
-        poppic  ebx
-
-        add     eax, byte SIZEOF_MMWORD-1
-        and     eax, byte -SIZEOF_MMWORD
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pcmpeqb mm1,mm1
-        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
-        movq    mm2,mm1
-
-        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
-        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
-
-        movq    MMWORD [wk(2)], mm1
-        movq    MMWORD [wk(3)], mm2
-
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        ; -- process the next column block
-
-        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
-        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      mm3,mm3               ; mm3=(all 0's)
-        movq      mm4,mm0
-        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
-        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
-        movq      mm5,mm1
-        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
-        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
-        movq      mm6,mm2
-        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
-        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)
-
-        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-
-        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
-        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
-        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
-        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
-
-        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
-        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
-        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
-        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
-
-        movq    MMWORD [wk(2)], mm1
-        movq    MMWORD [wk(3)], mm2
-
-.upsample:
-        ; -- process the upper row
-
-        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
-        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)
-
-        movq    mm0,mm7
-        movq    mm4,mm3
-        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
-        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
-        movq    mm5,mm7
-        movq    mm6,mm3
-        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
-        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)
-
-        por     mm0,mm4                         ; mm0=( 1 2 3 4)
-        por     mm5,mm6                         ; mm5=( 3 4 5 6)
-
-        movq    mm1,mm7
-        movq    mm2,mm3
-        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
-        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
-        movq    mm4,mm3
-        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
-
-        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
-        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)
-
-        movq    MMWORD [wk(0)], mm4
-
-        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   mm1,mm7
-        paddw   mm5,mm3
-        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
-        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
-        paddw   mm0,mm7
-        paddw   mm2,mm3
-        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
-        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)
-
-        psllw   mm0,BYTE_BIT
-        psllw   mm2,BYTE_BIT
-        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
-        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
-        ; -- process the lower row
-
-        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
-        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)
-
-        movq    mm7,mm6
-        movq    mm3,mm4
-        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
-        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
-        movq    mm0,mm6
-        movq    mm2,mm4
-        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
-        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)
-
-        por     mm7,mm3                         ; mm7=( 1 2 3 4)
-        por     mm0,mm2                         ; mm0=( 3 4 5 6)
-
-        movq    mm1,mm6
-        movq    mm5,mm4
-        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
-        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
-        movq    mm3,mm4
-        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
-
-        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
-        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)
-
-        movq    MMWORD [wk(1)], mm3
-
-        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   mm1,mm6
-        paddw   mm0,mm4
-        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
-        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
-        paddw   mm7,mm6
-        paddw   mm5,mm4
-        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
-        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)
-
-        psllw   mm7,BYTE_BIT
-        psllw   mm5,BYTE_BIT
-        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
-        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
-        poppic  ebx
-
-        sub     eax, byte SIZEOF_MMWORD
-        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
-        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
-        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
-        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
-        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     ecx
-        pop     eax
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_mmx)
-
-EXTN(jsimd_h2v1_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_MMWORD)-1
-        and     edx, byte -(2*SIZEOF_MMWORD)
-        jz      short .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        mov     eax,edx                         ; colctr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        movq      mm1,mm0
-        punpcklbw mm0,mm0
-        punpckhbw mm1,mm1
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq      mm3,mm2
-        punpcklbw mm2,mm2
-        punpckhbw mm3,mm3
-
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_mmx)
-
-EXTN(jsimd_h2v2_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_MMWORD)-1
-        and     edx, byte -(2*SIZEOF_MMWORD)
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]                     ; inptr
-        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     eax,edx                                 ; colctr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        movq      mm1,mm0
-        punpcklbw mm0,mm0
-        punpckhbw mm1,mm1
-
-        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq      mm3,mm2
-        punpcklbw mm2,mm2
-        punpckhbw mm3,mm3
-
-        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
-        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm
deleted file mode 100644
index 1faaed6..0000000
--- a/simd/jdsample-sse2-64.asm
+++ /dev/null
@@ -1,670 +0,0 @@
-;
-; jdsample.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE          times 8 dw  1
-PW_TWO          times 8 dw  2
-PW_THREE        times 8 dw  3
-PW_SEVEN        times 8 dw  7
-PW_EIGHT        times 8 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     eax, r11d  ; colctr
-        test    rax,rax
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rax                     ; colctr
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr
-
-        test    rax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    xmm0,xmm0               ; xmm0=(all 0's)
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-1)
-        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        add     rax, byte SIZEOF_XMMWORD-1
-        and     rax, byte -SIZEOF_XMMWORD
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-
-.columnloop_last:
-        pcmpeqb xmm6,xmm6
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        jmp     short .upsample
-
-.columnloop:
-        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2,xmm1
-        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
-        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
-        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
-
-        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
-        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
-
-        movdqa  xmm7,xmm1
-        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
-        movdqa    xmm4,xmm1
-        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm2
-        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
-        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
-        movdqa    xmm6,xmm3
-        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
-        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
-
-        pmullw  xmm1,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-        paddw   xmm2,[rel PW_ONE]
-        paddw   xmm5,[rel PW_ONE]
-        paddw   xmm3,[rel PW_TWO]
-        paddw   xmm6,[rel PW_TWO]
-
-        paddw   xmm2,xmm1
-        paddw   xmm5,xmm4
-        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-        paddw   xmm3,xmm1
-        paddw   xmm6,xmm4
-        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm3,BYTE_BIT
-        psllw   xmm6,BYTE_BIT
-        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
-        sub     rax, byte SIZEOF_XMMWORD
-        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     rsi
-        pop     rdi
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rcx                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          4
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     eax, r11d  ; colctr
-        test    rax,rax
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rax                                     ; colctr
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    rax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        push    rdx
-        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     rdx
-.skip:
-        ; -- process the first column block
-
-        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
-        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
-        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-2)
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
-        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
-        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
-
-        movdqa  XMMWORD [wk(0)], xmm1
-        movdqa  XMMWORD [wk(1)], xmm2
-
-        add     rax, byte SIZEOF_XMMWORD-1
-        and     rax, byte -SIZEOF_XMMWORD
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pcmpeqb xmm1,xmm1
-        pslldq  xmm1,(SIZEOF_XMMWORD-2)
-        movdqa  xmm2,xmm1
-
-        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
-        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
-
-        jmp     near .upsample
-
-.columnloop:
-        ; -- process the next column block
-
-        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
-        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
-        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
-        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
-
-        movdqa  XMMWORD [wk(2)], xmm1
-        movdqa  XMMWORD [wk(3)], xmm2
-
-.upsample:
-        ; -- process the upper row
-
-        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
-        movdqa  xmm5,xmm7
-        movdqa  xmm6,xmm3
-        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
-
-        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
-        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm2,xmm3
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm4,xmm3
-        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(0)], xmm4
-
-        pmullw  xmm7,[rel PW_THREE]
-        pmullw  xmm3,[rel PW_THREE]
-        paddw   xmm1,[rel PW_EIGHT]
-        paddw   xmm5,[rel PW_EIGHT]
-        paddw   xmm0,[rel PW_SEVEN]
-        paddw   xmm2,[rel PW_SEVEN]
-
-        paddw   xmm1,xmm7
-        paddw   xmm5,xmm3
-        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm3
-        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm0,BYTE_BIT
-        psllw   xmm2,BYTE_BIT
-        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
-        ; -- process the lower row
-
-        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
-        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
-        movdqa  xmm0,xmm6
-        movdqa  xmm2,xmm4
-        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
-
-        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
-        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm4
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm3,xmm4
-        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(1)], xmm3
-
-        pmullw  xmm6,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-        paddw   xmm1,[rel PW_EIGHT]
-        paddw   xmm0,[rel PW_EIGHT]
-        paddw   xmm7,[rel PW_SEVEN]
-        paddw   xmm5,[rel PW_SEVEN]
-
-        paddw   xmm1,xmm6
-        paddw   xmm0,xmm4
-        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm7,xmm6
-        paddw   xmm5,xmm4
-        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm7,BYTE_BIT
-        psllw   xmm5,BYTE_BIT
-        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
-        sub     rax, byte SIZEOF_XMMWORD
-        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
-        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
-        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
-        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    rax,rax
-        jnz     near .columnloop_last
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     rcx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     edx, r11d
-        add     rdx, byte (2*SIZEOF_XMMWORD)-1
-        and     rdx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      short .return
-
-        mov     rsi, r12 ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]             ; inptr
-        mov     rdi, JSAMPROW [rdi]             ; outptr
-        mov     rax,rdx                         ; colctr
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
-        jmp     short .columnloop
-
-.nextrow:
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rcx                             ; rowctr
-        jg      short .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        mov     edx, r11d
-        add     rdx, byte (2*SIZEOF_XMMWORD)-1
-        and     rdx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]                     ; inptr
-        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     rax,rdx                                 ; colctr
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
-        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
-        jmp     short .columnloop
-
-.nextrow:
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     rcx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jdsample-sse2.asm b/simd/jdsample-sse2.asm
deleted file mode 100644
index 1d0059e..0000000
--- a/simd/jdsample-sse2.asm
+++ /dev/null
@@ -1,728 +0,0 @@
-;
-; jdsample.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE          times 8 dw  1
-PW_TWO          times 8 dw  2
-PW_THREE        times 8 dw  3
-PW_SEVEN        times 8 dw  7
-PW_EIGHT        times 8 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                     ; colctr
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr
-
-        test    eax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    xmm0,xmm0               ; xmm0=(all 0's)
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-1)
-        pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        add     eax, byte SIZEOF_XMMWORD-1
-        and     eax, byte -SIZEOF_XMMWORD
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        pcmpeqb xmm6,xmm6
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-        pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2,xmm1
-        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
-        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
-        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
-
-        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
-        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
-
-        movdqa  xmm7,xmm1
-        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
-        movdqa    xmm4,xmm1
-        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm2
-        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
-        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
-        movdqa    xmm6,xmm3
-        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
-        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
-
-        pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm2,[GOTOFF(ebx,PW_ONE)]
-        paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
-        paddw   xmm3,[GOTOFF(ebx,PW_TWO)]
-        paddw   xmm6,[GOTOFF(ebx,PW_TWO)]
-
-        paddw   xmm2,xmm1
-        paddw   xmm5,xmm4
-        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-        paddw   xmm3,xmm1
-        paddw   xmm6,xmm4
-        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm3,BYTE_BIT
-        psllw   xmm6,BYTE_BIT
-        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
-        sub     eax, byte SIZEOF_XMMWORD
-        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     edx,eax                         ; edx = original ebp
-        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(edx)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                                     ; colctr
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    eax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        push    edx
-        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     edx
-.skip:
-        ; -- process the first column block
-
-        movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
-        movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
-        movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-2)
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
-        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
-        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
-
-        movdqa  XMMWORD [wk(0)], xmm1
-        movdqa  XMMWORD [wk(1)], xmm2
-
-        poppic  ebx
-
-        add     eax, byte SIZEOF_XMMWORD-1
-        and     eax, byte -SIZEOF_XMMWORD
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pcmpeqb xmm1,xmm1
-        pslldq  xmm1,(SIZEOF_XMMWORD-2)
-        movdqa  xmm2,xmm1
-
-        pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
-        pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
-        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
-
-        jmp     near .upsample
-        alignx  16,7
-
-.columnloop:
-        ; -- process the next column block
-
-        movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
-        movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
-        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
-        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
-
-        movdqa  XMMWORD [wk(2)], xmm1
-        movdqa  XMMWORD [wk(3)], xmm2
-
-.upsample:
-        ; -- process the upper row
-
-        movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
-        movdqa  xmm5,xmm7
-        movdqa  xmm6,xmm3
-        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
-
-        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
-        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm2,xmm3
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm4,xmm3
-        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(0)], xmm4
-
-        pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   xmm1,xmm7
-        paddw   xmm5,xmm3
-        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm3
-        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm0,BYTE_BIT
-        psllw   xmm2,BYTE_BIT
-        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
-        ; -- process the lower row
-
-        movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
-        movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
-        movdqa  xmm0,xmm6
-        movdqa  xmm2,xmm4
-        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
-
-        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
-        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm4
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm3,xmm4
-        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(1)], xmm3
-
-        pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   xmm1,xmm6
-        paddw   xmm0,xmm4
-        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm7,xmm6
-        paddw   xmm5,xmm4
-        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm7,BYTE_BIT
-        psllw   xmm5,BYTE_BIT
-        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
-        poppic  ebx
-
-        sub     eax, byte SIZEOF_XMMWORD
-        add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
-        add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
-        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
-        add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     ecx
-        pop     eax
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_XMMWORD)-1
-        and     edx, byte -(2*SIZEOF_XMMWORD)
-        jz      short .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        mov     eax,edx                         ; colctr
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      short .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_XMMWORD)-1
-        and     edx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]                     ; inptr
-        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     eax,edx                                 ; colctr
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     ebx, byte 4*SIZEOF_XMMWORD      ; outptr0
-        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr1
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      short .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctflt-3dn.asm b/simd/jfdctflt-3dn.asm
deleted file mode 100644
index 2191618..0000000
--- a/simd/jfdctflt-3dn.asm
+++ /dev/null
@@ -1,319 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_3dnow)
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382        times 2 dd  0.382683432365089771728460
-PD_0_707        times 2 dd  0.707106781186547524400844
-PD_0_541        times 2 dd  0.541196100146196984399723
-PD_1_306        times 2 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
-;
-
-%define data(b)         (b)+8           ; FAST_FLOAT *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_3dnow)
-
-EXTN(jsimd_fdct_float_3dnow):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
-        movq      mm4,mm0               ; transpose coefficients
-        punpckldq mm0,mm1               ; mm0=(00 10)=data0
-        punpckhdq mm4,mm1               ; mm4=(01 11)=data1
-        movq      mm5,mm2               ; transpose coefficients
-        punpckldq mm2,mm3               ; mm2=(06 16)=data6
-        punpckhdq mm5,mm3               ; mm5=(07 17)=data7
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
-        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
-        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
-        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
-
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
-
-        movq      mm4,mm1               ; transpose coefficients
-        punpckldq mm1,mm3               ; mm1=(02 12)=data2
-        punpckhdq mm4,mm3               ; mm4=(03 13)=data3
-        movq      mm0,mm2               ; transpose coefficients
-        punpckldq mm2,mm5               ; mm2=(04 14)=data4
-        punpckhdq mm0,mm5               ; mm0=(05 15)=data5
-
-        movq    mm3,mm4
-        movq    mm5,mm1
-        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
-        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
-        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
-        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm2,mm7
-        movq    mm0,mm6
-        pfsub   mm7,mm4                 ; mm7=tmp13
-        pfsub   mm6,mm1                 ; mm6=tmp12
-        pfadd   mm2,mm4                 ; mm2=tmp10
-        pfadd   mm0,mm1                 ; mm0=tmp11
-
-        pfadd   mm6,mm7
-        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-        movq    mm4,mm2
-        movq    mm1,mm7
-        pfsub   mm2,mm0                 ; mm2=data4
-        pfsub   mm7,mm6                 ; mm7=data6
-        pfadd   mm4,mm0                 ; mm4=data0
-        pfadd   mm1,mm6                 ; mm1=data2
-
-        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
-
-        pfadd   mm3,mm5                 ; mm3=tmp10
-        pfadd   mm5,mm0                 ; mm5=tmp11
-        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
-
-        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-        movq    mm2,mm3                 ; mm2=tmp10
-        pfsub   mm3,mm0
-        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-        pfadd   mm2,mm3                 ; mm2=z2
-        pfadd   mm0,mm3                 ; mm0=z4
-
-        movq    mm7,mm6
-        pfsub   mm6,mm5                 ; mm6=z13
-        pfadd   mm7,mm5                 ; mm7=z11
-
-        movq    mm4,mm6
-        movq    mm1,mm7
-        pfsub   mm6,mm2                 ; mm6=data3
-        pfsub   mm7,mm0                 ; mm7=data7
-        pfadd   mm4,mm2                 ; mm4=data5
-        pfadd   mm1,mm0                 ; mm1=data1
-
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
-        movq      mm4,mm0               ; transpose coefficients
-        punpckldq mm0,mm1               ; mm0=(00 01)=data0
-        punpckhdq mm4,mm1               ; mm4=(10 11)=data1
-        movq      mm5,mm2               ; transpose coefficients
-        punpckldq mm2,mm3               ; mm2=(60 61)=data6
-        punpckhdq mm5,mm3               ; mm5=(70 71)=data7
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
-        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
-        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
-        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
-
-        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
-
-        movq      mm4,mm1               ; transpose coefficients
-        punpckldq mm1,mm3               ; mm1=(20 21)=data2
-        punpckhdq mm4,mm3               ; mm4=(30 31)=data3
-        movq      mm0,mm2               ; transpose coefficients
-        punpckldq mm2,mm5               ; mm2=(40 41)=data4
-        punpckhdq mm0,mm5               ; mm0=(50 51)=data5
-
-        movq    mm3,mm4
-        movq    mm5,mm1
-        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
-        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
-        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
-        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm2,mm7
-        movq    mm0,mm6
-        pfsub   mm7,mm4                 ; mm7=tmp13
-        pfsub   mm6,mm1                 ; mm6=tmp12
-        pfadd   mm2,mm4                 ; mm2=tmp10
-        pfadd   mm0,mm1                 ; mm0=tmp11
-
-        pfadd   mm6,mm7
-        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-        movq    mm4,mm2
-        movq    mm1,mm7
-        pfsub   mm2,mm0                 ; mm2=data4
-        pfsub   mm7,mm6                 ; mm7=data6
-        pfadd   mm4,mm0                 ; mm4=data0
-        pfadd   mm1,mm6                 ; mm1=data2
-
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
-
-        pfadd   mm3,mm5                 ; mm3=tmp10
-        pfadd   mm5,mm0                 ; mm5=tmp11
-        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
-
-        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-        movq    mm2,mm3                 ; mm2=tmp10
-        pfsub   mm3,mm0
-        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-        pfadd   mm2,mm3                 ; mm2=z2
-        pfadd   mm0,mm3                 ; mm0=z4
-
-        movq    mm7,mm6
-        pfsub   mm6,mm5                 ; mm6=z13
-        pfadd   mm7,mm5                 ; mm7=z11
-
-        movq    mm4,mm6
-        movq    mm1,mm7
-        pfsub   mm6,mm2                 ; mm6=data3
-        pfsub   mm7,mm0                 ; mm7=data7
-        pfadd   mm4,mm2                 ; mm4=data5
-        pfadd   mm1,mm0                 ; mm1=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        add     edx, byte 2*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .columnloop
-
-        femms           ; empty MMX/3DNow! state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm
deleted file mode 100644
index 4b64ea4..0000000
--- a/simd/jfdctflt-sse-64.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382        times 4 dd  0.382683432365089771728460
-PD_0_707        times 4 dd  0.707106781186547524400844
-PD_0_541        times 4 dd  0.541196100146196984399723
-PD_1_306        times 4 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-; r10 = FAST_FLOAT *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (FAST_FLOAT *)
-        mov     rcx, DCTSIZE/4
-.rowloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
-        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
-        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
-        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
-        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
-        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     rdx, r10        ; (FAST_FLOAT *)
-        mov     rcx, DCTSIZE/4
-.columnloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
-        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
-        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
-        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
-        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
-        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     rdx, byte 4*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     near .columnloop
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctflt-sse.asm b/simd/jfdctflt-sse.asm
deleted file mode 100644
index e7ede26..0000000
--- a/simd/jfdctflt-sse.asm
+++ /dev/null
@@ -1,369 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382        times 4 dd  0.382683432365089771728460
-PD_0_707        times 4 dd  0.707106781186547524400844
-PD_0_541        times 4 dd  0.541196100146196984399723
-PD_1_306        times 4 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-%define data(b)         (b)+8           ; FAST_FLOAT *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
-        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
-        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
-        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
-        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
-        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
-        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
-        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
-        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     edx, byte 4*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .columnloop
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctfst-mmx.asm b/simd/jfdctfst-mmx.asm
deleted file mode 100644
index eb2eb9c..0000000
--- a/simd/jfdctfst-mmx.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_mmx)
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707        times 4 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 4 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 4 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 4 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_mmx)
-
-EXTN(jsimd_fdct_ifast_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
-        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
-        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
-        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
-        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
-        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
-        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
-        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
-        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        psubw   mm5,mm7                 ; mm5=tmp13
-        psubw   mm0,mm4                 ; mm0=tmp12
-        paddw   mm1,mm7                 ; mm1=tmp10
-        paddw   mm6,mm4                 ; mm6=tmp11
-
-        paddw   mm0,mm5
-        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-        movq    mm7,mm1
-        movq    mm4,mm5
-        psubw   mm1,mm6                 ; mm1=data4
-        psubw   mm5,mm0                 ; mm5=data6
-        paddw   mm7,mm6                 ; mm7=data0
-        paddw   mm4,mm0                 ; mm4=data2
-
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-        ; -- Odd part
-
-        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
-
-        paddw   mm2,mm3                 ; mm2=tmp10
-        paddw   mm3,mm6                 ; mm3=tmp11
-        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
-
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-        movq    mm1,mm2                 ; mm1=tmp10
-        psubw   mm2,mm6
-        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-        paddw   mm1,mm2                 ; mm1=z2
-        paddw   mm6,mm2                 ; mm6=z4
-
-        movq    mm5,mm0
-        psubw   mm0,mm3                 ; mm0=z13
-        paddw   mm5,mm3                 ; mm5=z11
-
-        movq    mm7,mm0
-        movq    mm4,mm5
-        psubw   mm0,mm1                 ; mm0=data3
-        psubw   mm5,mm6                 ; mm5=data7
-        paddw   mm7,mm1                 ; mm7=data5
-        paddw   mm4,mm6                 ; mm4=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
-        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
-        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
-        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
-        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
-        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
-        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
-        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
-        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
-        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        psubw   mm5,mm7                 ; mm5=tmp13
-        psubw   mm0,mm4                 ; mm0=tmp12
-        paddw   mm1,mm7                 ; mm1=tmp10
-        paddw   mm6,mm4                 ; mm6=tmp11
-
-        paddw   mm0,mm5
-        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-        movq    mm7,mm1
-        movq    mm4,mm5
-        psubw   mm1,mm6                 ; mm1=data4
-        psubw   mm5,mm0                 ; mm5=data6
-        paddw   mm7,mm6                 ; mm7=data0
-        paddw   mm4,mm0                 ; mm4=data2
-
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-        ; -- Odd part
-
-        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
-
-        paddw   mm2,mm3                 ; mm2=tmp10
-        paddw   mm3,mm6                 ; mm3=tmp11
-        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
-
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-        movq    mm1,mm2                 ; mm1=tmp10
-        psubw   mm2,mm6
-        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-        paddw   mm1,mm2                 ; mm1=z2
-        paddw   mm6,mm2                 ; mm6=z4
-
-        movq    mm5,mm0
-        psubw   mm0,mm3                 ; mm0=z13
-        paddw   mm5,mm3                 ; mm5=z11
-
-        movq    mm7,mm0
-        movq    mm4,mm5
-        psubw   mm0,mm1                 ; mm0=data3
-        psubw   mm5,mm6                 ; mm5=data7
-        paddw   mm7,mm1                 ; mm7=data5
-        paddw   mm4,mm6                 ; mm4=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-        add     edx, byte 4*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .columnloop
-
-        emms            ; empty MMX state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm
deleted file mode 100644
index 4c96685..0000000
--- a/simd/jfdctfst-sse2-64.asm
+++ /dev/null
@@ -1,391 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        psubw   xmm3,xmm1               ; xmm3=tmp13
-        psubw   xmm6,xmm7               ; xmm6=tmp12
-        paddw   xmm4,xmm1               ; xmm4=tmp10
-        paddw   xmm0,xmm7               ; xmm0=tmp11
-
-        paddw   xmm6,xmm3
-        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm6,[rel PW_F0707] ; xmm6=z1
-
-        movdqa  xmm1,xmm4
-        movdqa  xmm7,xmm3
-        psubw   xmm4,xmm0               ; xmm4=data4
-        psubw   xmm3,xmm6               ; xmm3=data6
-        paddw   xmm1,xmm0               ; xmm1=data0
-        paddw   xmm7,xmm6               ; xmm7=data2
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
-
-        ; -- Odd part
-
-        paddw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm5,xmm0               ; xmm5=tmp11
-        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z3
-
-        movdqa  xmm4,xmm2               ; xmm4=tmp10
-        psubw   xmm2,xmm0
-        pmulhw  xmm2,[rel PW_F0382] ; xmm2=z5
-        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm2               ; xmm4=z2
-        paddw   xmm0,xmm2               ; xmm0=z4
-
-        movdqa  xmm3,xmm6
-        psubw   xmm6,xmm5               ; xmm6=z13
-        paddw   xmm3,xmm5               ; xmm3=z11
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm5,xmm3
-        psubw   xmm6,xmm4               ; xmm6=data3
-        psubw   xmm3,xmm0               ; xmm3=data7
-        paddw   xmm2,xmm4               ; xmm2=data5
-        paddw   xmm5,xmm0               ; xmm5=data1
-
-        ; ---- Pass 2: process columns.
-
-        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
-        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
-
-        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
-        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
-        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
-        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
-        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm3,xmm1
-        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
-        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
-        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
-        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
-
-        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm7,xmm6
-        movdqa  xmm0,xmm2
-        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
-        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
-        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
-        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm1,xmm5
-        psubw   xmm3,xmm6               ; xmm3=tmp13
-        psubw   xmm5,xmm2               ; xmm5=tmp12
-        paddw   xmm4,xmm6               ; xmm4=tmp10
-        paddw   xmm1,xmm2               ; xmm1=tmp11
-
-        paddw   xmm5,xmm3
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z1
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm2,xmm3
-        psubw   xmm4,xmm1               ; xmm4=data4
-        psubw   xmm3,xmm5               ; xmm3=data6
-        paddw   xmm6,xmm1               ; xmm6=data0
-        paddw   xmm2,xmm5               ; xmm2=data2
-
-        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-        ; -- Odd part
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        paddw   xmm7,xmm0               ; xmm7=tmp10
-        paddw   xmm0,xmm1               ; xmm0=tmp11
-        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
-
-        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm0,[rel PW_F0707] ; xmm0=z3
-
-        movdqa  xmm4,xmm7               ; xmm4=tmp10
-        psubw   xmm7,xmm1
-        pmulhw  xmm7,[rel PW_F0382] ; xmm7=z5
-        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm7               ; xmm4=z2
-        paddw   xmm1,xmm7               ; xmm1=z4
-
-        movdqa  xmm3,xmm5
-        psubw   xmm5,xmm0               ; xmm5=z13
-        paddw   xmm3,xmm0               ; xmm3=z11
-
-        movdqa  xmm6,xmm5
-        movdqa  xmm2,xmm3
-        psubw   xmm5,xmm4               ; xmm5=data3
-        psubw   xmm3,xmm1               ; xmm3=data7
-        paddw   xmm6,xmm4               ; xmm6=data5
-        paddw   xmm2,xmm1               ; xmm2=data1
-
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
-        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctfst-sse2.asm b/simd/jfdctfst-sse2.asm
deleted file mode 100644
index 54856a2..0000000
--- a/simd/jfdctfst-sse2.asm
+++ /dev/null
@@ -1,403 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        psubw   xmm3,xmm1               ; xmm3=tmp13
-        psubw   xmm6,xmm7               ; xmm6=tmp12
-        paddw   xmm4,xmm1               ; xmm4=tmp10
-        paddw   xmm0,xmm7               ; xmm0=tmp11
-
-        paddw   xmm6,xmm3
-        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
-        movdqa  xmm1,xmm4
-        movdqa  xmm7,xmm3
-        psubw   xmm4,xmm0               ; xmm4=data4
-        psubw   xmm3,xmm6               ; xmm3=data6
-        paddw   xmm1,xmm0               ; xmm1=data0
-        paddw   xmm7,xmm6               ; xmm7=data2
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
-
-        ; -- Odd part
-
-        paddw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm5,xmm0               ; xmm5=tmp11
-        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
-        movdqa  xmm4,xmm2               ; xmm4=tmp10
-        psubw   xmm2,xmm0
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm2               ; xmm4=z2
-        paddw   xmm0,xmm2               ; xmm0=z4
-
-        movdqa  xmm3,xmm6
-        psubw   xmm6,xmm5               ; xmm6=z13
-        paddw   xmm3,xmm5               ; xmm3=z11
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm5,xmm3
-        psubw   xmm6,xmm4               ; xmm6=data3
-        psubw   xmm3,xmm0               ; xmm3=data7
-        paddw   xmm2,xmm4               ; xmm2=data5
-        paddw   xmm5,xmm0               ; xmm5=data1
-
-        ; ---- Pass 2: process columns.
-
-;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
-        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
-
-        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
-        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
-        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
-        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
-        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm3,xmm1
-        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
-        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
-        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
-        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
-
-        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm7,xmm6
-        movdqa  xmm0,xmm2
-        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
-        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
-        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
-        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm1,xmm5
-        psubw   xmm3,xmm6               ; xmm3=tmp13
-        psubw   xmm5,xmm2               ; xmm5=tmp12
-        paddw   xmm4,xmm6               ; xmm4=tmp10
-        paddw   xmm1,xmm2               ; xmm1=tmp11
-
-        paddw   xmm5,xmm3
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm2,xmm3
-        psubw   xmm4,xmm1               ; xmm4=data4
-        psubw   xmm3,xmm5               ; xmm3=data6
-        paddw   xmm6,xmm1               ; xmm6=data0
-        paddw   xmm2,xmm5               ; xmm2=data2
-
-        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
-        ; -- Odd part
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        paddw   xmm7,xmm0               ; xmm7=tmp10
-        paddw   xmm0,xmm1               ; xmm0=tmp11
-        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
-
-        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
-        movdqa  xmm4,xmm7               ; xmm4=tmp10
-        psubw   xmm7,xmm1
-        pmulhw  xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm7               ; xmm4=z2
-        paddw   xmm1,xmm7               ; xmm1=z4
-
-        movdqa  xmm3,xmm5
-        psubw   xmm5,xmm0               ; xmm5=z13
-        paddw   xmm3,xmm0               ; xmm3=z11
-
-        movdqa  xmm6,xmm5
-        movdqa  xmm2,xmm3
-        psubw   xmm5,xmm4               ; xmm5=data3
-        psubw   xmm3,xmm1               ; xmm3=data7
-        paddw   xmm6,xmm4               ; xmm6=data5
-        paddw   xmm2,xmm1               ; xmm2=data1
-
-        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
-        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctint-altivec.c b/simd/jfdctint-altivec.c
deleted file mode 100644
index e6e8a56..0000000
--- a/simd/jfdctint-altivec.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446   /* FIX(0.298631336) */
-#define F_0_390 3196   /* FIX(0.390180644) */
-#define F_0_541 4433   /* FIX(0.541196100) */
-#define F_0_765 6270   /* FIX(0.765366865) */
-#define F_0_899 7373   /* FIX(0.899976223) */
-#define F_1_175 9633   /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-
-#define DO_FDCT_COMMON(PASS)  \
-{  \
-  /* (Original)  \
-   * z1 = (tmp12 + tmp13) * 0.541196100;  \
-   * data2 = z1 + tmp13 * 0.765366865;  \
-   * data6 = z1 + tmp12 * -1.847759065;  \
-   *  \
-   * (This implementation)  \
-   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
-   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
-   */  \
-  \
-  tmp1312l = vec_mergeh(tmp13, tmp12);  \
-  tmp1312h = vec_mergel(tmp13, tmp12);  \
-  \
-  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
-  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
-  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
-  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
-  \
-  out2l = vec_sra(out2l, descale_p##PASS);  \
-  out2h = vec_sra(out2h, descale_p##PASS);  \
-  out6l = vec_sra(out6l, descale_p##PASS);  \
-  out6h = vec_sra(out6h, descale_p##PASS);  \
-  \
-  out2 = vec_pack(out2l, out2h);  \
-  out6 = vec_pack(out6l, out6h);  \
-  \
-  /* Odd part */  \
-  \
-  z3 = vec_add(tmp4, tmp6);  \
-  z4 = vec_add(tmp5, tmp7);  \
-  \
-  /* (Original)  \
-   * z5 = (z3 + z4) * 1.175875602;  \
-   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
-   * z3 += z5;  z4 += z5;  \
-   *  \
-   * (This implementation)  \
-   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
-   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
-   */  \
-  \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
-  \
-  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
-  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
-  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
-  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
-  \
-  /* (Original)  \
-   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
-   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
-   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
-   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
-   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
-   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
-   *  \
-   * (This implementation)  \
-   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
-   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
-   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
-   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
-   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
-   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
-   */  \
-  \
-  tmp47l = vec_mergeh(tmp4, tmp7);  \
-  tmp47h = vec_mergel(tmp4, tmp7);  \
-  \
-  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
-  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
-  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
-  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
-  \
-  out7l = vec_sra(out7l, descale_p##PASS);  \
-  out7h = vec_sra(out7h, descale_p##PASS);  \
-  out1l = vec_sra(out1l, descale_p##PASS);  \
-  out1h = vec_sra(out1h, descale_p##PASS);  \
-  \
-  out7 = vec_pack(out7l, out7h);  \
-  out1 = vec_pack(out1l, out1h);  \
-  \
-  tmp56l = vec_mergeh(tmp5, tmp6);  \
-  tmp56h = vec_mergel(tmp5, tmp6);  \
-  \
-  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
-  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
-  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
-  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
-  \
-  out5l = vec_sra(out5l, descale_p##PASS);  \
-  out5h = vec_sra(out5h, descale_p##PASS);  \
-  out3l = vec_sra(out3l, descale_p##PASS);  \
-  out3h = vec_sra(out3h, descale_p##PASS);  \
-  \
-  out5 = vec_pack(out5l, out5h);  \
-  out3 = vec_pack(out3l, out3h);  \
-}
-
-#define DO_FDCT_PASS1()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_sl(out0, pass1_bits);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_sl(out4, pass1_bits);  \
-  \
-  DO_FDCT_COMMON(1);  \
-}
-
-#define DO_FDCT_PASS2()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_add(out0, pw_descale_p2x);  \
-  out0  = vec_sra(out0, pass1_bits);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_add(out4, pw_descale_p2x);  \
-  out4  = vec_sra(out4, pass1_bits);  \
-  \
-  DO_FDCT_COMMON(2);  \
-}
-
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
-    z3, z4, z34l, z34h,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int z3l, z3h, z4l, z4h,
-    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
-    out7l, out7h;
-
-  /* Constants */
-  __vector short
-    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
-    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
-    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
-    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
-    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
-    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
-    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
-    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
-    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
-  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
-  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
-    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
-  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
-    descale_p2 = { __4X(DESCALE_P2) };
-
-  /* Pass 1: process rows */
-
-  row0 = vec_ld(0, data);
-  row1 = vec_ld(16, data);
-  row2 = vec_ld(32, data);
-  row3 = vec_ld(48, data);
-  row4 = vec_ld(64, data);
-  row5 = vec_ld(80, data);
-  row6 = vec_ld(96, data);
-  row7 = vec_ld(112, data);
-
-  TRANSPOSE(row, col);
-
-  tmp0 = vec_add(col0, col7);
-  tmp7 = vec_sub(col0, col7);
-  tmp1 = vec_add(col1, col6);
-  tmp6 = vec_sub(col1, col6);
-  tmp2 = vec_add(col2, col5);
-  tmp5 = vec_sub(col2, col5);
-  tmp3 = vec_add(col3, col4);
-  tmp4 = vec_sub(col3, col4);
-
-  DO_FDCT_PASS1();
-
-  /* Pass 2: process columns */
-
-  TRANSPOSE(out, row);
-
-  tmp0 = vec_add(row0, row7);
-  tmp7 = vec_sub(row0, row7);
-  tmp1 = vec_add(row1, row6);
-  tmp6 = vec_sub(row1, row6);
-  tmp2 = vec_add(row2, row5);
-  tmp5 = vec_sub(row2, row5);
-  tmp3 = vec_add(row3, row4);
-  tmp4 = vec_sub(row3, row4);
-
-  DO_FDCT_PASS2();
-
-  vec_st(out0, 0, data);
-  vec_st(out1, 16, data);
-  vec_st(out2, 32, data);
-  vec_st(out3, 48, data);
-  vec_st(out4, 64, data);
-  vec_st(out5, 80, data);
-  vec_st(out6, 96, data);
-  vec_st(out7, 112, data);
-}
diff --git a/simd/jfdctint-mmx.asm b/simd/jfdctint-mmx.asm
deleted file mode 100644
index 9142ad8..0000000
--- a/simd/jfdctint-mmx.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_mmx)
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_mmx)
-
-EXTN(jsimd_fdct_islow_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
-        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
-        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
-        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
-        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
-        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
-        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
-        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
-        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        paddw   mm5,mm7                 ; mm5=tmp10
-        paddw   mm0,mm4                 ; mm0=tmp11
-        psubw   mm1,mm7                 ; mm1=tmp13
-        psubw   mm6,mm4                 ; mm6=tmp12
-
-        movq    mm7,mm5
-        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
-        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
-
-        psllw   mm5,PASS1_BITS          ; mm5=data0
-        psllw   mm7,PASS1_BITS          ; mm7=data4
-
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movq      mm4,mm1               ; mm1=tmp13
-        movq      mm0,mm1
-        punpcklwd mm4,mm6               ; mm6=tmp12
-        punpckhwd mm0,mm6
-        movq      mm1,mm4
-        movq      mm6,mm0
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm4,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm1,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm4,mm0               ; mm4=data2
-        packssdw  mm1,mm6               ; mm1=data6
-
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
-        ; -- Odd part
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
-
-        movq    mm0,mm2                 ; mm2=tmp4
-        movq    mm6,mm3                 ; mm3=tmp5
-        paddw   mm0,mm5                 ; mm0=z3
-        paddw   mm6,mm7                 ; mm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm4,mm0
-        movq      mm1,mm0
-        punpcklwd mm4,mm6
-        punpckhwd mm1,mm6
-        movq      mm0,mm4
-        movq      mm6,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
-        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movq      mm4,mm2
-        movq      mm1,mm2
-        punpcklwd mm4,mm7
-        punpckhwd mm1,mm7
-        movq      mm2,mm4
-        movq      mm7,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
-
-        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
-        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
-        paddd   mm2,mm0                 ; mm2=data1L
-        paddd   mm7,mm6                 ; mm7=data1H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm4,DESCALE_P1
-        psrad   mm1,DESCALE_P1
-        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm2,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-
-        packssdw  mm4,mm1               ; mm4=data7
-        packssdw  mm2,mm7               ; mm2=data1
-
-        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-        movq      mm1,mm3
-        movq      mm7,mm3
-        punpcklwd mm1,mm5
-        punpckhwd mm7,mm5
-        movq      mm3,mm1
-        movq      mm5,mm7
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
-
-        paddd   mm1,mm0                 ; mm1=data5L
-        paddd   mm7,mm6                 ; mm7=data5H
-        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
-        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
-
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm1,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm3,DESCALE_P1
-        psrad   mm5,DESCALE_P1
-
-        packssdw  mm1,mm7               ; mm1=data5
-        packssdw  mm3,mm5               ; mm3=data3
-
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
-        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
-        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
-        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
-        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
-        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
-        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
-        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
-        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
-        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        paddw   mm5,mm7                 ; mm5=tmp10
-        paddw   mm0,mm4                 ; mm0=tmp11
-        psubw   mm1,mm7                 ; mm1=tmp13
-        psubw   mm6,mm4                 ; mm6=tmp12
-
-        movq    mm7,mm5
-        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
-        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
-
-        paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        psraw   mm5,PASS1_BITS          ; mm5=data0
-        psraw   mm7,PASS1_BITS          ; mm7=data4
-
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movq      mm4,mm1               ; mm1=tmp13
-        movq      mm0,mm1
-        punpcklwd mm4,mm6               ; mm6=tmp12
-        punpckhwd mm0,mm6
-        movq      mm1,mm4
-        movq      mm6,mm0
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm4,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm1,DESCALE_P2
-        psrad   mm6,DESCALE_P2
-
-        packssdw  mm4,mm0               ; mm4=data2
-        packssdw  mm1,mm6               ; mm1=data6
-
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
-        ; -- Odd part
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
-
-        movq    mm0,mm2                 ; mm2=tmp4
-        movq    mm6,mm3                 ; mm3=tmp5
-        paddw   mm0,mm5                 ; mm0=z3
-        paddw   mm6,mm7                 ; mm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm4,mm0
-        movq      mm1,mm0
-        punpcklwd mm4,mm6
-        punpckhwd mm1,mm6
-        movq      mm0,mm4
-        movq      mm6,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
-        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movq      mm4,mm2
-        movq      mm1,mm2
-        punpcklwd mm4,mm7
-        punpckhwd mm1,mm7
-        movq      mm2,mm4
-        movq      mm7,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
-
-        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
-        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
-        paddd   mm2,mm0                 ; mm2=data1L
-        paddd   mm7,mm6                 ; mm7=data1H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm4,DESCALE_P2
-        psrad   mm1,DESCALE_P2
-        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm2,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-
-        packssdw  mm4,mm1               ; mm4=data7
-        packssdw  mm2,mm7               ; mm2=data1
-
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-        movq      mm1,mm3
-        movq      mm7,mm3
-        punpcklwd mm1,mm5
-        punpckhwd mm7,mm5
-        movq      mm3,mm1
-        movq      mm5,mm7
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
-
-        paddd   mm1,mm0                 ; mm1=data5L
-        paddd   mm7,mm6                 ; mm7=data5H
-        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
-        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
-
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm1,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm3,DESCALE_P2
-        psrad   mm5,DESCALE_P2
-
-        packssdw  mm1,mm7               ; mm1=data5
-        packssdw  mm3,mm5               ; mm3=data3
-
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-        add     edx, byte 4*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .columnloop
-
-        emms            ; empty MMX state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm
deleted file mode 100644
index 9a0ca0f..0000000
--- a/simd/jfdctint-sse2-64.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          6
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        paddw   xmm3,xmm1               ; xmm3=tmp10
-        paddw   xmm6,xmm7               ; xmm6=tmp11
-        psubw   xmm4,xmm1               ; xmm4=tmp13
-        psubw   xmm0,xmm7               ; xmm0=tmp12
-
-        movdqa  xmm1,xmm3
-        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
-        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
-
-        psllw   xmm3,PASS1_BITS         ; xmm3=data0
-        psllw   xmm1,PASS1_BITS         ; xmm1=data4
-
-        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
-        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm7,xmm4             ; xmm4=tmp13
-        movdqa    xmm6,xmm4
-        punpcklwd xmm7,xmm0             ; xmm0=tmp12
-        punpckhwd xmm6,xmm0
-        movdqa    xmm4,xmm7
-        movdqa    xmm0,xmm6
-        pmaddwd   xmm7,[rel PW_F130_F054]       ; xmm7=data2L
-        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=data2H
-        pmaddwd   xmm4,[rel PW_F054_MF130]      ; xmm4=data6L
-        pmaddwd   xmm0,[rel PW_F054_MF130]      ; xmm0=data6H
-
-        paddd   xmm7,[rel PD_DESCALE_P1]
-        paddd   xmm6,[rel PD_DESCALE_P1]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        paddd   xmm0,[rel PD_DESCALE_P1]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm7,xmm6             ; xmm7=data2
-        packssdw  xmm4,xmm0             ; xmm4=data6
-
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
-
-        ; -- Odd part
-
-        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
-
-        movdqa  xmm6,xmm2               ; xmm2=tmp4
-        movdqa  xmm0,xmm5               ; xmm5=tmp5
-        paddw   xmm6,xmm3               ; xmm6=z3
-        paddw   xmm0,xmm1               ; xmm0=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm7,xmm6
-        movdqa    xmm4,xmm6
-        punpcklwd xmm7,xmm0
-        punpckhwd xmm4,xmm0
-        movdqa    xmm6,xmm7
-        movdqa    xmm0,xmm4
-        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3L
-        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3H
-        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4L
-        pmaddwd   xmm0,[rel PW_F117_F078]       ; xmm0=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm7,xmm2
-        movdqa    xmm4,xmm2
-        punpcklwd xmm7,xmm1
-        punpckhwd xmm4,xmm1
-        movdqa    xmm2,xmm7
-        movdqa    xmm1,xmm4
-        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp4L
-        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4H
-        pmaddwd   xmm2,[rel PW_MF089_F060]      ; xmm2=tmp7L
-        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp7H
-
-        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
-        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
-        paddd   xmm2,xmm6               ; xmm2=data1L
-        paddd   xmm1,xmm0               ; xmm1=data1H
-
-        paddd   xmm7,[rel PD_DESCALE_P1]
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm2,[rel PD_DESCALE_P1]
-        paddd   xmm1,[rel PD_DESCALE_P1]
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-
-        packssdw  xmm7,xmm4             ; xmm7=data7
-        packssdw  xmm2,xmm1             ; xmm2=data1
-
-        movdqa    xmm4,xmm5
-        movdqa    xmm1,xmm5
-        punpcklwd xmm4,xmm3
-        punpckhwd xmm1,xmm3
-        movdqa    xmm5,xmm4
-        movdqa    xmm3,xmm1
-        pmaddwd   xmm4,[rel PW_MF050_MF256]     ; xmm4=tmp5L
-        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5H
-        pmaddwd   xmm5,[rel PW_MF256_F050]      ; xmm5=tmp6L
-        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6H
-
-        paddd   xmm4,xmm6               ; xmm4=data5L
-        paddd   xmm1,xmm0               ; xmm1=data5H
-        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
-        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
-
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        paddd   xmm1,[rel PD_DESCALE_P1]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-        paddd   xmm5,[rel PD_DESCALE_P1]
-        paddd   xmm3,[rel PD_DESCALE_P1]
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-
-        packssdw  xmm4,xmm1             ; xmm4=data5
-        packssdw  xmm5,xmm3             ; xmm5=data3
-
-        ; ---- Pass 2: process columns.
-
-        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
-        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
-
-        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
-        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
-        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
-
-        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
-        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
-        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm7,xmm6
-        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
-        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
-        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
-
-        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
-        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm3,xmm4
-        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
-        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
-        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm6,xmm2
-        paddw   xmm7,xmm5               ; xmm7=tmp10
-        paddw   xmm2,xmm4               ; xmm2=tmp11
-        psubw   xmm1,xmm5               ; xmm1=tmp13
-        psubw   xmm6,xmm4               ; xmm6=tmp12
-
-        movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
-
-        paddw   xmm7,[rel PW_DESCALE_P2X]
-        paddw   xmm5,[rel PW_DESCALE_P2X]
-        psraw   xmm7,PASS1_BITS         ; xmm7=data0
-        psraw   xmm5,PASS1_BITS         ; xmm5=data4
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
-        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm4,xmm1             ; xmm1=tmp13
-        movdqa    xmm2,xmm1
-        punpcklwd xmm4,xmm6             ; xmm6=tmp12
-        punpckhwd xmm2,xmm6
-        movdqa    xmm1,xmm4
-        movdqa    xmm6,xmm2
-        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=data2L
-        pmaddwd   xmm2,[rel PW_F130_F054]       ; xmm2=data2H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=data6L
-        pmaddwd   xmm6,[rel PW_F054_MF130]      ; xmm6=data6H
-
-        paddd   xmm4,[rel PD_DESCALE_P2]
-        paddd   xmm2,[rel PD_DESCALE_P2]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        paddd   xmm6,[rel PD_DESCALE_P2]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm6,DESCALE_P2
-
-        packssdw  xmm4,xmm2             ; xmm4=data2
-        packssdw  xmm1,xmm6             ; xmm1=data6
-
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
-        ; -- Odd part
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        movdqa  xmm2,xmm0               ; xmm0=tmp4
-        movdqa  xmm6,xmm3               ; xmm3=tmp5
-        paddw   xmm2,xmm7               ; xmm2=z3
-        paddw   xmm6,xmm5               ; xmm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm1,xmm2
-        punpcklwd xmm4,xmm6
-        punpckhwd xmm1,xmm6
-        movdqa    xmm2,xmm4
-        movdqa    xmm6,xmm1
-        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3L
-        pmaddwd   xmm1,[rel PW_MF078_F117]      ; xmm1=z3H
-        pmaddwd   xmm2,[rel PW_F117_F078]       ; xmm2=z4L
-        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm1,xmm0
-        punpcklwd xmm4,xmm5
-        punpckhwd xmm1,xmm5
-        movdqa    xmm0,xmm4
-        movdqa    xmm5,xmm1
-        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4L
-        pmaddwd   xmm1,[rel PW_MF060_MF089]     ; xmm1=tmp4H
-        pmaddwd   xmm0,[rel PW_MF089_F060]      ; xmm0=tmp7L
-        pmaddwd   xmm5,[rel PW_MF089_F060]      ; xmm5=tmp7H
-
-        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
-        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
-        paddd   xmm0,xmm2               ; xmm0=data1L
-        paddd   xmm5,xmm6               ; xmm5=data1H
-
-        paddd   xmm4,[rel PD_DESCALE_P2]
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm0,[rel PD_DESCALE_P2]
-        paddd   xmm5,[rel PD_DESCALE_P2]
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-
-        packssdw  xmm4,xmm1             ; xmm4=data7
-        packssdw  xmm0,xmm5             ; xmm0=data1
-
-        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
-        movdqa    xmm1,xmm3
-        movdqa    xmm5,xmm3
-        punpcklwd xmm1,xmm7
-        punpckhwd xmm5,xmm7
-        movdqa    xmm3,xmm1
-        movdqa    xmm7,xmm5
-        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5L
-        pmaddwd   xmm5,[rel PW_MF050_MF256]     ; xmm5=tmp5H
-        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6L
-        pmaddwd   xmm7,[rel PW_MF256_F050]      ; xmm7=tmp6H
-
-        paddd   xmm1,xmm2               ; xmm1=data5L
-        paddd   xmm5,xmm6               ; xmm5=data5H
-        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
-        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
-
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        paddd   xmm5,[rel PD_DESCALE_P2]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-        paddd   xmm3,[rel PD_DESCALE_P2]
-        paddd   xmm7,[rel PD_DESCALE_P2]
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm1,xmm5             ; xmm1=data5
-        packssdw  xmm3,xmm7             ; xmm3=data3
-
-        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jfdctint-sse2.asm b/simd/jfdctint-sse2.asm
deleted file mode 100644
index db9d0bb..0000000
--- a/simd/jfdctint-sse2.asm
+++ /dev/null
@@ -1,633 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          6
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        paddw   xmm3,xmm1               ; xmm3=tmp10
-        paddw   xmm6,xmm7               ; xmm6=tmp11
-        psubw   xmm4,xmm1               ; xmm4=tmp13
-        psubw   xmm0,xmm7               ; xmm0=tmp12
-
-        movdqa  xmm1,xmm3
-        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
-        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
-
-        psllw   xmm3,PASS1_BITS         ; xmm3=data0
-        psllw   xmm1,PASS1_BITS         ; xmm1=data4
-
-        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
-        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm7,xmm4             ; xmm4=tmp13
-        movdqa    xmm6,xmm4
-        punpcklwd xmm7,xmm0             ; xmm0=tmp12
-        punpckhwd xmm6,xmm0
-        movdqa    xmm4,xmm7
-        movdqa    xmm0,xmm6
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]       ; xmm7=data2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=data2H
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm4=data6L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm0=data6H
-
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm7,xmm6             ; xmm7=data2
-        packssdw  xmm4,xmm0             ; xmm4=data6
-
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
-
-        ; -- Odd part
-
-        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
-
-        movdqa  xmm6,xmm2               ; xmm2=tmp4
-        movdqa  xmm0,xmm5               ; xmm5=tmp5
-        paddw   xmm6,xmm3               ; xmm6=z3
-        paddw   xmm0,xmm1               ; xmm0=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm7,xmm6
-        movdqa    xmm4,xmm6
-        punpcklwd xmm7,xmm0
-        punpckhwd xmm4,xmm0
-        movdqa    xmm6,xmm7
-        movdqa    xmm0,xmm4
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3H
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]       ; xmm0=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm7,xmm2
-        movdqa    xmm4,xmm2
-        punpcklwd xmm7,xmm1
-        punpckhwd xmm4,xmm1
-        movdqa    xmm2,xmm7
-        movdqa    xmm1,xmm4
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp4L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm2=tmp7L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp7H
-
-        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
-        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
-        paddd   xmm2,xmm6               ; xmm2=data1L
-        paddd   xmm1,xmm0               ; xmm1=data1H
-
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-
-        packssdw  xmm7,xmm4             ; xmm7=data7
-        packssdw  xmm2,xmm1             ; xmm2=data1
-
-        movdqa    xmm4,xmm5
-        movdqa    xmm1,xmm5
-        punpcklwd xmm4,xmm3
-        punpckhwd xmm1,xmm3
-        movdqa    xmm5,xmm4
-        movdqa    xmm3,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm4=tmp5L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm5=tmp6L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6H
-
-        paddd   xmm4,xmm6               ; xmm4=data5L
-        paddd   xmm1,xmm0               ; xmm1=data5H
-        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
-        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-
-        packssdw  xmm4,xmm1             ; xmm4=data5
-        packssdw  xmm5,xmm3             ; xmm5=data3
-
-        ; ---- Pass 2: process columns.
-
-;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
-        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
-
-        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
-        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
-        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
-
-        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
-        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
-        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm7,xmm6
-        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
-        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
-        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
-
-        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
-        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm3,xmm4
-        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
-        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
-        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm6,xmm2
-        paddw   xmm7,xmm5               ; xmm7=tmp10
-        paddw   xmm2,xmm4               ; xmm2=tmp11
-        psubw   xmm1,xmm5               ; xmm1=tmp13
-        psubw   xmm6,xmm4               ; xmm6=tmp12
-
-        movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
-
-        paddw   xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        psraw   xmm7,PASS1_BITS         ; xmm7=data0
-        psraw   xmm5,PASS1_BITS         ; xmm5=data4
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
-        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm4,xmm1             ; xmm1=tmp13
-        movdqa    xmm2,xmm1
-        punpcklwd xmm4,xmm6             ; xmm6=tmp12
-        punpckhwd xmm2,xmm6
-        movdqa    xmm1,xmm4
-        movdqa    xmm6,xmm2
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=data2L
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]       ; xmm2=data2H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=data6L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm6=data6H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm6,DESCALE_P2
-
-        packssdw  xmm4,xmm2             ; xmm4=data2
-        packssdw  xmm1,xmm6             ; xmm1=data6
-
-        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
-        ; -- Odd part
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        movdqa  xmm2,xmm0               ; xmm0=tmp4
-        movdqa  xmm6,xmm3               ; xmm3=tmp5
-        paddw   xmm2,xmm7               ; xmm2=z3
-        paddw   xmm6,xmm5               ; xmm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm1,xmm2
-        punpcklwd xmm4,xmm6
-        punpckhwd xmm1,xmm6
-        movdqa    xmm2,xmm4
-        movdqa    xmm6,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm1=z3H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]       ; xmm2=z4L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm1,xmm0
-        punpcklwd xmm4,xmm5
-        punpckhwd xmm1,xmm5
-        movdqa    xmm0,xmm4
-        movdqa    xmm5,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm1=tmp4H
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm0=tmp7L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm5=tmp7H
-
-        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
-        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
-        paddd   xmm0,xmm2               ; xmm0=data1L
-        paddd   xmm5,xmm6               ; xmm5=data1H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-
-        packssdw  xmm4,xmm1             ; xmm4=data7
-        packssdw  xmm0,xmm5             ; xmm0=data1
-
-        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
-        movdqa    xmm1,xmm3
-        movdqa    xmm5,xmm3
-        punpcklwd xmm1,xmm7
-        punpckhwd xmm5,xmm7
-        movdqa    xmm3,xmm1
-        movdqa    xmm7,xmm5
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm5=tmp5H
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm7=tmp6H
-
-        paddd   xmm1,xmm2               ; xmm1=data5L
-        paddd   xmm5,xmm6               ; xmm5=data5H
-        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
-        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
-
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm1,xmm5             ; xmm1=data5
-        packssdw  xmm3,xmm7             ; xmm3=data3
-
-        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctflt-3dn.asm b/simd/jidctflt-3dn.asm
deleted file mode 100644
index 99356f2..0000000
--- a/simd/jidctflt-3dn.asm
+++ /dev/null
@@ -1,451 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_3dnow)
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414        times 2 dd  1.414213562373095048801689
-PD_1_847        times 2 dd  1.847759065022573512256366
-PD_1_082        times 2 dd  1.082392200292393968799446
-PD_2_613        times 2 dd  2.613125929752753055713286
-PD_RNDINT_MAGIC times 2 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
-;                         JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_3dnow)
-
-EXTN(jsimd_idct_float_3dnow):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/2                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        pushpic ebx             ; save GOT address
-        mov     ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        mov     eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        or      ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        or      ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        or      eax,ebx
-        poppic  ebx             ; restore GOT address
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm0,mm0
-        psrad     mm0,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm0,mm0
-
-        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0
-        punpckhdq mm1,mm1
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm0,mm0
-        punpcklwd mm1,mm1
-        psrad     mm0,(DWORD_BIT-WORD_BIT)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm0,mm0
-        pi2fd     mm1,mm1
-
-        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        punpcklwd mm2,mm2
-        punpcklwd mm3,mm3
-        psrad     mm2,(DWORD_BIT-WORD_BIT)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm2,mm2
-        pi2fd     mm3,mm3
-
-        pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pfsub   mm0,mm2                 ; mm0=tmp11
-        pfsub   mm1,mm3
-        pfadd   mm4,mm2                 ; mm4=tmp10
-        pfadd   mm5,mm3                 ; mm5=tmp13
-
-        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
-        pfsub   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm5                 ; mm4=tmp3
-        pfsub   mm0,mm1                 ; mm0=tmp2
-        pfadd   mm6,mm5                 ; mm6=tmp0
-        pfadd   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; tmp3
-        movq    MMWORD [wk(0)], mm0     ; tmp2
-
-        ; -- Odd part
-
-        movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm2,mm2
-        punpcklwd mm3,mm3
-        psrad     mm2,(DWORD_BIT-WORD_BIT)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm2,mm2
-        pi2fd     mm3,mm3
-
-        pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        punpcklwd mm5,mm5
-        punpcklwd mm1,mm1
-        psrad     mm5,(DWORD_BIT-WORD_BIT)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm5,mm5
-        pi2fd     mm1,mm1
-
-        pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        pfadd   mm2,mm1                 ; mm2=z11
-        pfadd   mm5,mm3                 ; mm5=z13
-        pfsub   mm4,mm1                 ; mm4=z12
-        pfsub   mm0,mm3                 ; mm0=z10
-
-        movq    mm1,mm2
-        pfsub   mm2,mm5
-        pfadd   mm1,mm5                 ; mm1=tmp7
-
-        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
-
-        movq    mm3,mm0
-        pfadd   mm0,mm4
-        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
-        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
-        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
-        pfsubr  mm3,mm0                 ; mm3=tmp12
-        pfsub   mm4,mm0                 ; mm4=tmp10
-
-        ; -- Final output stage
-
-        pfsub   mm3,mm1                 ; mm3=tmp6
-        movq    mm5,mm6
-        movq    mm0,mm7
-        pfadd   mm6,mm1                 ; mm6=data0=(00 01)
-        pfadd   mm7,mm3                 ; mm7=data1=(10 11)
-        pfsub   mm5,mm1                 ; mm5=data7=(70 71)
-        pfsub   mm0,mm3                 ; mm0=data6=(60 61)
-        pfsub   mm2,mm3                 ; mm2=tmp5
-
-        movq      mm1,mm6               ; transpose coefficients
-        punpckldq mm6,mm7               ; mm6=(00 10)
-        punpckhdq mm1,mm7               ; mm1=(01 11)
-        movq      mm3,mm0               ; transpose coefficients
-        punpckldq mm0,mm5               ; mm0=(60 70)
-        punpckhdq mm3,mm5               ; mm3=(61 71)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
-        movq    mm5, MMWORD [wk(1)]     ; mm5=tmp3
-
-        pfadd   mm4,mm2                 ; mm4=tmp4
-        movq    mm6,mm7
-        movq    mm1,mm5
-        pfadd   mm7,mm2                 ; mm7=data2=(20 21)
-        pfadd   mm5,mm4                 ; mm5=data4=(40 41)
-        pfsub   mm6,mm2                 ; mm6=data5=(50 51)
-        pfsub   mm1,mm4                 ; mm1=data3=(30 31)
-
-        movq      mm0,mm7               ; transpose coefficients
-        punpckldq mm7,mm1               ; mm7=(20 30)
-        punpckhdq mm0,mm1               ; mm0=(21 31)
-        movq      mm3,mm5               ; transpose coefficients
-        punpckldq mm5,mm6               ; mm5=(40 50)
-        punpckhdq mm3,mm6               ; mm3=(41 51)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
-        add     esi, byte 2*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 2*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/2                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pfsub   mm0,mm2                 ; mm0=tmp11
-        pfsub   mm1,mm3
-        pfadd   mm4,mm2                 ; mm4=tmp10
-        pfadd   mm5,mm3                 ; mm5=tmp13
-
-        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
-        pfsub   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm5                 ; mm4=tmp3
-        pfsub   mm0,mm1                 ; mm0=tmp2
-        pfadd   mm6,mm5                 ; mm6=tmp0
-        pfadd   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; tmp3
-        movq    MMWORD [wk(0)], mm0     ; tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        pfadd   mm2,mm1                 ; mm2=z11
-        pfadd   mm5,mm3                 ; mm5=z13
-        pfsub   mm4,mm1                 ; mm4=z12
-        pfsub   mm0,mm3                 ; mm0=z10
-
-        movq    mm1,mm2
-        pfsub   mm2,mm5
-        pfadd   mm1,mm5                 ; mm1=tmp7
-
-        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
-
-        movq    mm3,mm0
-        pfadd   mm0,mm4
-        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
-        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
-        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
-        pfsubr  mm3,mm0                 ; mm3=tmp12
-        pfsub   mm4,mm0                 ; mm4=tmp10
-
-        ; -- Final output stage
-
-        pfsub   mm3,mm1                 ; mm3=tmp6
-        movq    mm5,mm6
-        movq    mm0,mm7
-        pfadd   mm6,mm1                 ; mm6=data0=(00 10)
-        pfadd   mm7,mm3                 ; mm7=data1=(01 11)
-        pfsub   mm5,mm1                 ; mm5=data7=(07 17)
-        pfsub   mm0,mm3                 ; mm0=data6=(06 16)
-        pfsub   mm2,mm3                 ; mm2=tmp5
-
-        movq    mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm1=[PD_RNDINT_MAGIC]
-        pcmpeqd mm3,mm3
-        psrld   mm3,WORD_BIT            ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
-        pfadd   mm6,mm1                 ; mm6=roundint(data0/8)=(00 ** 10 **)
-        pfadd   mm7,mm1                 ; mm7=roundint(data1/8)=(01 ** 11 **)
-        pfadd   mm0,mm1                 ; mm0=roundint(data6/8)=(06 ** 16 **)
-        pfadd   mm5,mm1                 ; mm5=roundint(data7/8)=(07 ** 17 **)
-
-        pand    mm6,mm3                 ; mm6=(00 -- 10 --)
-        pslld   mm7,WORD_BIT            ; mm7=(-- 01 -- 11)
-        pand    mm0,mm3                 ; mm0=(06 -- 16 --)
-        pslld   mm5,WORD_BIT            ; mm5=(-- 07 -- 17)
-        por     mm6,mm7                 ; mm6=(00 01 10 11)
-        por     mm0,mm5                 ; mm0=(06 07 16 17)
-
-        movq    mm1, MMWORD [wk(0)]     ; mm1=tmp2
-        movq    mm3, MMWORD [wk(1)]     ; mm3=tmp3
-
-        pfadd   mm4,mm2                 ; mm4=tmp4
-        movq    mm7,mm1
-        movq    mm5,mm3
-        pfadd   mm1,mm2                 ; mm1=data2=(02 12)
-        pfadd   mm3,mm4                 ; mm3=data4=(04 14)
-        pfsub   mm7,mm2                 ; mm7=data5=(05 15)
-        pfsub   mm5,mm4                 ; mm5=data3=(03 13)
-
-        movq    mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm2=[PD_RNDINT_MAGIC]
-        pcmpeqd mm4,mm4
-        psrld   mm4,WORD_BIT            ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
-        pfadd   mm3,mm2                 ; mm3=roundint(data4/8)=(04 ** 14 **)
-        pfadd   mm7,mm2                 ; mm7=roundint(data5/8)=(05 ** 15 **)
-        pfadd   mm1,mm2                 ; mm1=roundint(data2/8)=(02 ** 12 **)
-        pfadd   mm5,mm2                 ; mm5=roundint(data3/8)=(03 ** 13 **)
-
-        pand    mm3,mm4                 ; mm3=(04 -- 14 --)
-        pslld   mm7,WORD_BIT            ; mm7=(-- 05 -- 15)
-        pand    mm1,mm4                 ; mm1=(02 -- 12 --)
-        pslld   mm5,WORD_BIT            ; mm5=(-- 03 -- 13)
-        por     mm3,mm7                 ; mm3=(04 05 14 15)
-        por     mm1,mm5                 ; mm1=(02 03 12 13)
-
-        movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm2=[PB_CENTERJSAMP]
-
-        packsswb  mm6,mm3               ; mm6=(00 01 10 11 04 05 14 15)
-        packsswb  mm1,mm0               ; mm1=(02 03 12 13 06 07 16 17)
-        paddb     mm6,mm2
-        paddb     mm1,mm2
-
-        movq      mm4,mm6               ; transpose coefficients(phase 2)
-        punpcklwd mm6,mm1               ; mm6=(00 01 02 03 10 11 12 13)
-        punpckhwd mm4,mm1               ; mm4=(04 05 06 07 14 15 16 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 3)
-        punpckldq mm6,mm4               ; mm6=(00 01 02 03 04 05 06 07)
-        punpckhdq mm7,mm4               ; mm7=(10 11 12 13 14 15 16 17)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 2*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 2*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctflt-sse.asm b/simd/jidctflt-sse.asm
deleted file mode 100644
index 4d4af2f..0000000
--- a/simd/jidctflt-sse.asm
+++ /dev/null
@@ -1,571 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse)
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_0_125        times 4 dd  0.125       ; 1/8
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse)
-
-EXTN(jsimd_idct_float_sse):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm1,mm0                       ; mm1=(** 02 ** 03)
-        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in0H=(02 03)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
-        cvtpi2ps  xmm3,mm1                      ; xmm3=(02 03 ** **)
-        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
-        movlhps   xmm0,xmm3                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm4,mm0                       ; mm4=(** 02 ** 03)
-        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
-        punpckhwd mm5,mm1                       ; mm5=(** 22 ** 23)
-        punpcklwd mm1,mm1                       ; mm1=(20 20 21 21)
-
-        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in0H=(02 03)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
-        cvtpi2ps  xmm4,mm4                      ; xmm4=(02 03 ** **)
-        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
-        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in2H=(22 23)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in2L=(20 21)
-        cvtpi2ps  xmm5,mm5                      ; xmm5=(22 23 ** **)
-        cvtpi2ps  xmm1,mm1                      ; xmm1=(20 21 ** **)
-
-        punpckhwd mm6,mm2                       ; mm6=(** 42 ** 43)
-        punpcklwd mm2,mm2                       ; mm2=(40 40 41 41)
-        punpckhwd mm7,mm3                       ; mm7=(** 62 ** 63)
-        punpcklwd mm3,mm3                       ; mm3=(60 60 61 61)
-
-        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in4H=(42 43)
-        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in4L=(40 41)
-        cvtpi2ps  xmm6,mm6                      ; xmm6=(42 43 ** **)
-        cvtpi2ps  xmm2,mm2                      ; xmm2=(40 41 ** **)
-        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in6H=(62 63)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in6L=(60 61)
-        cvtpi2ps  xmm7,mm7                      ; xmm7=(62 63 ** **)
-        cvtpi2ps  xmm3,mm3                      ; xmm3=(60 61 ** **)
-
-        movlhps   xmm0,xmm4                     ; xmm0=in0=(00 01 02 03)
-        movlhps   xmm1,xmm5                     ; xmm1=in2=(20 21 22 23)
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movlhps   xmm2,xmm6                     ; xmm2=in4=(40 41 42 43)
-        movlhps   xmm3,xmm7                     ; xmm3=in6=(60 61 62 63)
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm6,mm4                       ; mm6=(** 12 ** 13)
-        punpcklwd mm4,mm4                       ; mm4=(10 10 11 11)
-        punpckhwd mm2,mm0                       ; mm2=(** 32 ** 33)
-        punpcklwd mm0,mm0                       ; mm0=(30 30 31 31)
-
-        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in1H=(12 13)
-        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in1L=(10 11)
-        cvtpi2ps  xmm4,mm6                      ; xmm4=(12 13 ** **)
-        cvtpi2ps  xmm2,mm4                      ; xmm2=(10 11 ** **)
-        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in3H=(32 33)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in3L=(30 31)
-        cvtpi2ps  xmm0,mm2                      ; xmm0=(32 33 ** **)
-        cvtpi2ps  xmm3,mm0                      ; xmm3=(30 31 ** **)
-
-        punpckhwd mm7,mm5                       ; mm7=(** 52 ** 53)
-        punpcklwd mm5,mm5                       ; mm5=(50 50 51 51)
-        punpckhwd mm3,mm1                       ; mm3=(** 72 ** 73)
-        punpcklwd mm1,mm1                       ; mm1=(70 70 71 71)
-
-        movlhps   xmm2,xmm4                     ; xmm2=in1=(10 11 12 13)
-        movlhps   xmm3,xmm0                     ; xmm3=in3=(30 31 32 33)
-
-        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in5H=(52 53)
-        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in5L=(50 51)
-        cvtpi2ps  xmm4,mm7                      ; xmm4=(52 53 ** **)
-        cvtpi2ps  xmm5,mm5                      ; xmm5=(50 51 ** **)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in7H=(72 73)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in7L=(70 71)
-        cvtpi2ps  xmm0,mm3                      ; xmm0=(72 73 ** **)
-        cvtpi2ps  xmm1,mm1                      ; xmm1=(70 71 ** **)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movlhps   xmm5,xmm4                     ; xmm5=in5=(50 51 52 53)
-        movlhps   xmm1,xmm0                     ; xmm1=in7=(70 71 72 73)
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[GOTOFF(ebx,PD_0_125)]     ; xmm1=[PD_0_125]
-
-        mulps   xmm6,xmm1               ; descale(1/8)
-        mulps   xmm7,xmm1               ; descale(1/8)
-        mulps   xmm5,xmm1               ; descale(1/8)
-        mulps   xmm0,xmm1               ; descale(1/8)
-
-        movhlps   xmm3,xmm6
-        movhlps   xmm1,xmm7
-        cvtps2pi  mm0,xmm6              ; round to int32, mm0=data0L=(00 10)
-        cvtps2pi  mm1,xmm7              ; round to int32, mm1=data1L=(01 11)
-        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data0H=(20 30)
-        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data1H=(21 31)
-        packssdw  mm0,mm2               ; mm0=data0=(00 10 20 30)
-        packssdw  mm1,mm3               ; mm1=data1=(01 11 21 31)
-
-        movhlps   xmm6,xmm5
-        movhlps   xmm7,xmm0
-        cvtps2pi  mm4,xmm5              ; round to int32, mm4=data7L=(07 17)
-        cvtps2pi  mm5,xmm0              ; round to int32, mm5=data6L=(06 16)
-        cvtps2pi  mm6,xmm6              ; round to int32, mm6=data7H=(27 37)
-        cvtps2pi  mm7,xmm7              ; round to int32, mm7=data6H=(26 36)
-        packssdw  mm4,mm6               ; mm4=data7=(07 17 27 37)
-        packssdw  mm5,mm7               ; mm5=data6=(06 16 26 36)
-
-        packsswb  mm0,mm5               ; mm0=(00 10 20 30 06 16 26 36)
-        packsswb  mm1,mm4               ; mm1=(01 11 21 31 07 17 27 37)
-
-        movaps  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
-        movaps  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movaps  xmm6,[GOTOFF(ebx,PD_0_125)]     ; xmm6=[PD_0_125]
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm5,xmm3
-        movaps  xmm0,xmm1
-        addps   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-        addps   xmm1,xmm4               ; xmm1=data4=(04 14 24 34)
-        subps   xmm5,xmm2               ; xmm5=data5=(05 15 25 35)
-        subps   xmm0,xmm4               ; xmm0=data3=(03 13 23 33)
-
-        mulps   xmm3,xmm6               ; descale(1/8)
-        mulps   xmm1,xmm6               ; descale(1/8)
-        mulps   xmm5,xmm6               ; descale(1/8)
-        mulps   xmm0,xmm6               ; descale(1/8)
-
-        movhlps   xmm7,xmm3
-        movhlps   xmm2,xmm1
-        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data2L=(02 12)
-        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data4L=(04 14)
-        cvtps2pi  mm6,xmm7              ; round to int32, mm6=data2H=(22 32)
-        cvtps2pi  mm7,xmm2              ; round to int32, mm7=data4H=(24 34)
-        packssdw  mm2,mm6               ; mm2=data2=(02 12 22 32)
-        packssdw  mm3,mm7               ; mm3=data4=(04 14 24 34)
-
-        movhlps   xmm4,xmm5
-        movhlps   xmm6,xmm0
-        cvtps2pi  mm5,xmm5              ; round to int32, mm5=data5L=(05 15)
-        cvtps2pi  mm4,xmm0              ; round to int32, mm4=data3L=(03 13)
-        cvtps2pi  mm6,xmm4              ; round to int32, mm6=data5H=(25 35)
-        cvtps2pi  mm7,xmm6              ; round to int32, mm7=data3H=(23 33)
-        packssdw  mm5,mm6               ; mm5=data5=(05 15 25 35)
-        packssdw  mm4,mm7               ; mm4=data3=(03 13 23 33)
-
-        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
-
-        packsswb  mm2,mm3               ; mm2=(02 12 22 32 04 14 24 34)
-        packsswb  mm4,mm5               ; mm4=(03 13 23 33 05 15 25 35)
-
-        paddb     mm0,mm6
-        paddb     mm1,mm6
-        paddb     mm2,mm6
-        paddb     mm4,mm6
-
-        movq      mm7,mm0               ; transpose coefficients(phase 1)
-        punpcklbw mm0,mm1               ; mm0=(00 01 10 11 20 21 30 31)
-        punpckhbw mm7,mm1               ; mm7=(06 07 16 17 26 27 36 37)
-        movq      mm3,mm2               ; transpose coefficients(phase 1)
-        punpcklbw mm2,mm4               ; mm2=(02 03 12 13 22 23 32 33)
-        punpckhbw mm3,mm4               ; mm3=(04 05 14 15 24 25 34 35)
-
-        movq      mm5,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm2               ; mm0=(00 01 02 03 10 11 12 13)
-        punpckhwd mm5,mm2               ; mm5=(20 21 22 23 30 31 32 33)
-        movq      mm6,mm3               ; transpose coefficients(phase 2)
-        punpcklwd mm3,mm7               ; mm3=(04 05 06 07 14 15 16 17)
-        punpckhwd mm6,mm7               ; mm6=(24 25 26 27 34 35 36 37)
-
-        movq      mm1,mm0               ; transpose coefficients(phase 3)
-        punpckldq mm0,mm3               ; mm0=(00 01 02 03 04 05 06 07)
-        punpckhdq mm1,mm3               ; mm1=(10 11 12 13 14 15 16 17)
-        movq      mm4,mm5               ; transpose coefficients(phase 3)
-        punpckldq mm5,mm6               ; mm5=(20 21 22 23 24 25 26 27)
-        punpckhdq mm4,mm6               ; mm4=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm
deleted file mode 100644
index bdda05d..0000000
--- a/simd/jidctflt-sse2-64.asm
+++ /dev/null
@@ -1,482 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [workspace]
-        collect_args
-        push    rbx
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-        lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     rcx, DCTSIZE/4                          ; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm2
-        por     xmm3,xmm4
-        por     xmm5,xmm6
-        por     xmm1,xmm3
-        por     xmm5,xmm7
-        por     xmm1,xmm5
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
-
-        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
-        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
-
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[rel PD_1_414]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
-        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
-
-        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
-        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
-        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
-        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
-        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     rcx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]
-        lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-        mov     rcx, DCTSIZE/4                          ; ctr
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[rel PD_1_414]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
-        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
-        pcmpeqd xmm3,xmm3
-        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
-        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
-        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
-        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
-
-        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm7,xmm1
-        movaps  xmm5,xmm3
-        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
-        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
-        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
-        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
-
-        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
-        pcmpeqd xmm4,xmm4
-        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
-        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
-        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
-        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
-
-        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-        paddb     xmm6,xmm2
-        paddb     xmm1,xmm2
-
-        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
-        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
-        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     rdi, byte 4*SIZEOF_JSAMPROW
-        dec     rcx                             ; ctr
-        jnz     near .rowloop
-
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctflt-sse2.asm b/simd/jidctflt-sse2.asm
deleted file mode 100644
index a15a9c1..0000000
--- a/simd/jidctflt-sse2.asm
+++ /dev/null
@@ -1,497 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm2
-        por     xmm3,xmm4
-        por     xmm5,xmm6
-        por     xmm1,xmm3
-        por     xmm5,xmm7
-        por     xmm1,xmm5
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
-
-        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
-        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
-
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
-        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
-
-        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
-        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
-        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
-        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
-        pcmpeqd xmm3,xmm3
-        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
-        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
-        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
-        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
-
-        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm7,xmm1
-        movaps  xmm5,xmm3
-        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
-        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
-        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
-        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
-
-        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
-        pcmpeqd xmm4,xmm4
-        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
-        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
-        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
-        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
-
-        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-        paddb     xmm6,xmm2
-        paddb     xmm1,xmm2
-
-        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
-        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctfst-mmx.asm b/simd/jidctfst-mmx.asm
deleted file mode 100644
index 6e95bfb..0000000
--- a/simd/jidctfst-mmx.asm
+++ /dev/null
@@ -1,499 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-%define PASS1_BITS      2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200)
-F_1_414 equ     362             ; FIX(1.414213562)
-F_1_847 equ     473             ; FIX(1.847759065)
-F_2_613 equ     669             ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_mmx)
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414        times 4 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 4 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082        times 4 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_mmx)
-
-EXTN(jsimd_idct_ifast_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        psubw   mm0,mm2                 ; mm0=tmp11
-        psubw   mm1,mm3
-        paddw   mm4,mm2                 ; mm4=tmp10
-        paddw   mm5,mm3                 ; mm5=tmp13
-
-        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        psubw   mm4,mm5                 ; mm4=tmp3
-        psubw   mm0,mm1                 ; mm0=tmp2
-        paddw   mm6,mm5                 ; mm6=tmp0
-        paddw   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
-        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        psubw   mm2,mm1                 ; mm2=z12
-        psubw   mm5,mm3                 ; mm5=z10
-        paddw   mm4,mm1                 ; mm4=z11
-        paddw   mm0,mm3                 ; mm0=z13
-
-        movq    mm1,mm5                 ; mm1=z10(unscaled)
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
-
-        movq    mm3,mm4
-        psubw   mm4,mm0
-        paddw   mm3,mm0                 ; mm3=tmp7
-
-        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movq    mm0,mm5
-        paddw   mm5,mm2
-        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
-        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   mm0,mm1
-        psubw   mm2,mm5                 ; mm2=tmp10
-        paddw   mm0,mm5                 ; mm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   mm0,mm3                 ; mm0=tmp6
-        movq    mm1,mm6
-        movq    mm5,mm7
-        paddw   mm6,mm3                 ; mm6=data0=(00 01 02 03)
-        paddw   mm7,mm0                 ; mm7=data1=(10 11 12 13)
-        psubw   mm1,mm3                 ; mm1=data7=(70 71 72 73)
-        psubw   mm5,mm0                 ; mm5=data6=(60 61 62 63)
-        psubw   mm4,mm0                 ; mm4=tmp5
-
-        movq      mm3,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm3,mm7               ; mm3=(02 12 03 13)
-        movq      mm0,mm5               ; transpose coefficients(phase 1)
-        punpcklwd mm5,mm1               ; mm5=(60 70 61 71)
-        punpckhwd mm0,mm1               ; mm0=(62 72 63 73)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
-        movq    mm1, MMWORD [wk(1)]     ; mm1=tmp3
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
-
-        paddw   mm2,mm4                 ; mm2=tmp4
-        movq    mm5,mm7
-        movq    mm0,mm1
-        paddw   mm7,mm4                 ; mm7=data2=(20 21 22 23)
-        paddw   mm1,mm2                 ; mm1=data4=(40 41 42 43)
-        psubw   mm5,mm4                 ; mm5=data5=(50 51 52 53)
-        psubw   mm0,mm2                 ; mm0=data3=(30 31 32 33)
-
-        movq      mm4,mm7               ; transpose coefficients(phase 1)
-        punpcklwd mm7,mm0               ; mm7=(20 30 21 31)
-        punpckhwd mm4,mm0               ; mm4=(22 32 23 33)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm5               ; mm1=(40 50 41 51)
-        punpckhwd mm2,mm5               ; mm2=(42 52 43 53)
-
-        movq      mm0,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm7               ; mm6=(00 10 20 30)
-        punpckhdq mm0,mm7               ; mm0=(01 11 21 31)
-        movq      mm5,mm3               ; transpose coefficients(phase 2)
-        punpckldq mm3,mm4               ; mm3=(02 12 22 32)
-        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
-        movq    mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm7               ; mm1=(40 50 60 70)
-        punpckhdq mm6,mm7               ; mm6=(41 51 61 71)
-        movq      mm0,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm4               ; mm2=(42 52 62 72)
-        punpckhdq mm0,mm4               ; mm0=(43 53 63 73)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_IFAST_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        psubw   mm0,mm2                 ; mm0=tmp11
-        psubw   mm1,mm3
-        paddw   mm4,mm2                 ; mm4=tmp10
-        paddw   mm5,mm3                 ; mm5=tmp13
-
-        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        psubw   mm4,mm5                 ; mm4=tmp3
-        psubw   mm0,mm1                 ; mm0=tmp2
-        paddw   mm6,mm5                 ; mm6=tmp0
-        paddw   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
-        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        psubw   mm2,mm1                 ; mm2=z12
-        psubw   mm5,mm3                 ; mm5=z10
-        paddw   mm4,mm1                 ; mm4=z11
-        paddw   mm0,mm3                 ; mm0=z13
-
-        movq    mm1,mm5                 ; mm1=z10(unscaled)
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
-
-        movq    mm3,mm4
-        psubw   mm4,mm0
-        paddw   mm3,mm0                 ; mm3=tmp7
-
-        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movq    mm0,mm5
-        paddw   mm5,mm2
-        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
-        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   mm0,mm1
-        psubw   mm2,mm5                 ; mm2=tmp10
-        paddw   mm0,mm5                 ; mm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   mm0,mm3                 ; mm0=tmp6
-        movq    mm1,mm6
-        movq    mm5,mm7
-        paddw   mm6,mm3                 ; mm6=data0=(00 10 20 30)
-        paddw   mm7,mm0                 ; mm7=data1=(01 11 21 31)
-        psraw   mm6,(PASS1_BITS+3)      ; descale
-        psraw   mm7,(PASS1_BITS+3)      ; descale
-        psubw   mm1,mm3                 ; mm1=data7=(07 17 27 37)
-        psubw   mm5,mm0                 ; mm5=data6=(06 16 26 36)
-        psraw   mm1,(PASS1_BITS+3)      ; descale
-        psraw   mm5,(PASS1_BITS+3)      ; descale
-        psubw   mm4,mm0                 ; mm4=tmp5
-
-        packsswb  mm6,mm5               ; mm6=(00 10 20 30 06 16 26 36)
-        packsswb  mm7,mm1               ; mm7=(01 11 21 31 07 17 27 37)
-
-        movq    mm3, MMWORD [wk(0)]     ; mm3=tmp2
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp3
-
-        paddw   mm2,mm4                 ; mm2=tmp4
-        movq    mm5,mm3
-        movq    mm1,mm0
-        paddw   mm3,mm4                 ; mm3=data2=(02 12 22 32)
-        paddw   mm0,mm2                 ; mm0=data4=(04 14 24 34)
-        psraw   mm3,(PASS1_BITS+3)      ; descale
-        psraw   mm0,(PASS1_BITS+3)      ; descale
-        psubw   mm5,mm4                 ; mm5=data5=(05 15 25 35)
-        psubw   mm1,mm2                 ; mm1=data3=(03 13 23 33)
-        psraw   mm5,(PASS1_BITS+3)      ; descale
-        psraw   mm1,(PASS1_BITS+3)      ; descale
-
-        movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm4=[PB_CENTERJSAMP]
-
-        packsswb  mm3,mm0               ; mm3=(02 12 22 32 04 14 24 34)
-        packsswb  mm1,mm5               ; mm1=(03 13 23 33 05 15 25 35)
-
-        paddb     mm6,mm4
-        paddb     mm7,mm4
-        paddb     mm3,mm4
-        paddb     mm1,mm4
-
-        movq      mm2,mm6               ; transpose coefficients(phase 1)
-        punpcklbw mm6,mm7               ; mm6=(00 01 10 11 20 21 30 31)
-        punpckhbw mm2,mm7               ; mm2=(06 07 16 17 26 27 36 37)
-        movq      mm0,mm3               ; transpose coefficients(phase 1)
-        punpcklbw mm3,mm1               ; mm3=(02 03 12 13 22 23 32 33)
-        punpckhbw mm0,mm1               ; mm0=(04 05 14 15 24 25 34 35)
-
-        movq      mm5,mm6               ; transpose coefficients(phase 2)
-        punpcklwd mm6,mm3               ; mm6=(00 01 02 03 10 11 12 13)
-        punpckhwd mm5,mm3               ; mm5=(20 21 22 23 30 31 32 33)
-        movq      mm4,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm2               ; mm0=(04 05 06 07 14 15 16 17)
-        punpckhwd mm4,mm2               ; mm4=(24 25 26 27 34 35 36 37)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 3)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03 04 05 06 07)
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13 14 15 16 17)
-        movq      mm1,mm5               ; transpose coefficients(phase 3)
-        punpckldq mm5,mm4               ; mm5=(20 21 22 23 24 25 26 27)
-        punpckhdq mm1,mm4               ; mm1=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm
deleted file mode 100644
index 4884642..0000000
--- a/simd/jidctfst-sse2-64.asm
+++ /dev/null
@@ -1,491 +0,0 @@
-;
-; jidctfst-sse2-64.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler); it
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; fractional bits of the F_* constants; 14 is also OK.
-%define PASS1_BITS      2       ; must equal IFAST_SCALE_BITS (enforced just below)
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200) = round(1.082392200 * 2^8)
-F_1_414 equ     362             ; FIX(1.414213562) = round(1.414213562 * 2^8)
-F_1_847 equ     473             ; FIX(1.847759065) = round(1.847759065 * 2^8)
-F_2_613 equ     669             ; FIX(2.613125930) = round(2.613125930 * 2^8)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1), where FIX(1) = 2^8
-%else
-; NASM cannot do compile-time floating-point arithmetic, so the literals below are pre-scaled by 2^30.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))         ; right shift by n with round-to-nearest
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-; (pmulhw keeps only the high 16 bits of each product, hence the total of 16)
-%define PRE_MULTIPLY_SCALE_BITS   2     ; left shift applied to an operand before each pmulhw
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)     ; = 6 when CONST_BITS == 8
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-; Each PW_* entry repeats one 16-bit multiplier in all 8 word lanes of an XMM register.
-PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT      ; negated; used for the tmp12 overflow workaround
-PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE               ; byte-wise bias added with paddb in pass 2
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):            ; dequantize one coefficient block, run the fast IDCT, store 8 sample rows
-        push    rbp
-        mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
-        sub     rsp, byte 4                     ; move below the saved rbp before aligning down
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax                       ; stash the unaligned rsp; the epilogue's "pop rsp" restores it
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]                    ; reserve the WK_NUM xmmword scratch slots (wk[]) below rbp
-        collect_args                            ; macro from an include; presumably loads the args into r10-r13 -- confirm
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-        ; Fast path: if every AC coefficient is zero, pass 1 reduces to broadcasting the dequantized row 0.
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT         ; cheap first check on rows 1 and 2 (presumably their low dwords)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0               ; xmm1 = bitwise OR of rows 1..7
-        packsswb xmm1,xmm1              ; 8 words -> 8 bytes; signed saturation keeps nonzero values nonzero
-        packsswb xmm1,xmm1              ; pack again: the low dword now summarizes all 8 words
-        movd    eax,xmm1
-        test    rax,rax                 ; the 32-bit write to eax cleared the upper half of rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]        ; NOTE(review): ISLOW size in an IFAST routine; row 0, so presumably harmless -- confirm
-
-        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
-        jmp     near .column_end        ; same register/wk layout as the full path leaves at .column_end
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        psubw   xmm0,xmm2               ; xmm0=tmp11
-        psubw   xmm1,xmm3
-        paddw   xmm4,xmm2               ; xmm4=tmp10
-        paddw   xmm5,xmm3               ; xmm5=tmp13
-
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm1,[rel PW_F1414]
-        psubw   xmm1,xmm5               ; xmm1=tmp12
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm7,xmm0
-        psubw   xmm4,xmm5               ; xmm4=tmp3
-        psubw   xmm0,xmm1               ; xmm0=tmp2
-        paddw   xmm6,xmm5               ; xmm6=tmp0
-        paddw   xmm7,xmm1               ; xmm7=tmp1
-
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm0,xmm5
-        psubw   xmm2,xmm1               ; xmm2=z12
-        psubw   xmm5,xmm3               ; xmm5=z10
-        paddw   xmm4,xmm1               ; xmm4=z11
-        paddw   xmm0,xmm3               ; xmm0=z13
-
-        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm3,xmm4
-        psubw   xmm4,xmm0
-        paddw   xmm3,xmm0               ; xmm3=tmp7
-
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm4,[rel PW_F1414]     ; xmm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm0,xmm5
-        paddw   xmm5,xmm2
-        pmulhw  xmm5,[rel PW_F1847]     ; xmm5=z5
-        pmulhw  xmm0,[rel PW_MF1613]
-        pmulhw  xmm2,[rel PW_F1082]
-        psubw   xmm0,xmm1
-        psubw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm0,xmm5               ; xmm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm0,xmm3               ; xmm0=tmp6
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm7
-        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
-        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
-        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
-        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
-        psubw   xmm4,xmm0               ; xmm4=tmp5
-
-        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
-        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
-
-        paddw   xmm2,xmm4               ; xmm2=tmp4
-        movdqa  xmm5,xmm7
-        movdqa  xmm0,xmm1
-        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
-        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
-        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
-        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
-        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
-
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
-        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
-
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
-        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]     ; NOTE(review): looks dead -- rax is overwritten by "mov eax, r13d" below before any use
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d       ; rax = output_col (the 32-bit write zero-extends)
-
-        ; -- Even part
-
-        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm0,xmm5
-        psubw   xmm6,xmm1               ; xmm6=tmp11
-        psubw   xmm5,xmm3
-        paddw   xmm2,xmm1               ; xmm2=tmp10
-        paddw   xmm0,xmm3               ; xmm0=tmp13
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F1414]
-        psubw   xmm5,xmm0               ; xmm5=tmp12
-
-        movdqa  xmm1,xmm2
-        movdqa  xmm3,xmm6
-        psubw   xmm2,xmm0               ; xmm2=tmp3
-        psubw   xmm6,xmm5               ; xmm6=tmp2
-        paddw   xmm1,xmm0               ; xmm1=tmp0
-        paddw   xmm3,xmm5               ; xmm3=tmp1
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
-
-        ; -- Odd part
-
-        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-        movdqa  xmm2,xmm0
-        movdqa  xmm6,xmm4
-        psubw   xmm0,xmm7               ; xmm0=z12
-        psubw   xmm4,xmm5               ; xmm4=z10
-        paddw   xmm2,xmm7               ; xmm2=z11
-        paddw   xmm6,xmm5               ; xmm6=z13
-
-        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm5,xmm2
-        psubw   xmm2,xmm6
-        paddw   xmm5,xmm6               ; xmm5=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm2,[rel PW_F1414]     ; xmm2=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm6,xmm4
-        paddw   xmm4,xmm0
-        pmulhw  xmm4,[rel PW_F1847]     ; xmm4=z5
-        pmulhw  xmm6,[rel PW_MF1613]
-        pmulhw  xmm0,[rel PW_F1082]
-        psubw   xmm6,xmm7
-        psubw   xmm0,xmm4               ; xmm0=tmp10
-        paddw   xmm6,xmm4               ; xmm6=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm6,xmm5               ; xmm6=tmp6
-        movdqa  xmm7,xmm1
-        movdqa  xmm4,xmm3
-        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
-        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        psraw   xmm1,(PASS1_BITS+3)     ; descale
-        psraw   xmm3,(PASS1_BITS+3)     ; descale
-        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
-        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psubw   xmm2,xmm6               ; xmm2=tmp5
-
-        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
-
-        paddw   xmm0,xmm2               ; xmm0=tmp4
-        movdqa  xmm4,xmm5
-        movdqa  xmm7,xmm6
-        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
-        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
-        psraw   xmm5,(PASS1_BITS+3)     ; descale
-        psraw   xmm6,(PASS1_BITS+3)     ; descale
-        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
-        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-
-        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
-
-        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm1,xmm2     ; add the CENTERJSAMPLE bias to every packed byte
-        paddb     xmm3,xmm2
-        paddb     xmm5,xmm2
-        paddb     xmm7,xmm2
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
-        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
-        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
-        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
-        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-        ; 0x4E swaps the two qwords, moving each odd row into the low half so movq can store it.
-        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-        ; Even output rows 0/2/4/6: low 8 bytes of xmm1/xmm3/xmm4/xmm7, written at column offset rax.
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-        ; Odd output rows 1/3/5/7 come from the qword-swapped copies.
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
-        uncollect_args          ; counterpart of collect_args; presumably restores the registers it saved -- confirm
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-        ret                     ; NOTE(review): unreachable duplicate of the ret above
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctfst-sse2.asm b/simd/jidctfst-sse2.asm
deleted file mode 100644
index f591e55..0000000
--- a/simd/jidctfst-sse2.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-%define PASS1_BITS      2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200)
-F_1_414 equ     362             ; FIX(1.414213562)
-F_1_847 equ     473             ; FIX(1.847759065)
-F_2_613 equ     669             ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        psubw   xmm0,xmm2               ; xmm0=tmp11
-        psubw   xmm1,xmm3
-        paddw   xmm4,xmm2               ; xmm4=tmp10
-        paddw   xmm5,xmm3               ; xmm5=tmp13
-
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   xmm1,xmm5               ; xmm1=tmp12
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm7,xmm0
-        psubw   xmm4,xmm5               ; xmm4=tmp3
-        psubw   xmm0,xmm1               ; xmm0=tmp2
-        paddw   xmm6,xmm5               ; xmm6=tmp0
-        paddw   xmm7,xmm1               ; xmm7=tmp1
-
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm0,xmm5
-        psubw   xmm2,xmm1               ; xmm2=z12
-        psubw   xmm5,xmm3               ; xmm5=z10
-        paddw   xmm4,xmm1               ; xmm4=z11
-        paddw   xmm0,xmm3               ; xmm0=z13
-
-        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm3,xmm4
-        psubw   xmm4,xmm0
-        paddw   xmm3,xmm0               ; xmm3=tmp7
-
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm0,xmm5
-        paddw   xmm5,xmm2
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
-        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   xmm0,xmm1
-        psubw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm0,xmm5               ; xmm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm0,xmm3               ; xmm0=tmp6
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm7
-        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
-        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
-        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
-        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
-        psubw   xmm4,xmm0               ; xmm4=tmp5
-
-        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
-        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
-
-        paddw   xmm2,xmm4               ; xmm2=tmp4
-        movdqa  xmm5,xmm7
-        movdqa  xmm0,xmm1
-        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
-        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
-        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
-        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
-        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
-
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
-        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
-
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
-        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm0,xmm5
-        psubw   xmm6,xmm1               ; xmm6=tmp11
-        psubw   xmm5,xmm3
-        paddw   xmm2,xmm1               ; xmm2=tmp10
-        paddw   xmm0,xmm3               ; xmm0=tmp13
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
-        psubw   xmm5,xmm0               ; xmm5=tmp12
-
-        movdqa  xmm1,xmm2
-        movdqa  xmm3,xmm6
-        psubw   xmm2,xmm0               ; xmm2=tmp3
-        psubw   xmm6,xmm5               ; xmm6=tmp2
-        paddw   xmm1,xmm0               ; xmm1=tmp0
-        paddw   xmm3,xmm5               ; xmm3=tmp1
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
-
-        ; -- Odd part
-
-        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-        movdqa  xmm2,xmm0
-        movdqa  xmm6,xmm4
-        psubw   xmm0,xmm7               ; xmm0=z12
-        psubw   xmm4,xmm5               ; xmm4=z10
-        paddw   xmm2,xmm7               ; xmm2=z11
-        paddw   xmm6,xmm5               ; xmm6=z13
-
-        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm5,xmm2
-        psubw   xmm2,xmm6
-        paddw   xmm5,xmm6               ; xmm5=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm6,xmm4
-        paddw   xmm4,xmm0
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
-        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
-        psubw   xmm6,xmm7
-        psubw   xmm0,xmm4               ; xmm0=tmp10
-        paddw   xmm6,xmm4               ; xmm6=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm6,xmm5               ; xmm6=tmp6
-        movdqa  xmm7,xmm1
-        movdqa  xmm4,xmm3
-        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
-        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        psraw   xmm1,(PASS1_BITS+3)     ; descale
-        psraw   xmm3,(PASS1_BITS+3)     ; descale
-        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
-        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psubw   xmm2,xmm6               ; xmm2=tmp5
-
-        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
-
-        paddw   xmm0,xmm2               ; xmm0=tmp4
-        movdqa  xmm4,xmm5
-        movdqa  xmm7,xmm6
-        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
-        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
-        psraw   xmm5,(PASS1_BITS+3)     ; descale
-        psraw   xmm6,(PASS1_BITS+3)     ; descale
-        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
-        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-
-        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
-
-        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm1,xmm2
-        paddb     xmm3,xmm2
-        paddb     xmm5,xmm2
-        paddb     xmm7,xmm2
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
-        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
-        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
-        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
-        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctint-altivec.c b/simd/jidctint-altivec.c
deleted file mode 100644
index 935f35d..0000000
--- a/simd/jidctint-altivec.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER INVERSE DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446   /* FIX(0.298631336) */
-#define F_0_390 3196   /* FIX(0.390180644) */
-#define F_0_541 4433   /* FIX(0.541196100) */
-#define F_0_765 6270   /* FIX(0.765366865) */
-#define F_0_899 7373   /* FIX(0.899976223) */
-#define F_1_175 9633   /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
-
-
-#define DO_IDCT(in, PASS)  \
-{  \
-  /* Even part  \
-   *  \
-   * (Original)  \
-   * z1 = (z2 + z3) * 0.541196100;  \
-   * tmp2 = z1 + z3 * -1.847759065;  \
-   * tmp3 = z1 + z2 * 0.765366865;  \
-   *  \
-   * (This implementation)  \
-   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
-   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
-   */  \
-  \
-  in##26l = vec_mergeh(in##2, in##6);  \
-  in##26h = vec_mergel(in##2, in##6);  \
-  \
-  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
-  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
-  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
-  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
-  \
-  tmp0 = vec_add(in##0, in##4);  \
-  tmp1 = vec_sub(in##0, in##4);  \
-  \
-  tmp0l = vec_unpackh(tmp0);  \
-  tmp0h = vec_unpackl(tmp0);  \
-  tmp0l = vec_sl(tmp0l, const_bits);  \
-  tmp0h = vec_sl(tmp0h, const_bits);  \
-  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
-  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
-  \
-  tmp10l = vec_add(tmp0l, tmp3l);  \
-  tmp10h = vec_add(tmp0h, tmp3h);  \
-  tmp13l = vec_sub(tmp0l, tmp3l);  \
-  tmp13h = vec_sub(tmp0h, tmp3h);  \
-  \
-  tmp1l = vec_unpackh(tmp1);  \
-  tmp1h = vec_unpackl(tmp1);  \
-  tmp1l = vec_sl(tmp1l, const_bits);  \
-  tmp1h = vec_sl(tmp1h, const_bits);  \
-  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
-  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
-  \
-  tmp11l = vec_add(tmp1l, tmp2l);  \
-  tmp11h = vec_add(tmp1h, tmp2h);  \
-  tmp12l = vec_sub(tmp1l, tmp2l);  \
-  tmp12h = vec_sub(tmp1h, tmp2h);  \
-  \
-  /* Odd part */  \
-  \
-  z3 = vec_add(in##3, in##7);  \
-  z4 = vec_add(in##1, in##5);  \
-  \
-  /* (Original)  \
-   * z5 = (z3 + z4) * 1.175875602;  \
-   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
-   * z3 += z5;  z4 += z5;  \
-   *  \
-   * (This implementation)  \
-   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
-   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
-   */  \
-  \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
-  \
-  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
-  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
-  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
-  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
-  \
-  /* (Original)  \
-   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
-   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
-   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
-   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
-   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
-   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
-   *  \
-   * (This implementation)  \
-   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
-   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
-   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
-   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
-   * tmp0 += z3;  tmp1 += z4;  \
-   * tmp2 += z3;  tmp3 += z4;  \
-   */  \
-  \
-  in##71l = vec_mergeh(in##7, in##1);  \
-  in##71h = vec_mergel(in##7, in##1);  \
-  \
-  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
-  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
-  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
-  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
-  \
-  in##53l = vec_mergeh(in##5, in##3);  \
-  in##53h = vec_mergel(in##5, in##3);  \
-  \
-  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
-  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
-  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
-  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
-  \
-  /* Final output stage */  \
-  \
-  out0l = vec_add(tmp10l, tmp3l);  \
-  out0h = vec_add(tmp10h, tmp3h);  \
-  out7l = vec_sub(tmp10l, tmp3l);  \
-  out7h = vec_sub(tmp10h, tmp3h);  \
-  \
-  out0l = vec_sra(out0l, descale_p##PASS);  \
-  out0h = vec_sra(out0h, descale_p##PASS);  \
-  out7l = vec_sra(out7l, descale_p##PASS);  \
-  out7h = vec_sra(out7h, descale_p##PASS);  \
-  \
-  out0 = vec_pack(out0l, out0h);  \
-  out7 = vec_pack(out7l, out7h);  \
-  \
-  out1l = vec_add(tmp11l, tmp2l);  \
-  out1h = vec_add(tmp11h, tmp2h);  \
-  out6l = vec_sub(tmp11l, tmp2l);  \
-  out6h = vec_sub(tmp11h, tmp2h);  \
-  \
-  out1l = vec_sra(out1l, descale_p##PASS);  \
-  out1h = vec_sra(out1h, descale_p##PASS);  \
-  out6l = vec_sra(out6l, descale_p##PASS);  \
-  out6h = vec_sra(out6h, descale_p##PASS);  \
-  \
-  out1 = vec_pack(out1l, out1h);  \
-  out6 = vec_pack(out6l, out6h);  \
-  \
-  out2l = vec_add(tmp12l, tmp1l);  \
-  out2h = vec_add(tmp12h, tmp1h);  \
-  out5l = vec_sub(tmp12l, tmp1l);  \
-  out5h = vec_sub(tmp12h, tmp1h);  \
-  \
-  out2l = vec_sra(out2l, descale_p##PASS);  \
-  out2h = vec_sra(out2h, descale_p##PASS);  \
-  out5l = vec_sra(out5l, descale_p##PASS);  \
-  out5h = vec_sra(out5h, descale_p##PASS);  \
-  \
-  out2 = vec_pack(out2l, out2h);  \
-  out5 = vec_pack(out5l, out5h);  \
-  \
-  out3l = vec_add(tmp13l, tmp0l);  \
-  out3h = vec_add(tmp13h, tmp0h);  \
-  out4l = vec_sub(tmp13l, tmp0l);  \
-  out4h = vec_sub(tmp13h, tmp0h);  \
-  \
-  out3l = vec_sra(out3l, descale_p##PASS);  \
-  out3h = vec_sra(out3h, descale_p##PASS);  \
-  out4l = vec_sra(out4l, descale_p##PASS);  \
-  out4h = vec_sra(out4h, descale_p##PASS);  \
-  \
-  out3 = vec_pack(out3l, out3h);  \
-  out4 = vec_pack(out4l, out4h);  \
-}
-
-
-void
-jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  short *dct_table = (short *)dct_table_;
-  int *outptr;
-
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
-    tmp0, tmp1, tmp2, tmp3, z3, z4,
-    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
-    row71l, row71h, row26l, row26h, row53l, row53h,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
-    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
-    z3l, z3h, z4l, z4h,
-    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
-    out5l, out5h, out6l, out6h, out7l, out7h;
-  __vector signed char outb;
-
-  /* Constants */
-  __vector short pw_zero = { __8X(0) },
-    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
-    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
-    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
-    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
-    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
-    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
-    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
-    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
-  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
-  __vector int pd_zero = { __4X(0) },
-    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
-    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
-  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
-    descale_p2 = { __4X(DESCALE_P2) },
-    const_bits = { __4X(CONST_BITS) };
-  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
-
-  /* Pass 1: process columns */
-
-  col0 = vec_ld(0, coef_block);
-  col1 = vec_ld(16, coef_block);
-  col2 = vec_ld(32, coef_block);
-  col3 = vec_ld(48, coef_block);
-  col4 = vec_ld(64, coef_block);
-  col5 = vec_ld(80, coef_block);
-  col6 = vec_ld(96, coef_block);
-  col7 = vec_ld(112, coef_block);
-
-  tmp1 = vec_or(col1, col2);
-  tmp2 = vec_or(col3, col4);
-  tmp1 = vec_or(tmp1, tmp2);
-  tmp3 = vec_or(col5, col6);
-  tmp3 = vec_or(tmp3, col7);
-  tmp1 = vec_or(tmp1, tmp3);
-
-  quant0 = vec_ld(0, dct_table);
-  col0 = vec_mladd(col0, quant0, pw_zero);
-
-  if (vec_all_eq(tmp1, pw_zero)) {
-    /* AC terms all zero */
-
-    col0 = vec_sl(col0, pass1_bits);
-
-    row0 = vec_splat(col0, 0);
-    row1 = vec_splat(col0, 1);
-    row2 = vec_splat(col0, 2);
-    row3 = vec_splat(col0, 3);
-    row4 = vec_splat(col0, 4);
-    row5 = vec_splat(col0, 5);
-    row6 = vec_splat(col0, 6);
-    row7 = vec_splat(col0, 7);
-
-  } else {
-
-    quant1 = vec_ld(16, dct_table);
-    quant2 = vec_ld(32, dct_table);
-    quant3 = vec_ld(48, dct_table);
-    quant4 = vec_ld(64, dct_table);
-    quant5 = vec_ld(80, dct_table);
-    quant6 = vec_ld(96, dct_table);
-    quant7 = vec_ld(112, dct_table);
-
-    col1 = vec_mladd(col1, quant1, pw_zero);
-    col2 = vec_mladd(col2, quant2, pw_zero);
-    col3 = vec_mladd(col3, quant3, pw_zero);
-    col4 = vec_mladd(col4, quant4, pw_zero);
-    col5 = vec_mladd(col5, quant5, pw_zero);
-    col6 = vec_mladd(col6, quant6, pw_zero);
-    col7 = vec_mladd(col7, quant7, pw_zero);
-
-    DO_IDCT(col, 1);
-
-    TRANSPOSE(out, row);
-  }
-
-  /* Pass 2: process rows */
-
-  DO_IDCT(row, 2);
-
-  TRANSPOSE(out, col);
-
-  outb = vec_packs(col0, col0);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[0] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col1, col1);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[1] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col2, col2);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[2] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col3, col3);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[3] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col4, col4);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[4] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col5, col5);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[5] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col6, col6);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[6] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-
-  outb = vec_packs(col7, col7);
-  outb = vec_add(outb, pb_centerjsamp);
-  outptr = (int *)(output_buf[7] + output_col);
-  vec_ste((__vector int)outb, 0, outptr);
-  vec_ste((__vector int)outb, 4, outptr);
-}
diff --git a/simd/jidctint-mmx.asm b/simd/jidctint-mmx.asm
deleted file mode 100644
index 5bd1981..0000000
--- a/simd/jidctint-mmx.asm
+++ /dev/null
@@ -1,851 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_mmx)
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          12
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_islow_mmx)
-
-EXTN(jsimd_idct_islow_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   mm0,PASS1_BITS
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movq      mm4,mm1               ; mm1=in2=z2
-        movq      mm5,mm1
-        punpcklwd mm4,mm3               ; mm3=in6=z3
-        punpckhwd mm5,mm3
-        movq      mm1,mm4
-        movq      mm3,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
-
-        movq      mm6,mm0
-        paddw     mm0,mm2               ; mm0=in0+in4
-        psubw     mm6,mm2               ; mm6=in0-in4
-
-        pxor      mm7,mm7
-        pxor      mm2,mm2
-        punpcklwd mm7,mm0               ; mm7=tmp0L
-        punpckhwd mm2,mm0               ; mm2=tmp0H
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
-
-        movq    mm0,mm7
-        paddd   mm7,mm4                 ; mm7=tmp10L
-        psubd   mm0,mm4                 ; mm0=tmp13L
-        movq    mm4,mm2
-        paddd   mm2,mm5                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp13H
-
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
-        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
-        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
-
-        pxor      mm5,mm5
-        pxor      mm7,mm7
-        punpcklwd mm5,mm6               ; mm5=tmp1L
-        punpckhwd mm7,mm6               ; mm7=tmp1H
-        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-
-        movq    mm2,mm5
-        paddd   mm5,mm1                 ; mm5=tmp11L
-        psubd   mm2,mm1                 ; mm2=tmp12L
-        movq    mm0,mm7
-        paddd   mm7,mm3                 ; mm7=tmp11H
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
-        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
-        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
-        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movq    mm5,mm6
-        movq    mm7,mm4
-        paddw   mm5,mm3                 ; mm5=z3
-        paddw   mm7,mm1                 ; mm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm2,mm5
-        movq      mm0,mm5
-        punpcklwd mm2,mm7
-        punpckhwd mm0,mm7
-        movq      mm5,mm2
-        movq      mm7,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movq      mm2,mm3
-        movq      mm0,mm3
-        punpcklwd mm2,mm4
-        punpckhwd mm0,mm4
-        movq      mm3,mm2
-        movq      mm4,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
-
-        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
-        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
-        paddd   mm3,mm5                 ; mm3=tmp3L
-        paddd   mm4,mm7                 ; mm4=tmp3H
-
-        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
-        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
-
-        movq      mm2,mm1
-        movq      mm0,mm1
-        punpcklwd mm2,mm6
-        punpckhwd mm0,mm6
-        movq      mm1,mm2
-        movq      mm6,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
-
-        paddd   mm2,mm5                 ; mm2=tmp1L
-        paddd   mm0,mm7                 ; mm0=tmp1H
-        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
-        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
-
-        movq    mm2,mm5
-        movq    mm0,mm7
-        paddd   mm5,mm3                 ; mm5=data0L
-        paddd   mm7,mm4                 ; mm7=data0H
-        psubd   mm2,mm3                 ; mm2=data7L
-        psubd   mm0,mm4                 ; mm0=data7H
-
-        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
-
-        paddd   mm5,mm3
-        paddd   mm7,mm3
-        psrad   mm5,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-        paddd   mm2,mm3
-        paddd   mm0,mm3
-        psrad   mm2,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-
-        packssdw  mm5,mm7               ; mm5=data0=(00 01 02 03)
-        packssdw  mm2,mm0               ; mm2=data7=(70 71 72 73)
-
-        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
-        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
-
-        movq    mm7,mm4
-        movq    mm0,mm3
-        paddd   mm4,mm1                 ; mm4=data1L
-        paddd   mm3,mm6                 ; mm3=data1H
-        psubd   mm7,mm1                 ; mm7=data6L
-        psubd   mm0,mm6                 ; mm0=data6H
-
-        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
-
-        paddd   mm4,mm1
-        paddd   mm3,mm1
-        psrad   mm4,DESCALE_P1
-        psrad   mm3,DESCALE_P1
-        paddd   mm7,mm1
-        paddd   mm0,mm1
-        psrad   mm7,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-
-        packssdw  mm4,mm3               ; mm4=data1=(10 11 12 13)
-        packssdw  mm7,mm0               ; mm7=data6=(60 61 62 63)
-
-        movq      mm6,mm5               ; transpose coefficients(phase 1)
-        punpcklwd mm5,mm4               ; mm5=(00 10 01 11)
-        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
-        movq      mm1,mm7               ; transpose coefficients(phase 1)
-        punpcklwd mm7,mm2               ; mm7=(60 70 61 71)
-        punpckhwd mm1,mm2               ; mm1=(62 72 63 73)
-
-        movq    mm3, MMWORD [wk(6)]     ; mm3=tmp12L
-        movq    mm0, MMWORD [wk(7)]     ; mm0=tmp12H
-        movq    mm4, MMWORD [wk(10)]    ; mm4=tmp1L
-        movq    mm2, MMWORD [wk(11)]    ; mm2=tmp1H
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
-        movq    MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
-        movq    MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
-
-        movq    mm5,mm3
-        movq    mm6,mm0
-        paddd   mm3,mm4                 ; mm3=data2L
-        paddd   mm0,mm2                 ; mm0=data2H
-        psubd   mm5,mm4                 ; mm5=data5L
-        psubd   mm6,mm2                 ; mm6=data5H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
-
-        paddd   mm3,mm7
-        paddd   mm0,mm7
-        psrad   mm3,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-        paddd   mm5,mm7
-        paddd   mm6,mm7
-        psrad   mm5,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm3,mm0               ; mm3=data2=(20 21 22 23)
-        packssdw  mm5,mm6               ; mm5=data5=(50 51 52 53)
-
-        movq    mm1, MMWORD [wk(2)]     ; mm1=tmp13L
-        movq    mm4, MMWORD [wk(3)]     ; mm4=tmp13H
-        movq    mm2, MMWORD [wk(8)]     ; mm2=tmp0L
-        movq    mm7, MMWORD [wk(9)]     ; mm7=tmp0H
-
-        movq    mm0,mm1
-        movq    mm6,mm4
-        paddd   mm1,mm2                 ; mm1=data3L
-        paddd   mm4,mm7                 ; mm4=data3H
-        psubd   mm0,mm2                 ; mm0=data4L
-        psubd   mm6,mm7                 ; mm6=data4H
-
-        movq    mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
-
-        paddd   mm1,mm2
-        paddd   mm4,mm2
-        psrad   mm1,DESCALE_P1
-        psrad   mm4,DESCALE_P1
-        paddd   mm0,mm2
-        paddd   mm6,mm2
-        psrad   mm0,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm1,mm4               ; mm1=data3=(30 31 32 33)
-        packssdw  mm0,mm6               ; mm0=data4=(40 41 42 43)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
-        movq    mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
-
-        movq      mm4,mm3               ; transpose coefficients(phase 1)
-        punpcklwd mm3,mm1               ; mm3=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm6,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm5               ; mm0=(40 50 41 51)
-        punpckhwd mm6,mm5               ; mm6=(42 52 43 53)
-
-        movq      mm1,mm7               ; transpose coefficients(phase 2)
-        punpckldq mm7,mm3               ; mm7=(00 10 20 30)
-        punpckhdq mm1,mm3               ; mm1=(01 11 21 31)
-        movq      mm5,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm4               ; mm2=(02 12 22 32)
-        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
-
-        movq    mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
-        movq    mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-        movq      mm7,mm0               ; transpose coefficients(phase 2)
-        punpckldq mm0,mm3               ; mm0=(40 50 60 70)
-        punpckhdq mm7,mm3               ; mm7=(41 51 61 71)
-        movq      mm1,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm4               ; mm6=(42 52 62 72)
-        punpckhdq mm1,mm4               ; mm1=(43 53 63 73)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movq      mm4,mm1               ; mm1=in2=z2
-        movq      mm5,mm1
-        punpcklwd mm4,mm3               ; mm3=in6=z3
-        punpckhwd mm5,mm3
-        movq      mm1,mm4
-        movq      mm3,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
-
-        movq      mm6,mm0
-        paddw     mm0,mm2               ; mm0=in0+in4
-        psubw     mm6,mm2               ; mm6=in0-in4
-
-        pxor      mm7,mm7
-        pxor      mm2,mm2
-        punpcklwd mm7,mm0               ; mm7=tmp0L
-        punpckhwd mm2,mm0               ; mm2=tmp0H
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
-
-        movq    mm0,mm7
-        paddd   mm7,mm4                 ; mm7=tmp10L
-        psubd   mm0,mm4                 ; mm0=tmp13L
-        movq    mm4,mm2
-        paddd   mm2,mm5                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp13H
-
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
-        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
-        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
-
-        pxor      mm5,mm5
-        pxor      mm7,mm7
-        punpcklwd mm5,mm6               ; mm5=tmp1L
-        punpckhwd mm7,mm6               ; mm7=tmp1H
-        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-
-        movq    mm2,mm5
-        paddd   mm5,mm1                 ; mm5=tmp11L
-        psubd   mm2,mm1                 ; mm2=tmp12L
-        movq    mm0,mm7
-        paddd   mm7,mm3                 ; mm7=tmp11H
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
-        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
-        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
-        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm5,mm6
-        movq    mm7,mm4
-        paddw   mm5,mm3                 ; mm5=z3
-        paddw   mm7,mm1                 ; mm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm2,mm5
-        movq      mm0,mm5
-        punpcklwd mm2,mm7
-        punpckhwd mm0,mm7
-        movq      mm5,mm2
-        movq      mm7,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movq      mm2,mm3
-        movq      mm0,mm3
-        punpcklwd mm2,mm4
-        punpckhwd mm0,mm4
-        movq      mm3,mm2
-        movq      mm4,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
-
-        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
-        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
-        paddd   mm3,mm5                 ; mm3=tmp3L
-        paddd   mm4,mm7                 ; mm4=tmp3H
-
-        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
-        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
-
-        movq      mm2,mm1
-        movq      mm0,mm1
-        punpcklwd mm2,mm6
-        punpckhwd mm0,mm6
-        movq      mm1,mm2
-        movq      mm6,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
-
-        paddd   mm2,mm5                 ; mm2=tmp1L
-        paddd   mm0,mm7                 ; mm0=tmp1H
-        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
-        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
-
-        movq    mm2,mm5
-        movq    mm0,mm7
-        paddd   mm5,mm3                 ; mm5=data0L
-        paddd   mm7,mm4                 ; mm7=data0H
-        psubd   mm2,mm3                 ; mm2=data7L
-        psubd   mm0,mm4                 ; mm0=data7H
-
-        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
-
-        paddd   mm5,mm3
-        paddd   mm7,mm3
-        psrad   mm5,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-        paddd   mm2,mm3
-        paddd   mm0,mm3
-        psrad   mm2,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-
-        packssdw  mm5,mm7               ; mm5=data0=(00 10 20 30)
-        packssdw  mm2,mm0               ; mm2=data7=(07 17 27 37)
-
-        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
-        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
-
-        movq    mm7,mm4
-        movq    mm0,mm3
-        paddd   mm4,mm1                 ; mm4=data1L
-        paddd   mm3,mm6                 ; mm3=data1H
-        psubd   mm7,mm1                 ; mm7=data6L
-        psubd   mm0,mm6                 ; mm0=data6H
-
-        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
-
-        paddd   mm4,mm1
-        paddd   mm3,mm1
-        psrad   mm4,DESCALE_P2
-        psrad   mm3,DESCALE_P2
-        paddd   mm7,mm1
-        paddd   mm0,mm1
-        psrad   mm7,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-
-        packssdw  mm4,mm3               ; mm4=data1=(01 11 21 31)
-        packssdw  mm7,mm0               ; mm7=data6=(06 16 26 36)
-
-        packsswb  mm5,mm7               ; mm5=(00 10 20 30 06 16 26 36)
-        packsswb  mm4,mm2               ; mm4=(01 11 21 31 07 17 27 37)
-
-        movq    mm6, MMWORD [wk(6)]     ; mm6=tmp12L
-        movq    mm1, MMWORD [wk(7)]     ; mm1=tmp12H
-        movq    mm3, MMWORD [wk(10)]    ; mm3=tmp1L
-        movq    mm0, MMWORD [wk(11)]    ; mm0=tmp1H
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
-
-        movq    mm7,mm6
-        movq    mm2,mm1
-        paddd   mm6,mm3                 ; mm6=data2L
-        paddd   mm1,mm0                 ; mm1=data2H
-        psubd   mm7,mm3                 ; mm7=data5L
-        psubd   mm2,mm0                 ; mm2=data5H
-
-        movq    mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
-
-        paddd   mm6,mm5
-        paddd   mm1,mm5
-        psrad   mm6,DESCALE_P2
-        psrad   mm1,DESCALE_P2
-        paddd   mm7,mm5
-        paddd   mm2,mm5
-        psrad   mm7,DESCALE_P2
-        psrad   mm2,DESCALE_P2
-
-        packssdw  mm6,mm1               ; mm6=data2=(02 12 22 32)
-        packssdw  mm7,mm2               ; mm7=data5=(05 15 25 35)
-
-        movq    mm4, MMWORD [wk(2)]     ; mm4=tmp13L
-        movq    mm3, MMWORD [wk(3)]     ; mm3=tmp13H
-        movq    mm0, MMWORD [wk(8)]     ; mm0=tmp0L
-        movq    mm5, MMWORD [wk(9)]     ; mm5=tmp0H
-
-        movq    mm1,mm4
-        movq    mm2,mm3
-        paddd   mm4,mm0                 ; mm4=data3L
-        paddd   mm3,mm5                 ; mm3=data3H
-        psubd   mm1,mm0                 ; mm1=data4L
-        psubd   mm2,mm5                 ; mm2=data4H
-
-        movq    mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
-
-        paddd   mm4,mm0
-        paddd   mm3,mm0
-        psrad   mm4,DESCALE_P2
-        psrad   mm3,DESCALE_P2
-        paddd   mm1,mm0
-        paddd   mm2,mm0
-        psrad   mm1,DESCALE_P2
-        psrad   mm2,DESCALE_P2
-
-        movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm5=[PB_CENTERJSAMP]
-
-        packssdw  mm4,mm3               ; mm4=data3=(03 13 23 33)
-        packssdw  mm1,mm2               ; mm1=data4=(04 14 24 34)
-
-        movq      mm0, MMWORD [wk(0)]   ; mm0=(00 10 20 30 06 16 26 36)
-        movq      mm3, MMWORD [wk(1)]   ; mm3=(01 11 21 31 07 17 27 37)
-
-        packsswb  mm6,mm1               ; mm6=(02 12 22 32 04 14 24 34)
-        packsswb  mm4,mm7               ; mm4=(03 13 23 33 05 15 25 35)
-
-        paddb     mm0,mm5
-        paddb     mm3,mm5
-        paddb     mm6,mm5
-        paddb     mm4,mm5
-
-        movq      mm2,mm0               ; transpose coefficients(phase 1)
-        punpcklbw mm0,mm3               ; mm0=(00 01 10 11 20 21 30 31)
-        punpckhbw mm2,mm3               ; mm2=(06 07 16 17 26 27 36 37)
-        movq      mm1,mm6               ; transpose coefficients(phase 1)
-        punpcklbw mm6,mm4               ; mm6=(02 03 12 13 22 23 32 33)
-        punpckhbw mm1,mm4               ; mm1=(04 05 14 15 24 25 34 35)
-
-        movq      mm7,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm6               ; mm0=(00 01 02 03 10 11 12 13)
-        punpckhwd mm7,mm6               ; mm7=(20 21 22 23 30 31 32 33)
-        movq      mm5,mm1               ; transpose coefficients(phase 2)
-        punpcklwd mm1,mm2               ; mm1=(04 05 06 07 14 15 16 17)
-        punpckhwd mm5,mm2               ; mm5=(24 25 26 27 34 35 36 37)
-
-        movq      mm3,mm0               ; transpose coefficients(phase 3)
-        punpckldq mm0,mm1               ; mm0=(00 01 02 03 04 05 06 07)
-        punpckhdq mm3,mm1               ; mm3=(10 11 12 13 14 15 16 17)
-        movq      mm4,mm7               ; transpose coefficients(phase 3)
-        punpckldq mm7,mm5               ; mm7=(20 21 22 23 24 25 26 27)
-        punpckhdq mm4,mm5               ; mm4=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm
deleted file mode 100644
index afe1d6a..0000000
--- a/simd/jidctint-sse2-64.asm
+++ /dev/null
@@ -1,847 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          12
-
-        align   16
-        global  EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm5,PASS1_BITS
-
-        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-        jmp     near .column_end
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm4,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
-        punpckhwd xmm5,xmm3
-        movdqa    xmm1,xmm4
-        movdqa    xmm3,xmm5
-        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=tmp3L
-        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
-        pmaddwd   xmm3,[rel PW_F054_MF130]      ; xmm3=tmp2H
-
-        movdqa    xmm6,xmm0
-        paddw     xmm0,xmm2             ; xmm0=in0+in4
-        psubw     xmm6,xmm2             ; xmm6=in0-in4
-
-        pxor      xmm7,xmm7
-        pxor      xmm2,xmm2
-        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
-        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm4               ; xmm7=tmp10L
-        psubd   xmm0,xmm4               ; xmm0=tmp13L
-        movdqa  xmm4,xmm2
-        paddd   xmm2,xmm5               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm7,xmm7
-        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
-        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-        movdqa  xmm2,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm2,xmm1               ; xmm2=tmp12L
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm3               ; xmm7=tmp11H
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm7,xmm4
-        paddw   xmm5,xmm3               ; xmm5=z3
-        paddw   xmm7,xmm1               ; xmm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm2,xmm5
-        movdqa    xmm0,xmm5
-        punpcklwd xmm2,xmm7
-        punpckhwd xmm0,xmm7
-        movdqa    xmm5,xmm2
-        movdqa    xmm7,xmm0
-        pmaddwd   xmm2,[rel PW_MF078_F117]      ; xmm2=z3L
-        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3H
-        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
-        pmaddwd   xmm7,[rel PW_F117_F078]       ; xmm7=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm2,xmm3
-        movdqa    xmm0,xmm3
-        punpcklwd xmm2,xmm4
-        punpckhwd xmm0,xmm4
-        movdqa    xmm3,xmm2
-        movdqa    xmm4,xmm0
-        pmaddwd   xmm2,[rel PW_MF060_MF089]     ; xmm2=tmp0L
-        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0H
-        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3L
-        pmaddwd   xmm4,[rel PW_MF089_F060]      ; xmm4=tmp3H
-
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
-        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
-        paddd   xmm3,xmm5               ; xmm3=tmp3L
-        paddd   xmm4,xmm7               ; xmm4=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
-
-        movdqa    xmm2,xmm1
-        movdqa    xmm0,xmm1
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm0,xmm6
-        movdqa    xmm1,xmm2
-        movdqa    xmm6,xmm0
-        pmaddwd   xmm2,[rel PW_MF050_MF256]     ; xmm2=tmp1L
-        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1H
-        pmaddwd   xmm1,[rel PW_MF256_F050]      ; xmm1=tmp2L
-        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
-
-        paddd   xmm2,xmm5               ; xmm2=tmp1L
-        paddd   xmm0,xmm7               ; xmm0=tmp1H
-        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm0,xmm7
-        paddd   xmm5,xmm3               ; xmm5=data0L
-        paddd   xmm7,xmm4               ; xmm7=data0H
-        psubd   xmm2,xmm3               ; xmm2=data7L
-        psubd   xmm0,xmm4               ; xmm0=data7H
-
-        movdqa  xmm3,[rel PD_DESCALE_P1]        ; xmm3=[rel PD_DESCALE_P1]
-
-        paddd   xmm5,xmm3
-        paddd   xmm7,xmm3
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm7,DESCALE_P1
-        paddd   xmm2,xmm3
-        paddd   xmm0,xmm3
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
-        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
-
-        movdqa  xmm7,xmm4
-        movdqa  xmm0,xmm3
-        paddd   xmm4,xmm1               ; xmm4=data1L
-        paddd   xmm3,xmm6               ; xmm3=data1H
-        psubd   xmm7,xmm1               ; xmm7=data6L
-        psubd   xmm0,xmm6               ; xmm0=data6H
-
-        movdqa  xmm1,[rel PD_DESCALE_P1]        ; xmm1=[rel PD_DESCALE_P1]
-
-        paddd   xmm4,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-        paddd   xmm7,xmm1
-        paddd   xmm0,xmm1
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
-        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
-        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
-        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm5,xmm3
-        movdqa  xmm6,xmm0
-        paddd   xmm3,xmm4               ; xmm3=data2L
-        paddd   xmm0,xmm2               ; xmm0=data2H
-        psubd   xmm5,xmm4               ; xmm5=data5L
-        psubd   xmm6,xmm2               ; xmm6=data5H
-
-        movdqa  xmm7,[rel PD_DESCALE_P1]        ; xmm7=[rel PD_DESCALE_P1]
-
-        paddd   xmm3,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm3,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-        paddd   xmm5,xmm7
-        paddd   xmm6,xmm7
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
-        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
-        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
-        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
-        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
-
-        movdqa  xmm0,xmm1
-        movdqa  xmm6,xmm4
-        paddd   xmm1,xmm2               ; xmm1=data3L
-        paddd   xmm4,xmm7               ; xmm4=data3H
-        psubd   xmm0,xmm2               ; xmm0=data4L
-        psubd   xmm6,xmm7               ; xmm6=data4H
-
-        movdqa  xmm2,[rel PD_DESCALE_P1]        ; xmm2=[rel PD_DESCALE_P1]
-
-        paddd   xmm1,xmm2
-        paddd   xmm4,xmm2
-        psrad   xmm1,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm0,xmm2
-        paddd   xmm6,xmm2
-        psrad   xmm0,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
-        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
-        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
-
-        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
-        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
-        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
-
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; -- Even part
-
-        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm6,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
-        punpckhwd xmm5,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm2,xmm5
-        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=tmp3L
-        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
-        pmaddwd   xmm2,[rel PW_F054_MF130]      ; xmm2=tmp2H
-
-        movdqa    xmm3,xmm7
-        paddw     xmm7,xmm0             ; xmm7=in0+in4
-        psubw     xmm3,xmm0             ; xmm3=in0-in4
-
-        pxor      xmm4,xmm4
-        pxor      xmm0,xmm0
-        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
-        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm6               ; xmm4=tmp10L
-        psubd   xmm7,xmm6               ; xmm7=tmp13L
-        movdqa  xmm6,xmm0
-        paddd   xmm0,xmm5               ; xmm0=tmp10H
-        psubd   xmm6,xmm5               ; xmm6=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm4,xmm4
-        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
-        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-        movdqa  xmm0,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm0,xmm1               ; xmm0=tmp12L
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm2               ; xmm4=tmp11H
-        psubd   xmm7,xmm2               ; xmm7=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
-        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
-        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
-        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm4,xmm3
-        paddw   xmm5,xmm1               ; xmm5=z3
-        paddw   xmm4,xmm2               ; xmm4=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm0,xmm5
-        movdqa    xmm7,xmm5
-        punpcklwd xmm0,xmm4
-        punpckhwd xmm7,xmm4
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm7
-        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3L
-        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3H
-        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
-        pmaddwd   xmm4,[rel PW_F117_F078]       ; xmm4=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm0,xmm1
-        movdqa    xmm7,xmm1
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm1,xmm0
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0L
-        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp0H
-        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp3L
-        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3H
-
-        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
-        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
-        paddd   xmm1,xmm5               ; xmm1=tmp3L
-        paddd   xmm3,xmm4               ; xmm3=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
-
-        movdqa    xmm0,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm0,xmm6
-        punpckhwd xmm7,xmm6
-        movdqa    xmm2,xmm0
-        movdqa    xmm6,xmm7
-        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1L
-        pmaddwd   xmm7,[rel PW_MF050_MF256]     ; xmm7=tmp1H
-        pmaddwd   xmm2,[rel PW_MF256_F050]      ; xmm2=tmp2L
-        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
-
-        paddd   xmm0,xmm5               ; xmm0=tmp1L
-        paddd   xmm7,xmm4               ; xmm7=tmp1H
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm7,xmm4
-        paddd   xmm5,xmm1               ; xmm5=data0L
-        paddd   xmm4,xmm3               ; xmm4=data0H
-        psubd   xmm0,xmm1               ; xmm0=data7L
-        psubd   xmm7,xmm3               ; xmm7=data7H
-
-        movdqa  xmm1,[rel PD_DESCALE_P2]        ; xmm1=[rel PD_DESCALE_P2]
-
-        paddd   xmm5,xmm1
-        paddd   xmm4,xmm1
-        psrad   xmm5,DESCALE_P2
-        psrad   xmm4,DESCALE_P2
-        paddd   xmm0,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
-        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
-        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm7,xmm1
-        paddd   xmm3,xmm2               ; xmm3=data1L
-        paddd   xmm1,xmm6               ; xmm1=data1H
-        psubd   xmm4,xmm2               ; xmm4=data6L
-        psubd   xmm7,xmm6               ; xmm7=data6H
-
-        movdqa  xmm2,[rel PD_DESCALE_P2]        ; xmm2=[rel PD_DESCALE_P2]
-
-        paddd   xmm3,xmm2
-        paddd   xmm1,xmm2
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm4,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
-        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
-        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm4,xmm6
-        movdqa  xmm0,xmm2
-        paddd   xmm6,xmm1               ; xmm6=data2L
-        paddd   xmm2,xmm7               ; xmm2=data2H
-        psubd   xmm4,xmm1               ; xmm4=data5L
-        psubd   xmm0,xmm7               ; xmm0=data5H
-
-        movdqa  xmm5,[rel PD_DESCALE_P2]        ; xmm5=[rel PD_DESCALE_P2]
-
-        paddd   xmm6,xmm5
-        paddd   xmm2,xmm5
-        psrad   xmm6,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm4,xmm5
-        paddd   xmm0,xmm5
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
-        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
-        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
-        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
-        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
-
-        movdqa  xmm2,xmm3
-        movdqa  xmm0,xmm1
-        paddd   xmm3,xmm7               ; xmm3=data3L
-        paddd   xmm1,xmm5               ; xmm1=data3H
-        psubd   xmm2,xmm7               ; xmm2=data4L
-        psubd   xmm0,xmm5               ; xmm0=data4H
-
-        movdqa  xmm7,[rel PD_DESCALE_P2]        ; xmm7=[rel PD_DESCALE_P2]
-
-        paddd   xmm3,xmm7
-        paddd   xmm1,xmm7
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm2,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm2,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        movdqa    xmm5,[rel PB_CENTERJSAMP]     ; xmm5=[rel PB_CENTERJSAMP]
-
-        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
-        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm7,xmm5
-        paddb     xmm1,xmm5
-        paddb     xmm6,xmm5
-        paddb     xmm3,xmm5
-
-        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
-        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
-        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
-        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
-        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
-        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctint-sse2.asm b/simd/jidctint-sse2.asm
deleted file mode 100644
index 6c7e7d9..0000000
--- a/simd/jidctint-sse2.asm
+++ /dev/null
@@ -1,858 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          12
-
-        align   16
-        global  EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm5,PASS1_BITS
-
-        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm4,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
-        punpckhwd xmm5,xmm3
-        movdqa    xmm1,xmm4
-        movdqa    xmm3,xmm5
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=tmp3L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm3=tmp2H
-
-        movdqa    xmm6,xmm0
-        paddw     xmm0,xmm2             ; xmm0=in0+in4
-        psubw     xmm6,xmm2             ; xmm6=in0-in4
-
-        pxor      xmm7,xmm7
-        pxor      xmm2,xmm2
-        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
-        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm4               ; xmm7=tmp10L
-        psubd   xmm0,xmm4               ; xmm0=tmp13L
-        movdqa  xmm4,xmm2
-        paddd   xmm2,xmm5               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm7,xmm7
-        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
-        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-        movdqa  xmm2,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm2,xmm1               ; xmm2=tmp12L
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm3               ; xmm7=tmp11H
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm7,xmm4
-        paddw   xmm5,xmm3               ; xmm5=z3
-        paddw   xmm7,xmm1               ; xmm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm2,xmm5
-        movdqa    xmm0,xmm5
-        punpcklwd xmm2,xmm7
-        punpckhwd xmm0,xmm7
-        movdqa    xmm5,xmm2
-        movdqa    xmm7,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm2=z3L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]       ; xmm7=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm2,xmm3
-        movdqa    xmm0,xmm3
-        punpcklwd xmm2,xmm4
-        punpckhwd xmm0,xmm4
-        movdqa    xmm3,xmm2
-        movdqa    xmm4,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm2=tmp0L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0H
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm4=tmp3H
-
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
-        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
-        paddd   xmm3,xmm5               ; xmm3=tmp3L
-        paddd   xmm4,xmm7               ; xmm4=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
-
-        movdqa    xmm2,xmm1
-        movdqa    xmm0,xmm1
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm0,xmm6
-        movdqa    xmm1,xmm2
-        movdqa    xmm6,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm2=tmp1L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm1=tmp2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
-
-        paddd   xmm2,xmm5               ; xmm2=tmp1L
-        paddd   xmm0,xmm7               ; xmm0=tmp1H
-        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm0,xmm7
-        paddd   xmm5,xmm3               ; xmm5=data0L
-        paddd   xmm7,xmm4               ; xmm7=data0H
-        psubd   xmm2,xmm3               ; xmm2=data7L
-        psubd   xmm0,xmm4               ; xmm0=data7H
-
-        movdqa  xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm3=[PD_DESCALE_P1]
-
-        paddd   xmm5,xmm3
-        paddd   xmm7,xmm3
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm7,DESCALE_P1
-        paddd   xmm2,xmm3
-        paddd   xmm0,xmm3
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
-        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
-
-        movdqa  xmm7,xmm4
-        movdqa  xmm0,xmm3
-        paddd   xmm4,xmm1               ; xmm4=data1L
-        paddd   xmm3,xmm6               ; xmm3=data1H
-        psubd   xmm7,xmm1               ; xmm7=data6L
-        psubd   xmm0,xmm6               ; xmm0=data6H
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm1=[PD_DESCALE_P1]
-
-        paddd   xmm4,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-        paddd   xmm7,xmm1
-        paddd   xmm0,xmm1
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
-        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
-        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
-        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm5,xmm3
-        movdqa  xmm6,xmm0
-        paddd   xmm3,xmm4               ; xmm3=data2L
-        paddd   xmm0,xmm2               ; xmm0=data2H
-        psubd   xmm5,xmm4               ; xmm5=data5L
-        psubd   xmm6,xmm2               ; xmm6=data5H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm7=[PD_DESCALE_P1]
-
-        paddd   xmm3,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm3,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-        paddd   xmm5,xmm7
-        paddd   xmm6,xmm7
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
-        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
-        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
-        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
-        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
-
-        movdqa  xmm0,xmm1
-        movdqa  xmm6,xmm4
-        paddd   xmm1,xmm2               ; xmm1=data3L
-        paddd   xmm4,xmm7               ; xmm4=data3H
-        psubd   xmm0,xmm2               ; xmm0=data4L
-        psubd   xmm6,xmm7               ; xmm6=data4H
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm2=[PD_DESCALE_P1]
-
-        paddd   xmm1,xmm2
-        paddd   xmm4,xmm2
-        psrad   xmm1,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm0,xmm2
-        paddd   xmm6,xmm2
-        psrad   xmm0,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
-        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
-        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
-
-        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
-        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
-        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
-
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm6,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
-        punpckhwd xmm5,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm2,xmm5
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=tmp3L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm2=tmp2H
-
-        movdqa    xmm3,xmm7
-        paddw     xmm7,xmm0             ; xmm7=in0+in4
-        psubw     xmm3,xmm0             ; xmm3=in0-in4
-
-        pxor      xmm4,xmm4
-        pxor      xmm0,xmm0
-        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
-        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm6               ; xmm4=tmp10L
-        psubd   xmm7,xmm6               ; xmm7=tmp13L
-        movdqa  xmm6,xmm0
-        paddd   xmm0,xmm5               ; xmm0=tmp10H
-        psubd   xmm6,xmm5               ; xmm6=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm4,xmm4
-        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
-        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-        movdqa  xmm0,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm0,xmm1               ; xmm0=tmp12L
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm2               ; xmm4=tmp11H
-        psubd   xmm7,xmm2               ; xmm7=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
-        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
-        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
-        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm4,xmm3
-        paddw   xmm5,xmm1               ; xmm5=z3
-        paddw   xmm4,xmm2               ; xmm4=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm0,xmm5
-        movdqa    xmm7,xmm5
-        punpcklwd xmm0,xmm4
-        punpckhwd xmm7,xmm4
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]       ; xmm4=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm0,xmm1
-        movdqa    xmm7,xmm1
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm1,xmm0
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp0H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp3L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3H
-
-        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
-        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
-        paddd   xmm1,xmm5               ; xmm1=tmp3L
-        paddd   xmm3,xmm4               ; xmm3=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
-
-        movdqa    xmm0,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm0,xmm6
-        punpckhwd xmm7,xmm6
-        movdqa    xmm2,xmm0
-        movdqa    xmm6,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm7=tmp1H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm2=tmp2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
-
-        paddd   xmm0,xmm5               ; xmm0=tmp1L
-        paddd   xmm7,xmm4               ; xmm7=tmp1H
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm7,xmm4
-        paddd   xmm5,xmm1               ; xmm5=data0L
-        paddd   xmm4,xmm3               ; xmm4=data0H
-        psubd   xmm0,xmm1               ; xmm0=data7L
-        psubd   xmm7,xmm3               ; xmm7=data7H
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm1=[PD_DESCALE_P2]
-
-        paddd   xmm5,xmm1
-        paddd   xmm4,xmm1
-        psrad   xmm5,DESCALE_P2
-        psrad   xmm4,DESCALE_P2
-        paddd   xmm0,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
-        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
-        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm7,xmm1
-        paddd   xmm3,xmm2               ; xmm3=data1L
-        paddd   xmm1,xmm6               ; xmm1=data1H
-        psubd   xmm4,xmm2               ; xmm4=data6L
-        psubd   xmm7,xmm6               ; xmm7=data6H
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm2=[PD_DESCALE_P2]
-
-        paddd   xmm3,xmm2
-        paddd   xmm1,xmm2
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm4,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
-        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
-        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm4,xmm6
-        movdqa  xmm0,xmm2
-        paddd   xmm6,xmm1               ; xmm6=data2L
-        paddd   xmm2,xmm7               ; xmm2=data2H
-        psubd   xmm4,xmm1               ; xmm4=data5L
-        psubd   xmm0,xmm7               ; xmm0=data5H
-
-        movdqa  xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm5=[PD_DESCALE_P2]
-
-        paddd   xmm6,xmm5
-        paddd   xmm2,xmm5
-        psrad   xmm6,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm4,xmm5
-        paddd   xmm0,xmm5
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
-        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
-        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
-        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
-        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
-
-        movdqa  xmm2,xmm3
-        movdqa  xmm0,xmm1
-        paddd   xmm3,xmm7               ; xmm3=data3L
-        paddd   xmm1,xmm5               ; xmm1=data3H
-        psubd   xmm2,xmm7               ; xmm2=data4L
-        psubd   xmm0,xmm5               ; xmm0=data4H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm7=[PD_DESCALE_P2]
-
-        paddd   xmm3,xmm7
-        paddd   xmm1,xmm7
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm2,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm2,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm5=[PB_CENTERJSAMP]
-
-        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
-        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm7,xmm5
-        paddb     xmm1,xmm5
-        paddb     xmm6,xmm5
-        paddb     xmm3,xmm5
-
-        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
-        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
-        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
-        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
-        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
-        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
-        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctred-mmx.asm b/simd/jidctred-mmx.asm
deleted file mode 100644
index ba054e3..0000000
--- a/simd/jidctred-mmx.asm
+++ /dev/null
@@ -1,705 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_mmx)
-
-EXTN(jconst_idct_red_mmx):
-
-PW_F184_MF076   times 2 dw  F_1_847,-F_0_765
-PW_F256_F089    times 2 dw  F_2_562, F_0_899
-PW_F106_MF217   times 2 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 2 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 2 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 2 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 2 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_mmx)
-
-EXTN(jsimd_idct_4x4_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm0,mm1
-        packsswb mm0,mm0
-        movd    eax,mm0
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   mm0,PASS1_BITS
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movq      mm4,mm0
-        movq      mm5,mm0
-        punpcklwd mm4,mm1
-        punpckhwd mm5,mm1
-        movq      mm0,mm4
-        movq      mm1,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
-
-        movq      mm6,mm2
-        movq      mm7,mm2
-        punpcklwd mm6,mm3
-        punpckhwd mm7,mm3
-        movq      mm2,mm6
-        movq      mm3,mm7
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
-
-        paddd   mm6,mm4                 ; mm6=tmp2L
-        paddd   mm7,mm5                 ; mm7=tmp2H
-        paddd   mm2,mm0                 ; mm2=tmp0L
-        paddd   mm3,mm1                 ; mm3=tmp0H
-
-        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
-        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      mm1,mm1
-        pxor      mm2,mm2
-        punpcklwd mm1,mm4               ; mm1=tmp0L
-        punpckhwd mm2,mm4               ; mm2=tmp0H
-        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
-        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-        movq      mm3,mm5               ; mm5=in2=z2
-        punpcklwd mm5,mm0               ; mm0=in6=z3
-        punpckhwd mm3,mm0
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
-
-        movq    mm4,mm1
-        movq    mm0,mm2
-        paddd   mm1,mm5                 ; mm1=tmp10L
-        paddd   mm2,mm3                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp12L
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        ; -- Final output stage
-
-        movq    mm5,mm1
-        movq    mm3,mm2
-        paddd   mm1,mm6                 ; mm1=data0L
-        paddd   mm2,mm7                 ; mm2=data0H
-        psubd   mm5,mm6                 ; mm5=data3L
-        psubd   mm3,mm7                 ; mm3=data3H
-
-        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm6=[PD_DESCALE_P1_4]
-
-        paddd   mm1,mm6
-        paddd   mm2,mm6
-        psrad   mm1,DESCALE_P1_4
-        psrad   mm2,DESCALE_P1_4
-        paddd   mm5,mm6
-        paddd   mm3,mm6
-        psrad   mm5,DESCALE_P1_4
-        psrad   mm3,DESCALE_P1_4
-
-        packssdw  mm1,mm2               ; mm1=data0=(00 01 02 03)
-        packssdw  mm5,mm3               ; mm5=data3=(30 31 32 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
-
-        movq    mm2,mm4
-        movq    mm3,mm0
-        paddd   mm4,mm7                 ; mm4=data1L
-        paddd   mm0,mm6                 ; mm0=data1H
-        psubd   mm2,mm7                 ; mm2=data2L
-        psubd   mm3,mm6                 ; mm3=data2H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm7=[PD_DESCALE_P1_4]
-
-        paddd   mm4,mm7
-        paddd   mm0,mm7
-        psrad   mm4,DESCALE_P1_4
-        psrad   mm0,DESCALE_P1_4
-        paddd   mm2,mm7
-        paddd   mm3,mm7
-        psrad   mm2,DESCALE_P1_4
-        psrad   mm3,DESCALE_P1_4
-
-        packssdw  mm4,mm0               ; mm4=data1=(10 11 12 13)
-        packssdw  mm2,mm3               ; mm2=data2=(20 21 22 23)
-
-        movq      mm6,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm4               ; mm1=(00 10 01 11)
-        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
-        movq      mm7,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm5               ; mm2=(20 30 21 31)
-        punpckhwd mm7,mm5               ; mm7=(22 32 23 33)
-
-        movq      mm0,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm2               ; mm1=(00 10 20 30)
-        punpckhdq mm0,mm2               ; mm0=(01 11 21 31)
-        movq      mm3,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm7               ; mm6=(02 12 22 32)
-        punpckhdq mm3,mm7               ; mm3=(03 13 23 33)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq      mm4,mm0
-        movq      mm5,mm0
-        punpcklwd mm4,mm1
-        punpckhwd mm5,mm1
-        movq      mm0,mm4
-        movq      mm1,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
-
-        movq      mm6,mm2
-        movq      mm7,mm2
-        punpcklwd mm6,mm3
-        punpckhwd mm7,mm3
-        movq      mm2,mm6
-        movq      mm3,mm7
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
-
-        paddd   mm6,mm4                 ; mm6=tmp2L
-        paddd   mm7,mm5                 ; mm7=tmp2H
-        paddd   mm2,mm0                 ; mm2=tmp0L
-        paddd   mm3,mm1                 ; mm3=tmp0H
-
-        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
-        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        pxor      mm1,mm1
-        pxor      mm2,mm2
-        punpcklwd mm1,mm4               ; mm1=tmp0L
-        punpckhwd mm2,mm4               ; mm2=tmp0H
-        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
-        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-        movq      mm3,mm5               ; mm5=in2=z2
-        punpcklwd mm5,mm0               ; mm0=in6=z3
-        punpckhwd mm3,mm0
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
-
-        movq    mm4,mm1
-        movq    mm0,mm2
-        paddd   mm1,mm5                 ; mm1=tmp10L
-        paddd   mm2,mm3                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp12L
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        ; -- Final output stage
-
-        movq    mm5,mm1
-        movq    mm3,mm2
-        paddd   mm1,mm6                 ; mm1=data0L
-        paddd   mm2,mm7                 ; mm2=data0H
-        psubd   mm5,mm6                 ; mm5=data3L
-        psubd   mm3,mm7                 ; mm3=data3H
-
-        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm6=[PD_DESCALE_P2_4]
-
-        paddd   mm1,mm6
-        paddd   mm2,mm6
-        psrad   mm1,DESCALE_P2_4
-        psrad   mm2,DESCALE_P2_4
-        paddd   mm5,mm6
-        paddd   mm3,mm6
-        psrad   mm5,DESCALE_P2_4
-        psrad   mm3,DESCALE_P2_4
-
-        packssdw  mm1,mm2               ; mm1=data0=(00 10 20 30)
-        packssdw  mm5,mm3               ; mm5=data3=(03 13 23 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
-
-        movq    mm2,mm4
-        movq    mm3,mm0
-        paddd   mm4,mm7                 ; mm4=data1L
-        paddd   mm0,mm6                 ; mm0=data1H
-        psubd   mm2,mm7                 ; mm2=data2L
-        psubd   mm3,mm6                 ; mm3=data2H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm7=[PD_DESCALE_P2_4]
-
-        paddd   mm4,mm7
-        paddd   mm0,mm7
-        psrad   mm4,DESCALE_P2_4
-        psrad   mm0,DESCALE_P2_4
-        paddd   mm2,mm7
-        paddd   mm3,mm7
-        psrad   mm2,DESCALE_P2_4
-        psrad   mm3,DESCALE_P2_4
-
-        packssdw  mm4,mm0               ; mm4=data1=(01 11 21 31)
-        packssdw  mm2,mm3               ; mm2=data2=(02 12 22 32)
-
-        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
-
-        packsswb  mm1,mm2               ; mm1=(00 10 20 30 02 12 22 32)
-        packsswb  mm4,mm5               ; mm4=(01 11 21 31 03 13 23 33)
-        paddb     mm1,mm6
-        paddb     mm4,mm6
-
-        movq      mm7,mm1               ; transpose coefficients(phase 1)
-        punpcklbw mm1,mm4               ; mm1=(00 01 10 11 20 21 30 31)
-        punpckhbw mm7,mm4               ; mm7=(02 03 12 13 22 23 32 33)
-
-        movq      mm0,mm1               ; transpose coefficients(phase 2)
-        punpcklwd mm1,mm7               ; mm1=(00 01 02 03 10 11 12 13)
-        punpckhwd mm0,mm7               ; mm0=(20 21 22 23 30 31 32 33)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-        psrlq   mm1,4*BYTE_BIT
-        psrlq   mm0,4*BYTE_BIT
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_mmx)
-
-EXTN(jsimd_idct_2x2_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
-        ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
-        pcmpeqd   mm7,mm7
-        pslld     mm7,WORD_BIT          ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
-        movq      mm4,mm0               ; mm4=(10 11 ** 13)
-        movq      mm5,mm2               ; mm5=(50 51 ** 53)
-        punpcklwd mm4,mm1               ; mm4=(10 30 11 31)
-        punpcklwd mm5,mm3               ; mm5=(50 70 51 71)
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        psrld   mm0,WORD_BIT            ; mm0=(11 -- 13 --)
-        pand    mm1,mm7                 ; mm1=(-- 31 -- 33)
-        psrld   mm2,WORD_BIT            ; mm2=(51 -- 53 --)
-        pand    mm3,mm7                 ; mm3=(-- 71 -- 73)
-        por     mm0,mm1                 ; mm0=(11 31 13 33)
-        por     mm2,mm3                 ; mm2=(51 71 53 73)
-        pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   mm4,mm5                 ; mm4=tmp0[col0 col1]
-
-        movq    mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
-        ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
-        psrld   mm6,WORD_BIT            ; mm6=(15 -- 17 --)
-        pand    mm1,mm7                 ; mm1=(-- 35 -- 37)
-        psrld   mm3,WORD_BIT            ; mm3=(55 -- 57 --)
-        pand    mm5,mm7                 ; mm5=(-- 75 -- 77)
-        por     mm6,mm1                 ; mm6=(15 35 17 37)
-        por     mm3,mm5                 ; mm3=(55 75 57 77)
-        pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   mm0,mm2                 ; mm0=tmp0[col1 col3]
-        paddd   mm6,mm3                 ; mm6=tmp0[col5 col7]
-
-        ; -- Even part
-
-        movq    mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
-        movq    mm2,mm1                         ; mm2=(00 01 ** 03)
-        pslld   mm1,WORD_BIT                    ; mm1=(-- 00 -- **)
-        psrad   mm1,(WORD_BIT-CONST_BITS-2)     ; mm1=tmp10[col0 ****]
-
-        pand    mm2,mm7                         ; mm2=(-- 01 -- 03)
-        pand    mm5,mm7                         ; mm5=(-- 05 -- 07)
-        psrad   mm2,(WORD_BIT-CONST_BITS-2)     ; mm2=tmp10[col1 col3]
-        psrad   mm5,(WORD_BIT-CONST_BITS-2)     ; mm5=tmp10[col5 col7]
-
-        ; -- Final output stage
-
-        movq      mm3,mm1
-        paddd     mm1,mm4               ; mm1=data0[col0 ****]=(A0 **)
-        psubd     mm3,mm4               ; mm3=data1[col0 ****]=(B0 **)
-        punpckldq mm1,mm3               ; mm1=(A0 B0)
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]       ; mm7=[PD_DESCALE_P1_2]
-
-        movq    mm4,mm2
-        movq    mm3,mm5
-        paddd   mm2,mm0                 ; mm2=data0[col1 col3]=(A1 A3)
-        paddd   mm5,mm6                 ; mm5=data0[col5 col7]=(A5 A7)
-        psubd   mm4,mm0                 ; mm4=data1[col1 col3]=(B1 B3)
-        psubd   mm3,mm6                 ; mm3=data1[col5 col7]=(B5 B7)
-
-        paddd   mm1,mm7
-        psrad   mm1,DESCALE_P1_2
-
-        paddd   mm2,mm7
-        paddd   mm5,mm7
-        psrad   mm2,DESCALE_P1_2
-        psrad   mm5,DESCALE_P1_2
-        paddd   mm4,mm7
-        paddd   mm3,mm7
-        psrad   mm4,DESCALE_P1_2
-        psrad   mm3,DESCALE_P1_2
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(ebp)]
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  mm2,mm4               ; mm2=(A1 A3 B1 B3)
-        packssdw  mm5,mm3               ; mm5=(A5 A7 B5 B7)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd     mm2,mm5               ; mm2=tmp0[row0 row1]
-
-        ; -- Even part
-
-        pslld     mm1,(CONST_BITS+2)    ; mm1=tmp10[row0 row1]
-
-        ; -- Final output stage
-
-        movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]     ; mm0=[PD_DESCALE_P2_2]
-
-        movq      mm6,mm1
-        paddd     mm1,mm2               ; mm1=data0[row0 row1]=(C0 C1)
-        psubd     mm6,mm2               ; mm6=data1[row0 row1]=(D0 D1)
-
-        paddd     mm1,mm0
-        paddd     mm6,mm0
-        psrad     mm1,DESCALE_P2_2
-        psrad     mm6,DESCALE_P2_2
-
-        movq      mm7,mm1               ; transpose coefficients
-        punpckldq mm1,mm6               ; mm1=(C0 D0)
-        punpckhdq mm7,mm6               ; mm7=(C1 D1)
-
-        packssdw  mm1,mm7               ; mm1=(C0 D0 C1 D1)
-        packsswb  mm1,mm1               ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
-        paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        movd    ecx,mm1
-        movd    ebx,mm1                 ; ebx=(C0 D0 C1 D1)
-        shr     ecx,2*BYTE_BIT          ; ecx=(C1 D1 -- --)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
-        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm
deleted file mode 100644
index a54bbe2..0000000
--- a/simd/jidctred-sse2-64.asm
+++ /dev/null
@@ -1,575 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076   times 4 dw  F_1_847,-F_0_765
-PW_F256_F089    times 4 dw  F_2_562, F_0_899
-PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0,xmm1
-        packsswb xmm0,xmm0
-        packsswb xmm0,xmm0
-        movd    eax,xmm0
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm0,PASS1_BITS
-
-        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-        jmp     near .column_end
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm5,xmm0
-        punpcklwd xmm4,xmm1
-        punpckhwd xmm5,xmm1
-        movdqa    xmm0,xmm4
-        movdqa    xmm1,xmm5
-        pmaddwd   xmm4,[rel PW_F256_F089]       ; xmm4=(tmp2L)
-        pmaddwd   xmm5,[rel PW_F256_F089]       ; xmm5=(tmp2H)
-        pmaddwd   xmm0,[rel PW_F106_MF217]      ; xmm0=(tmp0L)
-        pmaddwd   xmm1,[rel PW_F106_MF217]      ; xmm1=(tmp0H)
-
-        movdqa    xmm6,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm6,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm2,xmm6
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2L)
-        pmaddwd   xmm7,[rel PW_MF060_MF050]     ; xmm7=(tmp2H)
-        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0L)
-        pmaddwd   xmm3,[rel PW_F145_MF021]      ; xmm3=(tmp0H)
-
-        paddd   xmm6,xmm4               ; xmm6=tmp2L
-        paddd   xmm7,xmm5               ; xmm7=tmp2H
-        paddd   xmm2,xmm0               ; xmm2=tmp0L
-        paddd   xmm3,xmm1               ; xmm3=tmp0H
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      xmm1,xmm1
-        pxor      xmm2,xmm2
-        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
-        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
-        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-        movdqa    xmm3,xmm5             ; xmm5=in2=z2
-        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
-        punpckhwd xmm3,xmm0
-        pmaddwd   xmm5,[rel PW_F184_MF076]      ; xmm5=tmp2L
-        pmaddwd   xmm3,[rel PW_F184_MF076]      ; xmm3=tmp2H
-
-        movdqa  xmm4,xmm1
-        movdqa  xmm0,xmm2
-        paddd   xmm1,xmm5               ; xmm1=tmp10L
-        paddd   xmm2,xmm3               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp12L
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        ; -- Final output stage
-
-        movdqa  xmm5,xmm1
-        movdqa  xmm3,xmm2
-        paddd   xmm1,xmm6               ; xmm1=data0L
-        paddd   xmm2,xmm7               ; xmm2=data0H
-        psubd   xmm5,xmm6               ; xmm5=data3L
-        psubd   xmm3,xmm7               ; xmm3=data3H
-
-        movdqa  xmm6,[rel PD_DESCALE_P1_4]      ; xmm6=[rel PD_DESCALE_P1_4]
-
-        paddd   xmm1,xmm6
-        paddd   xmm2,xmm6
-        psrad   xmm1,DESCALE_P1_4
-        psrad   xmm2,DESCALE_P1_4
-        paddd   xmm5,xmm6
-        paddd   xmm3,xmm6
-        psrad   xmm5,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
-
-        movdqa  xmm2,xmm4
-        movdqa  xmm3,xmm0
-        paddd   xmm4,xmm7               ; xmm4=data1L
-        paddd   xmm0,xmm6               ; xmm0=data1H
-        psubd   xmm2,xmm7               ; xmm2=data2L
-        psubd   xmm3,xmm6               ; xmm3=data2H
-
-        movdqa  xmm7,[rel PD_DESCALE_P1_4]      ; xmm7=[rel PD_DESCALE_P1_4]
-
-        paddd   xmm4,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm4,DESCALE_P1_4
-        psrad   xmm0,DESCALE_P1_4
-        paddd   xmm2,xmm7
-        paddd   xmm3,xmm7
-        psrad   xmm2,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     rax, [original_rbp]
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; -- Even part
-
-        pxor      xmm4,xmm4
-        punpcklwd xmm4,xmm1             ; xmm4=tmp0
-        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-        ; -- Odd part
-
-        punpckhwd xmm1,xmm0
-        punpckhwd xmm6,xmm3
-        movdqa    xmm5,xmm1
-        movdqa    xmm2,xmm6
-        pmaddwd   xmm1,[rel PW_F256_F089]       ; xmm1=(tmp2)
-        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2)
-        pmaddwd   xmm5,[rel PW_F106_MF217]      ; xmm5=(tmp0)
-        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0)
-
-        paddd     xmm6,xmm1             ; xmm6=tmp2
-        paddd     xmm2,xmm5             ; xmm2=tmp0
-
-        ; -- Even part
-
-        punpcklwd xmm0,xmm3
-        pmaddwd   xmm0,[rel PW_F184_MF076]      ; xmm0=tmp2
-
-        movdqa    xmm7,xmm4
-        paddd     xmm4,xmm0             ; xmm4=tmp10
-        psubd     xmm7,xmm0             ; xmm7=tmp12
-
-        ; -- Final output stage
-
-        movdqa  xmm1,[rel PD_DESCALE_P2_4]      ; xmm1=[rel PD_DESCALE_P2_4]
-
-        movdqa  xmm5,xmm4
-        movdqa  xmm3,xmm7
-        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
-        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
-        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
-        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-
-        paddd   xmm4,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm4,DESCALE_P2_4
-        psrad   xmm7,DESCALE_P2_4
-        paddd   xmm5,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm5,DESCALE_P2_4
-        psrad   xmm3,DESCALE_P2_4
-
-        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
-        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
-
-        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
-        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
-
-        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-        paddb     xmm4,[rel PB_CENTERJSAMP]
-
-        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-        mov     rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-        pcmpeqd   xmm7,xmm7
-        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
-        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
-        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
-        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
-        pmaddwd   xmm4,[rel PW_F362_MF127]
-        pmaddwd   xmm5,[rel PW_F085_MF072]
-
-        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
-        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
-        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
-        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
-        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
-        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
-        pmaddwd xmm0,[rel PW_F362_MF127]
-        pmaddwd xmm2,[rel PW_F085_MF072]
-
-        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
-        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
-
-        ; -- Even part
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
-        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
-        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
-        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-        ; -- Final output stage
-
-        movdqa  xmm3,xmm6
-        movdqa  xmm5,xmm1
-        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-        movdqa  xmm2,[rel PD_DESCALE_P1_2]      ; xmm2=[rel PD_DESCALE_P1_2]
-
-        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
-
-        movdqa     xmm7,xmm1
-        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
-        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
-
-        paddd   xmm6,xmm2
-        psrad   xmm6,DESCALE_P1_2
-
-        paddd   xmm1,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm1,DESCALE_P1_2
-        psrad   xmm7,DESCALE_P1_2
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-        pmaddwd   xmm1,[rel PW_F362_MF127]
-        pmaddwd   xmm7,[rel PW_F085_MF072]
-
-        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
-
-        ; -- Even part
-
-        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
-
-        ; -- Final output stage
-
-        movdqa    xmm4,xmm6
-        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
-
-        paddd     xmm6,[rel PD_DESCALE_P2_2]
-        psrad     xmm6,DESCALE_P2_2
-
-        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-        paddb     xmm6,[rel PB_CENTERJSAMP]
-
-        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
-        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-        mov     WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jidctred-sse2.asm b/simd/jidctred-sse2.asm
deleted file mode 100644
index 232d983..0000000
--- a/simd/jidctred-sse2.asm
+++ /dev/null
@@ -1,593 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076   times 4 dw  F_1_847,-F_0_765
-PW_F256_F089    times 4 dw  F_2_562, F_0_899
-PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm0,xmm1
-        packsswb xmm0,xmm0
-        packsswb xmm0,xmm0
-        movd    eax,xmm0
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm0,PASS1_BITS
-
-        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm5,xmm0
-        punpcklwd xmm4,xmm1
-        punpckhwd xmm5,xmm1
-        movdqa    xmm0,xmm4
-        movdqa    xmm1,xmm5
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]       ; xmm4=(tmp2L)
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]       ; xmm5=(tmp2H)
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm0=(tmp0L)
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm1=(tmp0H)
-
-        movdqa    xmm6,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm6,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm2,xmm6
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2L)
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm7=(tmp2H)
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0L)
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm3=(tmp0H)
-
-        paddd   xmm6,xmm4               ; xmm6=tmp2L
-        paddd   xmm7,xmm5               ; xmm7=tmp2H
-        paddd   xmm2,xmm0               ; xmm2=tmp0L
-        paddd   xmm3,xmm1               ; xmm3=tmp0H
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      xmm1,xmm1
-        pxor      xmm2,xmm2
-        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
-        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
-        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-        movdqa    xmm3,xmm5             ; xmm5=in2=z2
-        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
-        punpckhwd xmm3,xmm0
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm5=tmp2L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm3=tmp2H
-
-        movdqa  xmm4,xmm1
-        movdqa  xmm0,xmm2
-        paddd   xmm1,xmm5               ; xmm1=tmp10L
-        paddd   xmm2,xmm3               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp12L
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        ; -- Final output stage
-
-        movdqa  xmm5,xmm1
-        movdqa  xmm3,xmm2
-        paddd   xmm1,xmm6               ; xmm1=data0L
-        paddd   xmm2,xmm7               ; xmm2=data0H
-        psubd   xmm5,xmm6               ; xmm5=data3L
-        psubd   xmm3,xmm7               ; xmm3=data3H
-
-        movdqa  xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm6=[PD_DESCALE_P1_4]
-
-        paddd   xmm1,xmm6
-        paddd   xmm2,xmm6
-        psrad   xmm1,DESCALE_P1_4
-        psrad   xmm2,DESCALE_P1_4
-        paddd   xmm5,xmm6
-        paddd   xmm3,xmm6
-        psrad   xmm5,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
-
-        movdqa  xmm2,xmm4
-        movdqa  xmm3,xmm0
-        paddd   xmm4,xmm7               ; xmm4=data1L
-        paddd   xmm0,xmm6               ; xmm0=data1H
-        psubd   xmm2,xmm7               ; xmm2=data2L
-        psubd   xmm3,xmm6               ; xmm3=data2H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm7=[PD_DESCALE_P1_4]
-
-        paddd   xmm4,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm4,DESCALE_P1_4
-        psrad   xmm0,DESCALE_P1_4
-        paddd   xmm2,xmm7
-        paddd   xmm3,xmm7
-        psrad   xmm2,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        pxor      xmm4,xmm4
-        punpcklwd xmm4,xmm1             ; xmm4=tmp0
-        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-        ; -- Odd part
-
-        punpckhwd xmm1,xmm0
-        punpckhwd xmm6,xmm3
-        movdqa    xmm5,xmm1
-        movdqa    xmm2,xmm6
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]       ; xmm1=(tmp2)
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2)
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm5=(tmp0)
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0)
-
-        paddd     xmm6,xmm1             ; xmm6=tmp2
-        paddd     xmm2,xmm5             ; xmm2=tmp0
-
-        ; -- Even part
-
-        punpcklwd xmm0,xmm3
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm0=tmp2
-
-        movdqa    xmm7,xmm4
-        paddd     xmm4,xmm0             ; xmm4=tmp10
-        psubd     xmm7,xmm0             ; xmm7=tmp12
-
-        ; -- Final output stage
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]      ; xmm1=[PD_DESCALE_P2_4]
-
-        movdqa  xmm5,xmm4
-        movdqa  xmm3,xmm7
-        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
-        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
-        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
-        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-
-        paddd   xmm4,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm4,DESCALE_P2_4
-        psrad   xmm7,DESCALE_P2_4
-        paddd   xmm5,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm5,DESCALE_P2_4
-        psrad   xmm3,DESCALE_P2_4
-
-        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
-        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
-
-        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
-        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
-
-        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-        paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-        pcmpeqd   xmm7,xmm7
-        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
-        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
-        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
-        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
-        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
-        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
-        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
-        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
-        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
-        pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
-        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
-
-        ; -- Even part
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
-        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
-        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
-        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-        ; -- Final output stage
-
-        movdqa  xmm3,xmm6
-        movdqa  xmm5,xmm1
-        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]      ; xmm2=[PD_DESCALE_P1_2]
-
-        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
-
-        movdqa     xmm7,xmm1
-        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
-        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
-
-        paddd   xmm6,xmm2
-        psrad   xmm6,DESCALE_P1_2
-
-        paddd   xmm1,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm1,DESCALE_P1_2
-        psrad   xmm7,DESCALE_P1_2
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(ebp)]
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
-
-        ; -- Even part
-
-        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
-
-        ; -- Final output stage
-
-        movdqa    xmm4,xmm6
-        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
-
-        paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
-        psrad     xmm6,DESCALE_P2_2
-
-        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-        paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
-        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
-        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquant-3dn.asm b/simd/jquant-3dn.asm
deleted file mode 100644
index 0b4164b..0000000
--- a/simd/jquant-3dn.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                             FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_3dnow)
-
-EXTN(jsimd_convsamp_float_3dnow):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  mm7,mm7
-        psllw    mm7,7
-        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   mm0,mm7                         ; mm0=(01234567)
-        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
-
-        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
-        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
-        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
-        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
-
-        punpcklwd mm4,mm2                       ; mm4=(***0***1)
-        punpckhwd mm2,mm2                       ; mm2=(***2***3)
-        punpcklwd mm5,mm0                       ; mm5=(***4***5)
-        punpckhwd mm0,mm0                       ; mm0=(***6***7)
-
-        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(01)
-        psrad   mm2,(DWORD_BIT-BYTE_BIT)        ; mm2=(23)
-        pi2fd   mm4,mm4
-        pi2fd   mm2,mm2
-        psrad   mm5,(DWORD_BIT-BYTE_BIT)        ; mm5=(45)
-        psrad   mm0,(DWORD_BIT-BYTE_BIT)        ; mm0=(67)
-        pi2fd   mm5,mm5
-        pi2fd   mm0,mm0
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
-        punpcklwd mm6,mm3                       ; mm6=(***8***9)
-        punpckhwd mm3,mm3                       ; mm3=(***A***B)
-        punpcklwd mm4,mm1                       ; mm4=(***C***D)
-        punpckhwd mm1,mm1                       ; mm1=(***E***F)
-
-        psrad   mm6,(DWORD_BIT-BYTE_BIT)        ; mm6=(89)
-        psrad   mm3,(DWORD_BIT-BYTE_BIT)        ; mm3=(AB)
-        pi2fd   mm6,mm6
-        pi2fd   mm3,mm3
-        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(CD)
-        psrad   mm1,(DWORD_BIT-BYTE_BIT)        ; mm1=(EF)
-        pi2fd   mm4,mm4
-        pi2fd   mm1,mm1
-
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .convloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                             FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_3dnow)
-
-EXTN(jsimd_quantize_float_3dnow):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov       eax, 0x4B400000       ; (float)0x00C00000 (rndint_magic)
-        movd      mm7,eax
-        punpckldq mm7,mm7               ; mm7={12582912.0F 12582912.0F}
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
-        pfadd   mm0,mm7                 ; mm0=(00 ** 01 **)
-        pfadd   mm1,mm7                 ; mm1=(02 ** 03 **)
-        pfadd   mm2,mm7                 ; mm0=(04 ** 05 **)
-        pfadd   mm3,mm7                 ; mm1=(06 ** 07 **)
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm1               ; mm0=(00 02 ** **)
-        punpckhwd mm4,mm1               ; mm4=(01 03 ** **)
-        movq      mm5,mm2
-        punpcklwd mm2,mm3               ; mm2=(04 06 ** **)
-        punpckhwd mm5,mm3               ; mm5=(05 07 ** **)
-
-        punpcklwd mm0,mm4               ; mm0=(00 01 02 03)
-        punpcklwd mm2,mm5               ; mm2=(04 05 06 07)
-
-        movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-        pfadd   mm6,mm7                 ; mm0=(10 ** 11 **)
-        pfadd   mm1,mm7                 ; mm4=(12 ** 13 **)
-        pfadd   mm3,mm7                 ; mm0=(14 ** 15 **)
-        pfadd   mm4,mm7                 ; mm4=(16 ** 17 **)
-
-        movq      mm5,mm6
-        punpcklwd mm6,mm1               ; mm6=(10 12 ** **)
-        punpckhwd mm5,mm1               ; mm5=(11 13 ** **)
-        movq      mm1,mm3
-        punpcklwd mm3,mm4               ; mm3=(14 16 ** **)
-        punpckhwd mm1,mm4               ; mm1=(15 17 ** **)
-
-        punpcklwd mm6,mm5               ; mm6=(10 11 12 13)
-        punpcklwd mm3,mm1               ; mm3=(14 15 16 17)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     near .quantloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquant-mmx.asm b/simd/jquant-mmx.asm
deleted file mode 100644
index aed6071..0000000
--- a/simd/jquant-mmx.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                     DCTELEM *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_mmx)
-
-EXTN(jsimd_convsamp_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pxor    mm6,mm6                 ; mm6=(all 0's)
-        pcmpeqw mm7,mm7
-        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)
-
-        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
-        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)
-
-        movq      mm4,mm0
-        punpcklbw mm0,mm6               ; mm0=(0123)
-        punpckhbw mm4,mm6               ; mm4=(4567)
-        movq      mm5,mm1
-        punpcklbw mm1,mm6               ; mm1=(89AB)
-        punpckhbw mm5,mm6               ; mm5=(CDEF)
-
-        paddw   mm0,mm7
-        paddw   mm4,mm7
-        paddw   mm1,mm7
-        paddw   mm5,mm7
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
-        movq      mm0,mm2
-        punpcklbw mm2,mm6               ; mm2=(GHIJ)
-        punpckhbw mm0,mm6               ; mm0=(KLMN)
-        movq      mm4,mm3
-        punpcklbw mm3,mm6               ; mm3=(OPQR)
-        punpckhbw mm4,mm6               ; mm4=(STUV)
-
-        paddw   mm2,mm7
-        paddw   mm0,mm7
-        paddw   mm3,mm7
-        paddw   mm4,mm7
-
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
-        add     esi, byte 4*SIZEOF_JSAMPROW
-        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     short .convloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
-;                     DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM *divisors
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_mmx)
-
-EXTN(jsimd_quantize_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     ah, 2
-        alignx  16,7
-.quantloop1:
-        mov     al, DCTSIZE2/8/2
-        alignx  16,7
-.quantloop2:
-        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
-        movq    mm0,mm2
-        movq    mm1,mm3
-
-        psraw   mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
-        psraw   mm3,(WORD_BIT-1)
-
-        pxor    mm0,mm2   ; val = -val
-        pxor    mm1,mm3
-        psubw   mm0,mm2
-        psubw   mm1,mm3
-
-        ;
-        ; MMX is an annoyingly crappy instruction set. It has two
-        ; misfeatures that are causing problems here:
-        ;
-        ; - All multiplications are signed.
-        ;
-        ; - The second operand for the shifts is not treated as packed.
-        ;
-        ;
-        ; We work around the first problem by implementing this algorithm:
-        ;
-        ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
-        ; {
-        ;   enum { SHORT_BIT = 16 };
-        ;   signed short sx = (signed short) x;
-        ;   signed short sy = (signed short) y;
-        ;   signed long sz;
-        ;
-        ;   sz = (long) sx * (long) sy;     /* signed multiply */
-        ;
-        ;   if (sx < 0) sz += (long) sy << SHORT_BIT;
-        ;   if (sy < 0) sz += (long) sx << SHORT_BIT;
-        ;
-        ;   return (unsigned long) sz;
-        ; }
-        ;
-        ; (note that a negative sx adds _sy_ and vice versa)
-        ;
-        ; For the second problem, we replace the shift by a multiplication.
-        ; Unfortunately that means we have to deal with the signed issue again.
-        ;
-
-        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
-        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]
-
-        movq    mm4,mm0   ; store current value for later
-        movq    mm5,mm1
-        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
-        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
-        paddw   mm0,mm4         ; reciprocal is always negative (MSB=1),
-        paddw   mm1,mm5   ; so we always need to add the initial value
-                        ; (input value is never negative as we
-                        ; inverted it at the start of this routine)
-
-        ; here it gets a bit tricky as both scale
-        ; and mm0/mm1 can be negative
-        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
-        movq    mm7, MMWORD [SCALE(0,1,edx)]
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pmulhw  mm0,mm6
-        pmulhw  mm1,mm7
-
-        psraw   mm6,(WORD_BIT-1)    ; determine if scale is negative
-        psraw   mm7,(WORD_BIT-1)
-
-        pand    mm6,mm4             ; and add input if it is
-        pand    mm7,mm5
-        paddw   mm0,mm6
-        paddw   mm1,mm7
-
-        psraw   mm4,(WORD_BIT-1)    ; then check if negative input
-        psraw   mm5,(WORD_BIT-1)
-
-        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add scale if it is
-        pand    mm5, MMWORD [SCALE(0,1,edx)]
-        paddw   mm0,mm4
-        paddw   mm1,mm5
-
-        pxor    mm0,mm2   ; val = -val
-        pxor    mm1,mm3
-        psubw   mm0,mm2
-        psubw   mm1,mm3
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
-        add     esi, byte 8*SIZEOF_DCTELEM
-        add     edx, byte 8*SIZEOF_DCTELEM
-        add     edi, byte 8*SIZEOF_JCOEF
-        dec     al
-        jnz     near .quantloop2
-        dec     ah
-        jnz     near .quantloop1        ; to avoid branch misprediction
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquant-sse.asm b/simd/jquant-sse.asm
deleted file mode 100644
index 1baf88f..0000000
--- a/simd/jquant-sse.asm
+++ /dev/null
@@ -1,210 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                           FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse)
-
-EXTN(jsimd_convsamp_float_sse):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  mm7,mm7
-        psllw    mm7,7
-        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   mm0,mm7                         ; mm0=(01234567)
-        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
-
-        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
-        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
-        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
-        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
-
-        punpcklwd mm4,mm2                       ; mm4=(***0***1)
-        punpckhwd mm2,mm2                       ; mm2=(***2***3)
-        punpcklwd mm5,mm0                       ; mm5=(***4***5)
-        punpckhwd mm0,mm0                       ; mm0=(***6***7)
-
-        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
-        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
-        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
-        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
-        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
-        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
-        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
-        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)
-
-        punpcklwd mm6,mm3                       ; mm6=(***8***9)
-        punpckhwd mm3,mm3                       ; mm3=(***A***B)
-        punpcklwd mm4,mm1                       ; mm4=(***C***D)
-        punpckhwd mm1,mm1                       ; mm1=(***E***F)
-
-        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
-        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
-        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
-        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
-        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
-        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
-        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
-        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)
-
-        movlhps   xmm0,xmm1                     ; xmm0=(0123)
-        movlhps   xmm2,xmm3                     ; xmm2=(4567)
-        movlhps   xmm4,xmm5                     ; xmm4=(89AB)
-        movlhps   xmm6,xmm7                     ; xmm6=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .convloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                           FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse)
-
-EXTN(jsimd_quantize_float_sse):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        movhlps  xmm4,xmm0
-        movhlps  xmm5,xmm1
-
-        cvtps2pi mm0,xmm0
-        cvtps2pi mm1,xmm1
-        cvtps2pi mm4,xmm4
-        cvtps2pi mm5,xmm5
-
-        movhlps  xmm6,xmm2
-        movhlps  xmm7,xmm3
-
-        cvtps2pi mm2,xmm2
-        cvtps2pi mm3,xmm3
-        cvtps2pi mm6,xmm6
-        cvtps2pi mm7,xmm7
-
-        packssdw mm0,mm4
-        packssdw mm1,mm5
-        packssdw mm2,mm6
-        packssdw mm3,mm7
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     short .quantloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm
deleted file mode 100644
index ef5c1f9..0000000
--- a/simd/jquantf-sse2-64.asm
+++ /dev/null
@@ -1,157 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        pcmpeqw  xmm7,xmm7
-        psllw    xmm7,7
-        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov rsi, r10
-        mov     eax, r11d
-        mov rdi, r12
-        mov     rcx, DCTSIZE/2
-.convloop:
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
-        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
-        psubb   xmm0,xmm7                       ; xmm0=(01234567)
-        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
-
-        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
-        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
-
-        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
-        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
-        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
-        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
-
-        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
-        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
-        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
-        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
-        add     rsi, byte 2*SIZEOF_JSAMPROW
-        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     short .convloop
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                         FAST_FLOAT *workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT *divisors
-; r12 = FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov rsi, r12
-        mov rdx, r11
-        mov rdi, r10
-        mov     rax, DCTSIZE2/16
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        cvtps2dq xmm0,xmm0
-        cvtps2dq xmm1,xmm1
-        cvtps2dq xmm2,xmm2
-        cvtps2dq xmm3,xmm3
-
-        packssdw xmm0,xmm1
-        packssdw xmm2,xmm3
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
-        add     rsi, byte 16*SIZEOF_FAST_FLOAT
-        add     rdx, byte 16*SIZEOF_FAST_FLOAT
-        add     rdi, byte 16*SIZEOF_JCOEF
-        dec     rax
-        jnz     short .quantloop
-
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquantf-sse2.asm b/simd/jquantf-sse2.asm
deleted file mode 100644
index 1cbc267..0000000
--- a/simd/jquantf-sse2.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  xmm7,xmm7
-        psllw    xmm7,7
-        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   xmm0,xmm7                       ; xmm0=(01234567)
-        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
-
-        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
-        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
-
-        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
-        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
-        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
-        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
-
-        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
-        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
-        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
-        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     short .convloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                            FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        cvtps2dq xmm0,xmm0
-        cvtps2dq xmm1,xmm1
-        cvtps2dq xmm2,xmm2
-        cvtps2dq xmm3,xmm3
-
-        packssdw xmm0,xmm1
-        packssdw xmm2,xmm3
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     short .quantloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm
deleted file mode 100644
index 66c4e51..0000000
--- a/simd/jquanti-sse2-64.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        pxor    xmm6,xmm6               ; xmm6=(all 0's)
-        pcmpeqw xmm7,xmm7
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        mov rsi, r10
-        mov eax, r11d
-        mov rdi, r12
-        mov     rcx, DCTSIZE/4
-.convloop:
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
-        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
-
-        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
-        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
-
-        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
-        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
-        paddw     xmm0,xmm7
-        paddw     xmm1,xmm7
-        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
-        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
-        paddw     xmm2,xmm7
-        paddw     xmm3,xmm7
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-        add     rsi, byte 4*SIZEOF_JSAMPROW
-        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     rcx
-        jnz     short .convloop
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-;                      DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM *divisors
-; r12 = DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov rsi, r12
-        mov rdx, r11
-        mov rdi, r10
-        mov     rax, DCTSIZE2/32
-.quantloop:
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm0,xmm4
-        movdqa  xmm1,xmm5
-        movdqa  xmm2,xmm6
-        movdqa  xmm3,xmm7
-        psraw   xmm4,(WORD_BIT-1)
-        psraw   xmm5,(WORD_BIT-1)
-        psraw   xmm6,(WORD_BIT-1)
-        psraw   xmm7,(WORD_BIT-1)
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
-        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
-        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
-        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
-
-        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
-        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
-        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
-        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
-        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
-        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
-        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
-        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
-        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
-        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
-        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
-        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
-
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4
-        psubw   xmm1,xmm5
-        psubw   xmm2,xmm6
-        psubw   xmm3,xmm7
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-        add     rsi, byte 32*SIZEOF_DCTELEM
-        add     rdx, byte 32*SIZEOF_DCTELEM
-        add     rdi, byte 32*SIZEOF_JCOEF
-        dec     rax
-        jnz     near .quantloop
-
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jquanti-sse2.asm b/simd/jquanti-sse2.asm
deleted file mode 100644
index aea8604..0000000
--- a/simd/jquanti-sse2.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pxor    xmm6,xmm6               ; xmm6=(all 0's)
-        pcmpeqw xmm7,xmm7
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
-        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
-
-        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
-        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
-
-        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
-        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
-        paddw     xmm0,xmm7
-        paddw     xmm1,xmm7
-        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
-        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
-        paddw     xmm2,xmm7
-        paddw     xmm3,xmm7
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-        add     esi, byte 4*SIZEOF_JSAMPROW
-        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     short .convloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-;                      DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM *divisors
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/32
-        alignx  16,7
-.quantloop:
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm0,xmm4
-        movdqa  xmm1,xmm5
-        movdqa  xmm2,xmm6
-        movdqa  xmm3,xmm7
-        psraw   xmm4,(WORD_BIT-1)
-        psraw   xmm5,(WORD_BIT-1)
-        psraw   xmm6,(WORD_BIT-1)
-        psraw   xmm7,(WORD_BIT-1)
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
-        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
-        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
-        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
-
-        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
-        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
-        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
-        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
-        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
-        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
-        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
-        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
-        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]  ; scale
-        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
-        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
-        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
-
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4
-        psubw   xmm1,xmm5
-        psubw   xmm2,xmm6
-        psubw   xmm3,xmm7
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-        add     esi, byte 32*SIZEOF_DCTELEM
-        add     edx, byte 32*SIZEOF_DCTELEM
-        add     edi, byte 32*SIZEOF_JCOEF
-        dec     eax
-        jnz     near .quantloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jsimd.h b/simd/jsimd.h
index dc6ec43..8d25f8b 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,10 +2,11 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
  * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,857 +16,1059 @@
 
 /* Bitmask for supported acceleration methods */
 
-#define JSIMD_NONE       0x00
-#define JSIMD_MMX        0x01
-#define JSIMD_3DNOW      0x02
-#define JSIMD_SSE        0x04
-#define JSIMD_SSE2       0x08
-#define JSIMD_ARM_NEON   0x10
-#define JSIMD_MIPS_DSPR2 0x20
-#define JSIMD_ALTIVEC    0x40
+#define JSIMD_NONE    0x00
+#define JSIMD_MMX     0x01
+#define JSIMD_3DNOW   0x02
+#define JSIMD_SSE     0x04
+#define JSIMD_SSE2    0x08
+#define JSIMD_NEON    0x10
+#define JSIMD_DSPR2   0x20
+#define JSIMD_ALTIVEC 0x40
+#define JSIMD_AVX2    0x80
+#define JSIMD_MMI     0x100
 
 /* SIMD Ext: retrieve SIMD/CPU information */
-EXTERN(unsigned int) jpeg_simd_cpu_support (void);
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
 
 /* RGB & extended RGB --> YCC Colorspace Conversion */
 EXTERN(void) jsimd_rgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_ycc_convert_sse2[];
 EXTERN(void) jsimd_rgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
-EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 /* RGB & extended RGB --> Grayscale Colorspace Conversion */
 EXTERN(void) jsimd_rgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_gray_convert_sse2[];
 EXTERN(void) jsimd_rgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
-EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 /* YCC --> RGB & extended RGB Colorspace Conversion */
 EXTERN(void) jsimd_ycc_rgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 extern const int jconst_ycc_rgb_convert_sse2[];
 EXTERN(void) jsimd_ycc_rgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_rgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_rgb565_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
-EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_rgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 /* NULL Colorspace Conversion */
-EXTERN(void) jsimd_c_null_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows, int num_components);
+EXTERN(void) jsimd_c_null_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows, int num_components);
 
 /* h2v1 Downsampling */
 EXTERN(void) jsimd_h2v1_downsample_mmx
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_sse2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_neon
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_altivec
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 /* h2v2 Downsampling */
 EXTERN(void) jsimd_h2v2_downsample_mmx
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_sse2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_neon
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mmi
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_altivec
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 /* h2v2 Smooth Downsampling */
-EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
-        (JSAMPARRAY input_data, JSAMPARRAY output_data,
-         JDIMENSION v_samp_factor, int max_v_samp_factor,
-         int smoothing_factor, JDIMENSION width_blocks,
-         JDIMENSION image_width);
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+  (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+   int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+   JDIMENSION image_width);
 
 
 /* Upsampling */
 EXTERN(void) jsimd_h2v1_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_int_upsample_mips_dspr2
-        (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
-         int max_v_samp_factor);
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+  (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+   int max_v_samp_factor);
 
 EXTERN(void) jsimd_h2v1_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 /* Fancy Upsampling */
 EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 extern const int jconst_fancy_upsample_sse2[];
 EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 /* Merged Upsampling */
 EXTERN(void) jsimd_h2v1_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 extern const int jconst_merged_upsample_sse2[];
 EXTERN(void) jsimd_h2v1_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
 
 EXTERN(void) jsimd_h2v1_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 /* Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_neon
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
-EXTERN(void) jsimd_convsamp_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_altivec
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 /* Floating Point Sample Conversion */
 EXTERN(void) jsimd_convsamp_float_3dnow
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
-EXTERN(void) jsimd_convsamp_float_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp_float_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 /* Slow Integer Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
 
 extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
 
 /* Fast Integer Forward DCT */
-EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
 
 extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
 
 /* Floating Point Forward DCT */
-EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
 
 extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
 
 /* Quantization */
 EXTERN(void) jsimd_quantize_mmx
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_sse2
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_neon
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_mips_dspr2
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_dspr2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_mmi
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_altivec
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 /* Floating Point Quantization */
 EXTERN(void) jsimd_quantize_float_3dnow
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse2
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
-EXTERN(void) jsimd_quantize_float_mips_dspr2
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize_float_dspr2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 /* Scaled Inverse DCT */
 EXTERN(void) jsimd_idct_2x2_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_red_sse2[];
 EXTERN(void) jsimd_idct_2x2_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_2x2_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_2x2_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col, int *workspace);
-EXTERN(void) jsimd_idct_6x6_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
-        (JCOEFPTR coef_block, void *dct_table, int *workspace);
-EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
-        (int *workspace, int *output);
+EXTERN(void) jsimd_idct_2x2_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+  (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+  (int *workspace, int *output);
 
 /* Slow Integer Inverse DCT */
 EXTERN(void) jsimd_idct_islow_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_islow_sse2[];
 EXTERN(void) jsimd_idct_islow_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_islow_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, int *output_buf,
-         JSAMPLE *output_col);
+EXTERN(void) jsimd_idct_islow_dspr2
+  (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_altivec
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Fast Integer Inverse DCT */
 EXTERN(void) jsimd_idct_ifast_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_ifast_sse2[];
 EXTERN(void) jsimd_idct_ifast_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_ifast_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
-        (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
-         const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
-        (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
-         const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+  (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+   const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+  (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+   const int *idct_coefs);
 
 EXTERN(void) jsimd_idct_ifast_altivec
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Floating Point Inverse DCT */
 EXTERN(void) jsimd_idct_float_3dnow
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse[];
 EXTERN(void) jsimd_idct_float_sse
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse2[];
 EXTERN(void) jsimd_idct_float_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Huffman coding */
 extern const int jconst_huff_encode_one_block[];
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c
deleted file mode 100644
index 0b955cd..0000000
--- a/simd/jsimd_arm.c
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * jsimd_arm.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
-  char *p;
-  if (*feature == 0)
-    return 0;
-  if (strncmp(buffer, "Features", 8) != 0)
-    return 0;
-  buffer += 8;
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'feature' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, feature))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(feature);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-  simd_support = 0;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_feature(buffer, "neon"))
-        simd_support |= JSIMD_ARM_NEON;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__ARM_NEON__)
-  simd_support |= JSIMD_ARM_NEON;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  /* We still have a chance to use NEON regardless of globally used
-   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
-   * /proc/cpuinfo parsing on linux/android */
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_extrgbx_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      neonfct=jsimd_extbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_extbgrx_ycc_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_extxbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_extxrgb_ycc_convert_neon;
-      break;
-    default:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
-      break;
-  }
-
-  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_ycc_extrgbx_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      neonfct=jsimd_ycc_extbgr_convert_neon;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_ycc_extbgrx_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_ycc_extxbgr_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_ycc_extxrgb_convert_neon;
-      break;
-    default:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
-      break;
-  }
-
-  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/simd/jsimd_arm64.c b/simd/jsimd_arm64.c
deleted file mode 100644
index f6e9736..0000000
--- a/simd/jsimd_arm64.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- * jsimd_arm64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
-#define JSIMD_FASTTBL 4
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
-                                    JSIMD_FASTTBL;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_cpuinfo (char *buffer, const char *field, char *value)
-{
-  char *p;
-  if (*value == 0)
-    return 0;
-  if (strncmp(buffer, field, strlen(field)) != 0)
-    return 0;
-  buffer += strlen(field);
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'value' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, value))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(value);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
-          check_cpuinfo(buffer, "CPU part", "0xd07"))
-        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
-           percent speedup by disabling the use of that instruction.  The
-           speedup on Cortex-A57 is more subtle but still measurable. */
-        simd_features &= ~JSIMD_FASTTBL;
-      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
-        /* The SIMD version of Huffman encoding is slower than the C version on
-           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
-           CPU. */
-        simd_huffman = simd_features = 0;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-
-/*
- * ARMv8 architectures support NEON extensions by default.
- * It is no longer optional as it was with ARMv7.
- */
-
-
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-  simd_support |= JSIMD_ARM_NEON;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-  env = getenv("JSIMD_FASTLD3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_features |= JSIMD_FASTLD3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
-    simd_features &= ~JSIMD_FASTLD3;
-  env = getenv("JSIMD_FASTST3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_features |= JSIMD_FASTST3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
-    simd_features &= ~JSIMD_FASTST3;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extrgb_ycc_convert_neon;
-      else
-        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_extrgbx_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extbgr_ycc_convert_neon;
-      else
-        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_extbgrx_ycc_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_extxbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_extxrgb_ycc_convert_neon;
-      break;
-    default:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extrgb_ycc_convert_neon;
-      else
-        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
-      break;
-  }
-
-  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extrgb_convert_neon;
-      else
-        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_ycc_extrgbx_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extbgr_convert_neon;
-      else
-        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_ycc_extbgrx_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_ycc_extxbgr_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_ycc_extxrgb_convert_neon;
-      break;
-    default:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extrgb_convert_neon;
-      else
-        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
-      break;
-  }
-
-  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  if (simd_features & JSIMD_FASTTBL)
-    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
-                                            dctbl, actbl);
-  else
-    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
-                                                    last_dc_val, dctbl, actbl);
-}
diff --git a/simd/jsimd_i386.c b/simd/jsimd_i386.c
deleted file mode 100644
index 6da8bd8..0000000
--- a/simd/jsimd_i386.c
+++ /dev/null
@@ -1,1091 +0,0 @@
-/*
- * jsimd_i386.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = jpeg_simd_cpu_support();
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEMMX");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_MMX;
-  env = getenv("JSIMD_FORCE3DNOW");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_3DNOW|JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_SSE|JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_SSE2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_ycc_convert_sse2;
-      mmxfct=jsimd_extrgb_ycc_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
-      mmxfct=jsimd_extrgbx_ycc_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_ycc_convert_sse2;
-      mmxfct=jsimd_extbgr_ycc_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
-      mmxfct=jsimd_extbgrx_ycc_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
-      mmxfct=jsimd_extxbgr_ycc_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
-      mmxfct=jsimd_extxrgb_ycc_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_rgb_ycc_convert_sse2;
-      mmxfct=jsimd_rgb_ycc_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_gray_convert_sse2;
-      mmxfct=jsimd_extrgb_gray_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_gray_convert_sse2;
-      mmxfct=jsimd_extrgbx_gray_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_gray_convert_sse2;
-      mmxfct=jsimd_extbgr_gray_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_gray_convert_sse2;
-      mmxfct=jsimd_extbgrx_gray_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_gray_convert_sse2;
-      mmxfct=jsimd_extxbgr_gray_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_gray_convert_sse2;
-      mmxfct=jsimd_extxrgb_gray_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_rgb_gray_convert_sse2;
-      mmxfct=jsimd_rgb_gray_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_ycc_extrgb_convert_sse2;
-      mmxfct=jsimd_ycc_extrgb_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
-      mmxfct=jsimd_ycc_extrgbx_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_ycc_extbgr_convert_sse2;
-      mmxfct=jsimd_ycc_extbgr_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
-      mmxfct=jsimd_ycc_extbgrx_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
-      mmxfct=jsimd_ycc_extxbgr_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
-      mmxfct=jsimd_ycc_extxrgb_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_ycc_rgb_convert_sse2;
-      mmxfct=jsimd_ycc_rgb_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                               compptr->v_samp_factor,
-                               compptr->width_in_blocks, input_data,
-                               output_data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-                              compptr->v_samp_factor, compptr->width_in_blocks,
-                              input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                               compptr->v_samp_factor,
-                               compptr->width_in_blocks, input_data,
-                               output_data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-                              compptr->v_samp_factor, compptr->width_in_blocks,
-                              input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                             input_data, output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
-                            input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                             input_data, output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
-                            input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-                                  compptr->downsampled_width, input_data,
-                                  output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-                                  compptr->downsampled_width, input_data,
-                                  output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
-      break;
-    default:
-      sse2fct=jsimd_h2v2_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_merged_upsample_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
-      break;
-    default:
-      sse2fct=jsimd_h2v1_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_merged_upsample_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_SSE)
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_convsamp_sse2(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_convsamp_mmx(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_SSE)
-    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    jsimd_fdct_islow_sse2(data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_fdct_islow_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    jsimd_fdct_ifast_sse2(data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_fdct_ifast_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    jsimd_fdct_float_sse(data);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_fdct_float_3dnow(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_SSE)
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_quantize_sse2(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_quantize_mmx(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_SSE)
-    jsimd_quantize_float_sse(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-  if (sizeof(FLOAT_MULT_TYPE) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    return 1;
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
-    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
-      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
deleted file mode 100644
index 02e90cd..0000000
--- a/simd/jsimd_mips.c
+++ /dev/null
@@ -1,1140 +0,0 @@
-/*
- * jsimd_mips.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * MIPS architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__)
-
-LOCAL(int)
-parse_proc_cpuinfo(const char* search_string)
-{
-  const char* file_name = "/proc/cpuinfo";
-  char cpuinfo_line[256];
-  FILE* f = NULL;
-  simd_support = 0;
-
-  if ((f = fopen(file_name, "r")) != NULL) {
-    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
-      if (strstr(cpuinfo_line, search_string) != NULL) {
-        fclose(f);
-        simd_support |= JSIMD_MIPS_DSPR2;
-        return 1;
-      }
-    }
-    fclose(f);
-  }
-  /* Did not find string in the proc file, or not Linux ELF. */
-  return 0;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-  simd_support |= JSIMD_MIPS_DSPR2;
-#elif defined(__linux__)
-  /* We still have a chance to use MIPS DSPR2 regardless of globally used
-   * -mdspr2 options passed to gcc by performing runtime detection via
-   * /proc/cpuinfo parsing on linux */
-  if (!parse_proc_cpuinfo("MIPS 74K"))
-    return;
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEDSPR2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_MIPS_DSPR2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-}
-
-static const int mips_idct_ifast_coefs[4] = {
-  0x45404540,           // FIX( 1.082392200 / 2) =  17734 = 0x4546
-  0x5A805A80,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82
-  0x76407640,           // FIX( 1.847759065 / 2) =  30274 = 0x7642
-  0xAC60AC60            // FIX(-2.613125930 / 4) = -21407 = 0xAC61
-};
-
-/* The following struct is borrowed from jdsample.c */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JSAMPARRAY input_data,
-                               JSAMPARRAY *output_data_ptr);
-
-typedef struct {
-  struct jpeg_upsampler pub;
-  JSAMPARRAY color_buf[MAX_COMPONENTS];
-  upsample1_ptr methods[MAX_COMPONENTS];
-  int next_row_out;
-  JDIMENSION rows_to_go;
-  int rowgroup_height[MAX_COMPONENTS];
-  UINT8 h_expand[MAX_COMPONENTS];
-  UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
-
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
-      break;
-  default:
-      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
-                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                      JDIMENSION output_row, int num_rows)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf,
-                                    output_buf, output_row, num_rows,
-                                    cinfo->num_components);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if(DCTSIZE != 8)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
-                                     cinfo->max_v_samp_factor,
-                                     compptr->v_samp_factor,
-                                     compptr->width_in_blocks, input_data,
-                                     output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
-                              jpeg_component_info *compptr,
-                              JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
-                                          compptr->v_samp_factor,
-                                          cinfo->max_v_samp_factor,
-                                          cinfo->smoothing_factor,
-                                          compptr->width_in_blocks,
-                                          cinfo->image_width);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
-                                     cinfo->max_v_samp_factor,
-                                     compptr->v_samp_factor,
-                                     compptr->width_in_blocks,
-                                     input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                   cinfo->output_width, input_data,
-                                   output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                   cinfo->output_width, input_data,
-                                   output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-
-  jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index],
-                                upsample->v_expand[compptr->component_index],
-                                input_data, output_data_ptr,
-                                cinfo->output_width,
-                                cinfo->max_v_samp_factor);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                         compptr->downsampled_width,
-                                         input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                         compptr->downsampled_width,
-                                         input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
-                       JSAMPLE *);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
-      break;
-  }
-
-  mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
-               cinfo->sample_range_limit);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
-                       JSAMPLE *);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
-      break;
-  }
-
-  mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
-               cinfo->sample_range_limit);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  if ((simd_support & JSIMD_MIPS_DSPR2))
-    jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_fdct_islow_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_fdct_ifast_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12 (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                              output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int workspace[DCTSIZE*4];  /* buffers data between passes */
-    jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                              output_col, workspace);
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-           JCOEFPTR coef_block, JSAMPARRAY output_buf,
-           JDIMENSION output_col)
-{
-    if (simd_support & JSIMD_MIPS_DSPR2)
-      jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                                output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block,
-                  JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int workspace[96];
-    int output[12] = {
-      (int)(output_buf[0] + output_col),
-      (int)(output_buf[1] + output_col),
-      (int)(output_buf[2] + output_col),
-      (int)(output_buf[3] + output_col),
-      (int)(output_buf[4] + output_col),
-      (int)(output_buf[5] + output_col),
-      (int)(output_buf[6] + output_col),
-      (int)(output_buf[7] + output_col),
-      (int)(output_buf[8] + output_col),
-      (int)(output_buf[9] + output_col),
-      (int)(output_buf[10] + output_col),
-      (int)(output_buf[11] + output_col),
-    };
-    jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table,
-                                      workspace);
-    jsimd_idct_12x12_pass2_mips_dspr2(workspace, output);
-  }
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int output[8] = {
-      (int)(output_buf[0] + output_col),
-      (int)(output_buf[1] + output_col),
-      (int)(output_buf[2] + output_col),
-      (int)(output_buf[3] + output_col),
-      (int)(output_buf[4] + output_col),
-      (int)(output_buf[5] + output_col),
-      (int)(output_buf[6] + output_col),
-      (int)(output_buf[7] + output_col),
-    };
-
-    jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table,
-                                output, IDCT_range_limit(cinfo));
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    JCOEFPTR inptr;
-    IFAST_MULT_TYPE *quantptr;
-    DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
-
-    /* Pass 1: process columns from input, store into work array. */
-
-    inptr = coef_block;
-    quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-
-    jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
-                                     workspace, mips_idct_ifast_coefs);
-
-    /* Pass 2: process rows from work array, store into output array. */
-    /* Note that we must descale the results by a factor of 8 == 2**3, */
-    /* and also undo the PASS1_BITS scaling. */
-
-    jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
-                                     output_col, mips_idct_ifast_coefs);
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return NULL;
-}
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
deleted file mode 100644
index c26dd5c..0000000
--- a/simd/jsimd_mips_dspr2.S
+++ /dev/null
@@ -1,4486 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
- *           Darko Laus       (darko.laus@imgtec.com)
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#include "jsimd_mips_dspr2_asm.h"
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- * 20(sp) - cinfo->num_components
- *
- * Null conversion for compression
- */
-
-    SAVE_REGS_ON_STACK 8, s0, s1
-
-    lw        t9, 24(sp)   // t9 = num_rows
-    lw        s0, 28(sp)   // s0 = cinfo->num_components
-    andi      t0, a0, 3    // t0 = cinfo->image_width & 3
-    beqz      t0, 4f       // no residual
-     nop
-0:
-    addiu     t9, t9, -1
-    bltz      t9, 7f
-     li       t1, 0
-1:
-    sll       t3, t1, 2
-    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
-    lw        t2, 0(a1)    // t2 = inptr = *input_buf
-    sll       t4, a3, 2
-    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
-    addu      t2, t2, t1
-    addu      s1, t5, a0
-    addu      t6, t5, t0
-2:
-    lbu       t3, 0(t2)
-    addiu     t5, t5, 1
-    sb        t3, -1(t5)
-    bne       t6, t5, 2b
-     addu     t2, t2, s0
-3:
-    lbu       t3, 0(t2)
-    addu      t4, t2, s0
-    addu      t7, t4, s0
-    addu      t8, t7, s0
-    addu      t2, t8, s0
-    lbu       t4, 0(t4)
-    lbu       t7, 0(t7)
-    lbu       t8, 0(t8)
-    addiu     t5, t5, 4
-    sb        t3, -4(t5)
-    sb        t4, -3(t5)
-    sb        t7, -2(t5)
-    bne       s1, t5, 3b
-     sb       t8, -1(t5)
-    addiu     t1, t1, 1
-    bne       t1, s0, 1b
-     nop
-    addiu     a1, a1, 4
-    bgez      t9, 0b
-     addiu    a3, a3, 1
-    b         7f
-     nop
-4:
-    addiu     t9, t9, -1
-    bltz      t9, 7f
-     li       t1, 0
-5:
-    sll       t3, t1, 2
-    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
-    lw        t2, 0(a1)    // t2 = inptr = *input_buf
-    sll       t4, a3, 2
-    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
-    addu      t2, t2, t1
-    addu      s1, t5, a0
-    addu      t6, t5, t0
-6:
-    lbu       t3, 0(t2)
-    addu      t4, t2, s0
-    addu      t7, t4, s0
-    addu      t8, t7, s0
-    addu      t2, t8, s0
-    lbu       t4, 0(t4)
-    lbu       t7, 0(t7)
-    lbu       t8, 0(t8)
-    addiu     t5, t5, 4
-    sb        t3, -4(t5)
-    sb        t4, -3(t5)
-    sb        t7, -2(t5)
-    bne       s1, t5, 6b
-     sb       t8, -1(t5)
-    addiu     t1, t1, 1
-    bne       t1, s0, 5b
-     nop
-    addiu     a1, a1, 4
-    bgez      t9, 4b
-     addiu    a3, a3, 1
-7:
-    RESTORE_REGS_FROM_STACK 8, s0, s1
-
-    j         ra
-     nop
-
-END(jsimd_c_null_convert_mips_dspr2)
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_ycc_convert_mips_dspr2
- * jsimd_extbgr_ycc_convert_mips_dspr2
- * jsimd_extrgbx_ycc_convert_mips_dspr2
- * jsimd_extbgrx_ycc_convert_mips_dspr2
- * jsimd_extxbgr_ycc_convert_mips_dspr2
- * jsimd_extxrgb_ycc_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_YCC r,    \
-                     g,    \
-                     b,    \
-                     inptr
-    lbu     \r, \r_offs(\inptr)
-    lbu     \g, \g_offs(\inptr)
-    lbu     \b, \b_offs(\inptr)
-    addiu   \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw      t7, 48(sp)        // t7 = num_rows
-    li      s0, 0x4c8b        // FIX(0.29900)
-    li      s1, 0x9646        // FIX(0.58700)
-    li      s2, 0x1d2f        // FIX(0.11400)
-    li      s3, 0xffffd4cd    // -FIX(0.16874)
-    li      s4, 0xffffab33    // -FIX(0.33126)
-    li      s5, 0x8000        // FIX(0.50000)
-    li      s6, 0xffff94d1    // -FIX(0.41869)
-    li      s7, 0xffffeb2f    // -FIX(0.08131)
-    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
-
-0:
-    addiu   t7, -1            // --num_rows
-    lw      t6, 0(a1)         // t6 = input_buf[0]
-    lw      t0, 0(a2)
-    lw      t1, 4(a2)
-    lw      t2, 8(a2)
-    sll     t3, a3, 2
-    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
-    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
-    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
-
-    addu    t9, t2, a0        // t9 = end address
-    addiu   a3, 1
-
-1:
-    DO_RGB_TO_YCC t3, t4, t5, t6
-
-    mtlo    s5, $ac0
-    mtlo    t8, $ac1
-    mtlo    t8, $ac2
-    maddu   $ac0, s2, t5
-    maddu   $ac1, s5, t5
-    maddu   $ac2, s5, t3
-    maddu   $ac0, s0, t3
-    maddu   $ac1, s3, t3
-    maddu   $ac2, s6, t4
-    maddu   $ac0, s1, t4
-    maddu   $ac1, s4, t4
-    maddu   $ac2, s7, t5
-    extr.w  t3, $ac0, 16
-    extr.w  t4, $ac1, 16
-    extr.w  t5, $ac2, 16
-    sb      t3, 0(t0)
-    sb      t4, 0(t1)
-    sb      t5, 0(t2)
-    addiu   t0, 1
-    addiu   t2, 1
-    bne     t2, t9, 1b
-     addiu  t1, 1
-    bgtz    t7, 0b
-     addiu  a1, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_YCC
-
-.endm
-
-/*------------------------------------------id -- pix R  G  B */
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
-
-/*****************************************************************************/
-/*
- * jsimd_ycc_extrgb_convert_mips_dspr2
- * jsimd_ycc_extbgr_convert_mips_dspr2
- * jsimd_ycc_extrgbx_convert_mips_dspr2
- * jsimd_ycc_extbgrx_convert_mips_dspr2
- * jsimd_ycc_extxbgr_convert_mips_dspr2
- * jsimd_ycc_extxrgb_convert_mips_dspr2
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
-
-.macro STORE_YCC_TO_RGB  scratch0 \
-                         scratch1 \
-                         scratch2 \
-                         outptr
-    sb       \scratch0, \r_offs(\outptr)
-    sb       \scratch1, \g_offs(\outptr)
-    sb       \scratch2, \b_offs(\outptr)
-.if (\pixel_size == 4)
-    li       t0, 0xFF
-    sb       t0, \a_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - input_row
- * a3     - output_buf
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw         s1, 48(sp)
-    li         t3, 0x8000
-    li         t4, 0x166e9     // FIX(1.40200)
-    li         t5, 0x1c5a2     // FIX(1.77200)
-    li         t6, 0xffff492e  // -FIX(0.71414)
-    li         t7, 0xffffa7e6  // -FIX(0.34414)
-    repl.ph    t8, 128
-
-0:
-    lw         s0, 0(a3)
-    lw         t0, 0(a1)
-    lw         t1, 4(a1)
-    lw         t2, 8(a1)
-    sll        s5, a2, 2
-    addiu      s1, -1
-    lwx        s2, s5(t0)
-    lwx        s3, s5(t1)
-    lwx        s4, s5(t2)
-    addu       t9, s2, a0
-    addiu      a2, 1
-
-1:
-    lbu        s7, 0(s4)       // cr
-    lbu        s6, 0(s3)       // cb
-    lbu        s5, 0(s2)       // y
-    addiu      s2, 1
-    addiu      s4, 1
-    addiu      s7, -128
-    addiu      s6, -128
-    mul        t2, t7, s6
-    mul        t0, t6, s7      // Crgtab[cr]
-    sll        s7, 15
-    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
-    sll        s6, 15
-    addu       t2, t3          // Cbgtab[cb]
-    addu       t2, t0
-
-    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
-    sra        t2, 16
-    addu       t1, s5
-    addu       t2, s5          // add y
-    ins        t2, t1, 16, 16
-    subu.ph    t2, t2, t8
-    addu       t0, s5
-    shll_s.ph  t2, t2, 8
-    subu       t0, 128
-    shra.ph    t2, t2, 8
-    shll_s.w   t0, t0, 24
-    addu.ph    t2, t2, t8      // clip & store
-    sra        t0, t0, 24
-    sra        t1, t2, 16
-    addiu      t0, 128
-
-    STORE_YCC_TO_RGB t1, t2, t0, s0
-
-    bne        s2, t9, 1b
-     addiu     s3, 1
-    bgtz       s1, 0b
-     addiu     a3, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-
-.purgem STORE_YCC_TO_RGB
-
-.endm
-
-/*------------------------------------------id -- pix R  G  B  A */
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_gray_convert_mips_dspr2
- * jsimd_extbgr_gray_convert_mips_dspr2
- * jsimd_extrgbx_gray_convert_mips_dspr2
- * jsimd_extbgrx_gray_convert_mips_dspr2
- * jsimd_extxbgr_gray_convert_mips_dspr2
- * jsimd_extxrgb_gray_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> GRAY
- */
-
-.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_GRAY r,    \
-                      g,    \
-                      b,    \
-                      inptr
-    lbu     \r, \r_offs(\inptr)
-    lbu     \g, \g_offs(\inptr)
-    lbu     \b, \b_offs(\inptr)
-    addiu   \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    li      s0, 0x4c8b             // s0 = FIX(0.29900)
-    li      s1, 0x9646             // s1 = FIX(0.58700)
-    li      s2, 0x1d2f             // s2 = FIX(0.11400)
-    li      s7, 0x8000             // s7 = FIX(0.50000)
-    lw      s6, 48(sp)
-    andi    t7, a0, 3
-
-0:
-    addiu   s6, -1                 // s6 = num_rows
-    lw      t0, 0(a1)
-    lw      t1, 0(a2)
-    sll     t3, a3, 2
-    lwx     t1, t3(t1)
-    addiu   a3, 1
-    addu    t9, t1, a0
-    subu    t8, t9, t7
-    beq     t1, t8, 2f
-     nop
-
-1:
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-    DO_RGB_TO_GRAY s3, s4, s5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    maddu   $ac0, s0, t3
-    mtlo    s7, $ac1
-    maddu   $ac1, s2, s5
-    maddu   $ac1, s1, s4
-    maddu   $ac1, s0, s3
-    extr.w  t6, $ac0, 16
-
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-    DO_RGB_TO_GRAY s3, s4, s5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    extr.w  t2, $ac1, 16
-    maddu   $ac0, s0, t3
-    mtlo    s7, $ac1
-    maddu   $ac1, s2, s5
-    maddu   $ac1, s1, s4
-    maddu   $ac1, s0, s3
-    extr.w  t5, $ac0, 16
-    sb      t6, 0(t1)
-    sb      t2, 1(t1)
-    extr.w  t3, $ac1, 16
-    addiu   t1, 4
-    sb      t5, -2(t1)
-    sb      t3, -1(t1)
-    bne     t1, t8, 1b
-     nop
-
-2:
-    beqz    t7, 4f
-     nop
-
-3:
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    maddu   $ac0, s0, t3
-    extr.w  t6, $ac0, 16
-    sb      t6, 0(t1)
-    addiu   t1, 1
-    bne     t1, t9, 3b
-     nop
-
-4:
-    bgtz    s6, 0b
-     addiu  a1, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_\colorid\()_gray_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_GRAY
-
-.endm
-
-/*------------------------------------------id --  pix R  G  B */
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
-/*****************************************************************************/
-/*
- * jsimd_h2v2_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v2 upsample routines
- */
-.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
-                                                pixel_size, \
-                                                r1_offs,    \
-                                                g1_offs,    \
-                                                b1_offs,    \
-                                                a1_offs,    \
-                                                r2_offs,    \
-                                                g2_offs,    \
-                                                b2_offs,    \
-                                                a2_offs
-
-.macro STORE_H2V2_2_PIXELS  scratch0 \
-                            scratch1 \
-                            scratch2 \
-                            scratch3 \
-                            scratch4 \
-                            scratch5 \
-                            outptr
-    sb       \scratch0, \r1_offs(\outptr)
-    sb       \scratch1, \g1_offs(\outptr)
-    sb       \scratch2, \b1_offs(\outptr)
-    sb       \scratch3, \r2_offs(\outptr)
-    sb       \scratch4, \g2_offs(\outptr)
-    sb       \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
-    li       \scratch0, 0xFF
-    sb       \scratch0, \a1_offs(\outptr)
-    sb       \scratch0, \a2_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V2_1_PIXEL  scratch0 \
-                           scratch1 \
-                           scratch2 \
-                           outptr
-    sb    \scratch0, \r1_offs(\outptr)
-    sb    \scratch1, \g1_offs(\outptr)
-    sb    \scratch2, \b1_offs(\outptr)
-
-.if (\pixel_size == 8)
-    li    t0, 0xFF
-    sb    t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0     - cinfo->output_width
- * a1     - input_buf
- * a2     - in_row_group_ctr
- * a3     - output_buf
- * 16(sp) - cinfo->sample_range_limit
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    lw           t9, 56(sp)        // cinfo->sample_range_limit
-    lw           v0, 0(a1)
-    lw           v1, 4(a1)
-    lw           t0, 8(a1)
-    sll          t1, a2, 3
-    addiu        t2, t1, 4
-    sll          t3, a2, 2
-    lw           t4, 0(a3)         // t4 = output_buf[0]
-    lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
-    lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
-    lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
-    lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
-    lw           t7, 4(a3)         // t7 = output_buf[1]
-    li           s1, 0xe6ea
-    addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
-    addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
-    addiu        s1, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
-    xori         s2, s1, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
-    srl          t3, a0, 1
-    blez         t3, 2f
-     addu        t0, t5, t3        // t0 = end address
- 1:
-    lbu          t3, 0(t5)
-    lbu          s3, 0(t6)
-    addiu        t5, t5, 1
-    addiu        t3, t3, -128      // (cb - 128)
-    addiu        s3, s3, -128      // (cr - 128)
-    mult         $ac1, s1, t3
-    madd         $ac1, s2, s3
-    sll          s3, s3, 15
-    sll          t3, t3, 15
-    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
-    extr_r.w     s5, $ac1, 16
-    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
-    lbu          v0, 0(t1)
-    addiu        t6, t6, 1
-    addiu        t1, t1, 2
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          AT, 0(t3)
-    lbu          s7, 0(s3)
-    lbu          ra, 0(v1)
-    lbu          v0, -1(t1)
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-    lbu          v0, 0(t2)
-
-    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
-
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          AT, 0(t3)
-    lbu          s7, 0(s3)
-    lbu          ra, 0(v1)
-    lbu          v0, 1(t2)
-    addiu        t2, t2, 2
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-
-    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
-
-    bne          t0, t5, 1b
-     nop
-2:
-    andi         t0, a0, 1
-    beqz         t0, 4f
-     lbu          t3, 0(t5)
-    lbu          s3, 0(t6)
-    addiu        t3, t3, -128      // (cb - 128)
-    addiu        s3, s3, -128      // (cr - 128)
-    mult         $ac1, s1, t3
-    madd         $ac1, s2, s3
-    sll          s3, s3, 15
-    sll          t3, t3, 15
-    lbu          v0, 0(t1)
-    extr_r.w     s5, $ac1, 16
-    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
-    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-    lbu          v0, 0(t2)
-
-    STORE_H2V2_1_PIXEL t3, s3, v1, t4
-
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-
-    STORE_H2V2_1_PIXEL t3, s3, v1, t7
-4:
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    j           ra
-     nop
-
-END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V2_1_PIXEL
-.purgem STORE_H2V2_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v1_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v1 upsample routines
- */
-
-.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
-                                                pixel_size, \
-                                                r1_offs,    \
-                                                g1_offs,    \
-                                                b1_offs,    \
-                                                a1_offs,    \
-                                                r2_offs,    \
-                                                g2_offs,    \
-                                                b2_offs,    \
-                                                a2_offs
-
-.macro STORE_H2V1_2_PIXELS  scratch0 \
-                            scratch1 \
-                            scratch2 \
-                            scratch3 \
-                            scratch4 \
-                            scratch5 \
-                            outptr
-    sb       \scratch0, \r1_offs(\outptr)
-    sb       \scratch1, \g1_offs(\outptr)
-    sb       \scratch2, \b1_offs(\outptr)
-    sb       \scratch3, \r2_offs(\outptr)
-    sb       \scratch4, \g2_offs(\outptr)
-    sb       \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
-    li       t0, 0xFF
-    sb       t0, \a1_offs(\outptr)
-    sb       t0, \a2_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V1_1_PIXEL  scratch0 \
-                           scratch1 \
-                           scratch2 \
-                           outptr
-    sb    \scratch0, \r1_offs(\outptr)
-    sb    \scratch1, \g1_offs(\outptr)
-    sb    \scratch2, \b1_offs(\outptr)
-.if (\pixel_size == 8)
-    li    t0, 0xFF
-    sb    t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0     - cinfo->output_width
- * a1     - input_buf
- * a2     - in_row_group_ctr
- * a3     - output_buf
- * 16(sp) - range_limit
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    li           t0, 0xe6ea
-    lw           t1, 0(a1)         // t1 = input_buf[0]
-    lw           t2, 4(a1)         // t2 = input_buf[1]
-    lw           t3, 8(a1)         // t3 = input_buf[2]
-    lw           t8, 56(sp)        // t8 = range_limit
-    addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
-    addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
-    addiu        s0, t0, 0x9916    // s0 = 0x8000
-    addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
-    xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
-    srl          t0, a0, 1
-    sll          t4, a2, 2
-    lwx          s5, t4(t1)        // s5 = inptr0
-    lwx          s6, t4(t2)        // s6 = inptr1
-    lwx          s7, t4(t3)        // s7 = inptr2
-    lw           t7, 0(a3)         // t7 = outptr
-    blez         t0, 2f
-     addu        t9, s6, t0        // t9 = end address
-1:
-    lbu          t2, 0(s6)         // t2 = cb
-    lbu          t0, 0(s7)         // t0 = cr
-    lbu          t1, 0(s5)         // t1 = y
-    addiu        t2, t2, -128      // t2 = cb - 128
-    addiu        t0, t0, -128      // t0 = cr - 128
-    mult         $ac1, s4, t2
-    madd         $ac1, s3, t0
-    sll          t0, t0, 15
-    sll          t2, t2, 15
-    mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
-    extr_r.w     t5, $ac1, 16
-    mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
-    addiu        s7, s7, 1
-    addiu        s6, s6, 1
-    addu         t2, t1, t0        // t2 = y + cred
-    addu         t3, t1, t5        // t3 = y + cgreen
-    addu         t4, t1, t6        // t4 = y + cblue
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t1, 1(s5)
-    lbu          v0, 0(t2)
-    lbu          v1, 0(t3)
-    lbu          ra, 0(t4)
-    addu         t2, t1, t0
-    addu         t3, t1, t5
-    addu         t4, t1, t6
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t2, 0(t2)
-    lbu          t3, 0(t3)
-    lbu          t4, 0(t4)
-
-    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
-
-    bne          t9, s6, 1b
-     addiu       s5, s5, 2
-2:
-    andi         t0, a0, 1
-    beqz         t0, 4f
-     nop
-3:
-    lbu          t2, 0(s6)
-    lbu          t0, 0(s7)
-    lbu          t1, 0(s5)
-    addiu        t2, t2, -128      //(cb - 128)
-    addiu        t0, t0, -128      //(cr - 128)
-    mul          t3, s4, t2
-    mul          t4, s3, t0
-    sll          t0, t0, 15
-    sll          t2, t2, 15
-    mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
-    mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
-    addu         t3, t3, s0
-    addu         t3, t4, t3
-    sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
-    addu         t2, t1, t0       // y + cred
-    addu         t3, t1, t5       // y + cgreen
-    addu         t4, t1, t6       // y + cblue
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t2, 0(t2)
-    lbu          t3, 0(t3)
-    lbu          t4, 0(t4)
-
-    STORE_H2V1_1_PIXEL t2, t3, t4, t7
-4:
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    j            ra
-     nop
-
-END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V1_1_PIXEL
-.purgem STORE_H2V1_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v2_fancy_upsample_mips_dspr2
- *
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- */
-LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - downsampled_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
-    li             s4, 0
-    lw             s2, 0(a3)       // s2 = *output_data_ptr
-0:
-    li             t9, 2
-    lw             s1, -4(a2)      // s1 = inptr1
-
-1:
-    lw             s0, 0(a2)       // s0 = inptr0
-    lwx            s3, s4(s2)
-    addiu          s5, a1, -2      // s5 = downsampled_width - 2
-    srl            t4, s5, 1
-    sll            t4, t4, 1
-    lbu            t0, 0(s0)
-    lbu            t1, 1(s0)
-    lbu            t2, 0(s1)
-    lbu            t3, 1(s1)
-    addiu          s0, 2
-    addiu          s1, 2
-    addu           t8, s0, t4      // t8 = end address
-    andi           s5, s5, 1       // s5 = residual
-    sll            t4, t0, 1
-    sll            t6, t1, 1
-    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
-    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
-    addu           t7, t0, t2      // t7 = thiscolsum
-    addu           t6, t1, t3      // t5 = nextcolsum
-    sll            t0, t7, 2       // t0 = thiscolsum * 4
-    subu           t1, t0, t7      // t1 = thiscolsum * 3
-    shra_r.w       t0, t0, 4
-    addiu          t1, 7
-    addu           t1, t1, t6
-    srl            t1, t1, 4
-    sb             t0, 0(s3)
-    sb             t1, 1(s3)
-    beq            t8, s0, 22f     // skip to final iteration if width == 3
-     addiu          s3, 2
-2:
-    lh             t0, 0(s0)       // t0 = A3|A2
-    lh             t2, 0(s1)       // t2 = B3|B2
-    addiu          s0, 2
-    addiu          s1, 2
-    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
-    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
-    shll.ph        t1, t0, 1
-    sll            t3, t6, 1
-    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
-    addu           t3, t3, t6      // t3 = this * 3
-    addu.ph        t0, t0, t2      // t0 = next2|next1
-    addu           t1, t3, t7
-    andi           t7, t0, 0xFFFF  // t7 = next1
-    sll            t2, t7, 1
-    addu           t2, t7, t2      // t2 = next1*3
-    addu           t4, t2, t6
-    srl            t6, t0, 16      // t6 = next2
-    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
-    addu           t0, t3, t7
-    addiu          t0, 7
-    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
-    shra_r.w       t4, t4, 4       // t3 = (next1*3 + this + 8) >> 4
-    addu           t2, t2, t6
-    addiu          t2, 7
-    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    sb             t4, 2(s3)
-    sb             t2, 3(s3)
-    bne            t8, s0, 2b
-     addiu         s3, 4
-22:
-    beqz           s5, 4f
-     addu          t8, s0, s5
-3:
-    lbu            t0, 0(s0)
-    lbu            t2, 0(s1)
-    addiu          s0, 1
-    addiu          s1, 1
-    sll            t3, t6, 1
-    sll            t1, t0, 1
-    addu           t1, t0, t1      // t1 = inptr0 * 3
-    addu           t3, t3, t6      // t3 = thiscolsum * 3
-    addu           t5, t1, t2
-    addu           t1, t3, t7
-    shra_r.w       t1, t1, 4
-    addu           t0, t3, t5
-    addiu          t0, 7
-    srl            t0, t0, 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    addiu          s3, 2
-    move           t7, t6
-    bne            t8, s0, 3b
-     move          t6, t5
-4:
-    sll            t0, t6, 2       // t0 = thiscolsum * 4
-    subu           t1, t0, t6      // t1 = thiscolsum * 3
-    addu           t1, t1, t7
-    addiu          s4, 4
-    shra_r.w       t1, t1, 4
-    addiu          t0, 7
-    srl            t0, t0, 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    addiu          t9, -1
-    addiu          s3, 2
-    bnez           t9, 1b
-     lw            s1, 4(a2)
-    srl            t0, s4, 2
-    subu           t0, a0, t0
-    bgtz           t0, 0b
-     addiu         a2, 4
-
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
-    j ra
-     nop
-END(jsimd_h2v2_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - downsampled_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    .set at
-
-    beqz           a0, 3f
-     sll           t0, a0, 2
-    lw             s1, 0(a3)
-    li             s3, 0x10001
-    addu           s0, s1, t0
-0:
-    addiu          t8, a1, -2
-    srl            t9, t8, 2
-    lw             t7, 0(a2)
-    lw             s2, 0(s1)
-    lbu            t0, 0(t7)
-    lbu            t1, 1(t7)   // t1 = inptr[1]
-    sll            t2, t0, 1
-    addu           t2, t2, t0  // t2 = invalue*3
-    addu           t2, t2, t1
-    shra_r.w       t2, t2, 2
-    sb             t0, 0(s2)
-    sb             t2, 1(s2)
-    beqz           t9, 11f
-     addiu         s2, 2
-1:
-    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
-    ulw            t1, 1(t7)
-    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
-    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
-    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
-    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
-    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
-    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
-    shll.ph        t5, t4, 1
-    shll.ph        t6, t1, 1
-    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
-    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
-    addu.ph        t4, t3, s3
-    addu.ph        t0, t0, s3
-    addu.ph        t4, t4, t5
-    addu.ph        t0, t0, t6
-    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
-    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
-    addu.ph        t2, t2, t5
-    addu.ph        t3, t3, t6
-    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
-    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
-    shll.ph        t2, t2, 8
-    shll.ph        t3, t3, 8
-    or             t2, t4, t2
-    or             t3, t3, t0
-    addiu          t9, -1
-    usw            t3, 0(s2)
-    usw            t2, 4(s2)
-    addiu          s2, 8
-    bgtz           t9, 1b
-     addiu         t7, 4
-11:
-    andi           t8, 3
-    beqz           t8, 22f
-     addiu         t7, 1
-
-2:
-    lbu            t0, 0(t7)
-    addiu          t7, 1
-    sll            t1, t0, 1
-    addu           t2, t0, t1  // t2 = invalue
-    lbu            t3, -2(t7)
-    lbu            t4, 0(t7)
-    addiu          t3, 1
-    addiu          t4, 2
-    addu           t3, t3, t2
-    addu           t4, t4, t2
-    srl            t3, 2
-    srl            t4, 2
-    sb             t3, 0(s2)
-    sb             t4, 1(s2)
-    addiu          t8, -1
-    bgtz           t8, 2b
-     addiu         s2, 2
-
-22:
-    lbu            t0, 0(t7)
-    lbu            t2, -1(t7)
-    sll            t1, t0, 1
-    addu           t1, t1, t0 // t1 = invalue * 3
-    addu           t1, t1, t2
-    addiu          t1, 1
-    srl            t1, t1, 2
-    sb             t1, 0(s2)
-    sb             t0, 1(s2)
-    addiu          s1, 4
-    bne            s1, s0, 0b
-     addiu         a2, 4
-3:
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j              ra
-     nop
-END(jsimd_h2v1_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - cinfo->max_v_samp_factor
- * a2     - compptr->v_samp_factor
- * a3     - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
-
-    beqz        a2, 7f
-     lw         s1, 44(sp)  // s1 = output_data
-    lw          s0, 40(sp)  // s0 = input_data
-    srl         s2, a0, 2
-    andi        t9, a0, 2
-    srl         t7, t9, 1
-    addu        s2, t7, s2
-    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
-    srl         t7, t0, 1
-    subu        s2, t7, s2
-0:
-    andi        t6, a0, 1   // t6 = temp_index
-    addiu       t6, -1
-    lw          t4, 0(s1)   // t4 = outptr
-    lw          t5, 0(s0)   // t5 = inptr0
-    li          s3, 0       // s3 = bias
-    srl         t7, a0, 1   // t7 = image_width1
-    srl         s4, t7, 2
-    andi        t8, t7, 3
-1:
-    ulhu        t0, 0(t5)
-    ulhu        t1, 2(t5)
-    ulhu        t2, 4(t5)
-    ulhu        t3, 6(t5)
-    raddu.w.qb  t0, t0
-    raddu.w.qb  t1, t1
-    raddu.w.qb  t2, t2
-    raddu.w.qb  t3, t3
-    shra.ph     t0, t0, 1
-    shra_r.ph   t1, t1, 1
-    shra.ph     t2, t2, 1
-    shra_r.ph   t3, t3, 1
-    sb          t0, 0(t4)
-    sb          t1, 1(t4)
-    sb          t2, 2(t4)
-    sb          t3, 3(t4)
-    addiu       s4, -1
-    addiu       t4, 4
-    bgtz        s4, 1b
-     addiu      t5, 8
-    beqz        t8, 3f
-     addu       s4, t4, t8
-2:
-    ulhu        t0, 0(t5)
-    raddu.w.qb  t0, t0
-    addqh.w     t0, t0, s3
-    xori        s3, s3, 1
-    sb          t0, 0(t4)
-    addiu       t4, 1
-    bne         t4, s4, 2b
-     addiu      t5, 2
-3:
-    lbux        t1, t6(t5)
-    sll         t1, 1
-    addqh.w     t2, t1, s3  // t2 = pixval1
-    xori        s3, s3, 1
-    addqh.w     t3, t1, s3  // t3 = pixval2
-    blez        s2, 5f
-     append     t3, t2,  8
-    addu        t5, t4, s2  // t5 = loop_end2
-4:
-    ush         t3, 0(t4)
-    addiu       s2, -1
-    bgtz        s2, 4b
-     addiu      t4,  2
-5:
-    beqz        t9, 6f
-     nop
-    sb          t2, 0(t4)
-6:
-    addiu       s1, 4
-    addiu       a2, -1
-    bnez        a2, 0b
-     addiu      s0, 4
-7:
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
-
-    j           ra
-    nop
-END(jsimd_h2v1_downsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
-
-/*
- * a0     - cinfo->image_width
- * a1     - cinfo->max_v_samp_factor
- * a2     - compptr->v_samp_factor
- * a3     - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
-    .set at
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    beqz         a2, 8f
-     lw          s1, 52(sp)      // s1 = output_data
-    lw           s0, 48(sp)      // s0 = input_data
-
-    andi         t6, a0, 1       // t6 = temp_index
-    addiu        t6, -1
-    srl          t7, a0, 1       // t7 = image_width1
-    srl          s4, t7, 2
-    andi         t8, t7, 3
-    andi         t9, a0, 2
-    srl          s2, a0, 2
-    srl          t7, t9, 1
-    addu         s2, t7, s2
-    sll          t0, a3, 3       // s2 = width_in_blocks*DCT
-    srl          t7, t0, 1
-    subu         s2, t7, s2
-0:
-    lw           t4, 0(s1)       // t4 = outptr
-    lw           t5, 0(s0)       // t5 = inptr0
-    lw           s7, 4(s0)       // s7 = inptr1
-    li           s6, 1           // s6 = bias
-2:
-    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
-    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
-    ulw          t2, 4(t5)
-    ulw          t3, 4(s7)
-    precrq.ph.w  t7, t0, t1      // t2 = |P3|P2|Q3|Q2|
-    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
-    raddu.w.qb   t1, t7
-    raddu.w.qb   t0, t0
-    shra_r.w     t1, t1, 2
-    addiu        t0, 1
-    srl          t0, 2
-    precrq.ph.w  t7, t2, t3
-    ins          t2, t3, 16, 16
-    raddu.w.qb   t7, t7
-    raddu.w.qb   t2, t2
-    shra_r.w     t7, t7, 2
-    addiu        t2, 1
-    srl          t2, 2
-    sb           t0, 0(t4)
-    sb           t1, 1(t4)
-    sb           t2, 2(t4)
-    sb           t7, 3(t4)
-    addiu        t4, 4
-    addiu        t5, 8
-    addiu        s4, s4, -1
-    bgtz         s4, 2b
-     addiu       s7, 8
-    beqz         t8, 4f
-     addu        t8, t4, t8
-3:
-    ulhu         t0, 0(t5)
-    ulhu         t1, 0(s7)
-    ins          t0, t1, 16, 16
-    raddu.w.qb   t0, t0
-    addu         t0, t0, s6
-    srl          t0, 2
-    xori         s6, s6, 3
-    sb           t0, 0(t4)
-    addiu        t5, 2
-    addiu        t4, 1
-    bne          t8, t4, 3b
-     addiu       s7, 2
-4:
-    lbux         t1, t6(t5)
-    sll          t1, 1
-    lbux         t0, t6(s7)
-    sll          t0, 1
-    addu         t1, t1, t0
-    addu         t3, t1, s6
-    srl          t0, t3, 2       // t2 = pixval1
-    xori         s6, s6, 3
-    addu         t2, t1, s6
-    srl          t1, t2, 2       // t3 = pixval2
-    blez         s2, 6f
-     append      t1, t0, 8
-5:
-    ush          t1, 0(t4)
-    addiu        s2, -1
-    bgtz         s2, 5b
-     addiu       t4, 2
-6:
-    beqz         t9, 7f
-     nop
-    sb           t0, 0(t4)
-7:
-    addiu        s1, 4
-    addiu        a2, -1
-    bnez         a2, 0b
-     addiu       s0, 8
-8:
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j            ra
-     nop
-END(jsimd_h2v2_downsample_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
-/*
- * a0     - input_data
- * a1     - output_data
- * a2     - compptr->v_samp_factor
- * a3     - cinfo->max_v_samp_factor
- * 16(sp) - cinfo->smoothing_factor
- * 20(sp) - compptr->width_in_blocks
- * 24(sp) - cinfo->image_width
- */
-
-    .set at
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw          s7, 52(sp)      // compptr->width_in_blocks
-    lw          s0, 56(sp)      // cinfo->image_width
-    lw          s6, 48(sp)      // cinfo->smoothing_factor
-    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
-    sll         v0, s7, 1
-    subu        v0, v0, s0
-    blez        v0, 2f
-    move        v1, zero
-    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
-0:
-    addiu       t1, a0, -4
-    sll         t2, v1, 2
-    lwx         t1, t2(t1)
-    move        t3, v0
-    addu        t1, t1, s0
-    lbu         t2, -1(t1)
-1:
-    addiu       t3, t3, -1
-    sb          t2, 0(t1)
-    bgtz        t3, 1b
-    addiu       t1, t1, 1
-    addiu       v1, v1, 1
-    bne         v1, t0, 0b
-    nop
-2:
-    li          v0, 80
-    mul         v0, s6, v0
-    li          v1, 16384
-    move        t4, zero
-    move        t5, zero
-    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
-    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
-3:
-/* Special case for first column: pretend column -1 is same as column 0 */
-    sll         v0, t4, 2
-    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
-    sll         v1, t5, 2
-    addiu       t9, v1, 4
-    addiu       s0, v1, -4
-    addiu       s1, v1, 8
-    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
-    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
-    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
-    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, 0(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, 0(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1,t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, 0(s0)
-    lbu         t0, 0(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1,s3, t7
-    extr_r.w    v0, $ac1, 16
-    addiu       t8, t8, 1
-    addiu       s2, s2, 2
-    addiu       t9, t9, 2
-    addiu       s0, s0, 2
-    addiu       s1, s1, 2
-    sb          v0, -1(t8)
-    addiu       s4, s7, -2
-    and         s4, s4, 3
-    addu        s5, s4, t8      //end adress
-4:
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    addiu       t8, t8, 1
-    addiu       s2, s2, 2
-    addiu       t9, t9, 2
-    addiu       s0, s0, 2
-    sb          t2, -1(t8)
-    bne         s5, t8, 4b
-    addiu       s1, s1, 2
-    addiu       s5, s7, -2
-    subu        s5, s5, s4
-    addu        s5, s5, t8      //end adress
-5:
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    lh          v1, 2(t9)
-    addu        t0, t0, v0
-    lh          v0, 2(s2)
-    addu        s3, t0, s3
-    lh          t0, 2(s0)
-    lh          t1, 2(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 4(s2)
-    lbu         t0, 1(t9)
-    lbu         t1, 4(t9)
-    sb          t2, 0(t8)
-    raddu.w.qb  t3, v0
-    lbu         v0, 1(s2)
-    addu        t0, t0, t1
-    mult        $ac1, t3, t6
-    addu        v0, v0, v1
-    lbu         t2, 4(s0)
-    addu        t0, t0, v0
-    lbu         v0, 1(s0)
-    addu        s3, t0, s3
-    lbu         t0, 1(s1)
-    lbu         t3, 4(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    lh          v1, 4(t9)
-    addu        t0, t0, v0
-    lh          v0, 4(s2)
-    addu        s3, t0, s3
-    lh          t0, 4(s0)
-    lh          t1, 4(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 6(s2)
-    lbu         t0, 3(t9)
-    lbu         t1, 6(t9)
-    sb          t2, 1(t8)
-    raddu.w.qb  t3, v0
-    lbu         v0, 3(s2)
-    addu        t0, t0,t1
-    mult        $ac1, t3, t6
-    addu        v0, v0, v1
-    lbu         t2, 6(s0)
-    addu        t0, t0, v0
-    lbu         v0, 3(s0)
-    addu        s3, t0, s3
-    lbu         t0, 3(s1)
-    lbu         t3, 6(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    lh          v1, 6(t9)
-    addu        t0, t0, v0
-    lh          v0, 6(s2)
-    addu        s3, t0, s3
-    lh          t0, 6(s0)
-    lh          t1, 6(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t3, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 8(s2)
-    lbu         t0, 5(t9)
-    lbu         t1, 8(t9)
-    sb          t3, 2(t8)
-    raddu.w.qb  t2, v0
-    lbu         v0, 5(s2)
-    addu        t0, t0, t1
-    mult        $ac1, t2, t6
-    addu        v0, v0, v1
-    lbu         t2, 8(s0)
-    addu        t0, t0, v0
-    lbu         v0, 5(s0)
-    addu        s3, t0, s3
-    lbu         t0, 5(s1)
-    lbu         t3, 8(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    addiu       t8, t8, 4
-    addu        t0, t0, v0
-    addiu       s2, s2, 8
-    addu        s3, t0, s3
-    addiu       t9, t9, 8
-    madd        $ac1, s3, t7
-    extr_r.w    t1, $ac1, 16
-    addiu       s0, s0, 8
-    addiu       s1, s1, 8
-    bne         s5, t8, 5b
-    sb          t1, -1(t8)
-/* Special case for last column */
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 1(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 1(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 1(s0)
-    addu        t0, t0, v0
-    lbu         t3, 1(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1, s3, t7
-    extr_r.w    t0, $ac1, 16
-    addiu       t5, t5, 2
-    sb          t0, 0(t8)
-    addiu       t4, t4, 1
-    bne         t4, a2, 3b
-    addiu       t5, t5, 2
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j           ra
-     nop
-
-END(jsimd_h2v2_smooth_downsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
-/*
- * a0     - upsample->h_expand[compptr->component_index]
- * a1     - upsample->v_expand[compptr->component_index]
- * a2     - input_data
- * a3     - output_data_ptr
- * 16(sp) - cinfo->output_width
- * 20(sp) - cinfo->max_v_samp_factor
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    lw      s0, 0(a3)    // s0 = output_data
-    lw      s1, 32(sp)   // s1 = cinfo->output_width
-    lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
-    li      t6, 0        // t6 = inrow
-    beqz    s2, 10f
-     li     s3, 0        // s3 = outrow
-0:
-    addu    t0, a2, t6
-    addu    t7, s0, s3
-    lw      t3, 0(t0)    // t3 = inptr
-    lw      t8, 0(t7)    // t8 = outptr
-    beqz    s1, 4f
-     addu   t5, t8, s1   // t5 = outend
-1:
-    lb      t2, 0(t3)    // t2 = invalue = *inptr++
-    addiu   t3, 1
-    beqz    a0, 3f
-     move   t0, a0       // t0 = h_expand
-2:
-    sb      t2, 0(t8)
-    addiu   t0, -1
-    bgtz    t0, 2b
-     addiu  t8, 1
-3:
-    bgt     t5, t8, 1b
-     nop
-4:
-    addiu   t9, a1, -1   // t9 = v_expand - 1
-    blez    t9, 9f
-     nop
-5:
-    lw      t3, 0(s0)
-    lw      t4, 4(s0)
-    subu    t0, s1, 0xF
-    blez    t0, 7f
-     addu   t5, t3, s1   // t5 = end address
-    andi    t7, s1, 0xF  // t7 = residual
-    subu    t8, t5, t7
-6:
-    ulw     t0, 0(t3)
-    ulw     t1, 4(t3)
-    ulw     t2, 8(t3)
-    usw     t0, 0(t4)
-    ulw     t0, 12(t3)
-    usw     t1, 4(t4)
-    usw     t2, 8(t4)
-    usw     t0, 12(t4)
-    addiu   t3, 16
-    bne     t3, t8, 6b
-     addiu  t4, 16
-    beqz    t7, 8f
-     nop
-7:
-    lbu     t0, 0(t3)
-    sb      t0, 0(t4)
-    addiu   t3, 1
-    bne     t3, t5, 7b
-     addiu  t4, 1
-8:
-    addiu   t9, -1
-    bgtz    t9, 5b
-     addiu  s0, 8
-9:
-    addu    s3, s3, a1
-    bne     s3, s2, 0b
-     addiu  t6, 1
-10:
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j       ra
-     nop
-END(jsimd_int_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - cinfo->output_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-    lw      t7, 0(a3)       // t7 = output_data
-    andi    t8, a1, 0xf     // t8 = residual
-    sll     t0, a0, 2
-    blez    a0, 4f
-     addu   t9, t7, t0      // t9 = output_data end address
-0:
-    lw      t5, 0(t7)       // t5 = outptr
-    lw      t6, 0(a2)       // t6 = inptr
-    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
-    subu    t3, t8          // t3 = end address - residual
-    beq     t5, t3, 2f
-     move   t4, t8
-1:
-    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
-    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
-    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
-    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
-    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
-    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
-    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
-    usw     t0, 0(t5)
-    usw     t1, 4(t5)
-    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
-    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
-    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
-    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
-    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
-    usw     t2, 8(t5)
-    usw     t0, 12(t5)
-    addiu   t5, 16
-    bne     t5, t3, 1b
-     addiu  t6, 8
-    beqz    t8, 3f
-     move   t4, t8
-2:
-    lbu     t1, 0(t6)
-    sb      t1, 0(t5)
-    sb      t1, 1(t5)
-    addiu   t4, -2
-    addiu   t6, 1
-    bgtz    t4, 2b
-     addiu  t5, 2
-3:
-    addiu   t7, 4
-    bne     t9, t7, 0b
-     addiu  a2, 4
-4:
-    j       ra
-     nop
-END(jsimd_h2v1_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - cinfo->output_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-    lw      t7, 0(a3)
-    blez    a0, 7f
-     andi   t9, a1, 0xf     // t9 = residual
-0:
-    lw      t6, 0(a2)       // t6 = inptr
-    lw      t5, 0(t7)       // t5 = outptr
-    addu    t8, t5, a1      // t8 = outptr end address
-    subu    t8, t9          // t8 = end address - residual
-    beq     t5, t8, 2f
-     move   t4, t9
-1:
-    ulw     t0, 0(t6)
-    srl     t1, t0, 16
-    ins     t0, t0, 16, 16
-    ins     t0, t0, 8, 16
-    ins     t1, t1, 16, 16
-    ins     t1, t1, 8, 16
-    ulw     t2, 4(t6)
-    usw     t0, 0(t5)
-    usw     t1, 4(t5)
-    srl     t3, t2, 16
-    ins     t2, t2, 16, 16
-    ins     t2, t2, 8, 16
-    ins     t3, t3, 16, 16
-    ins     t3, t3, 8, 16
-    usw     t2, 8(t5)
-    usw     t3, 12(t5)
-    addiu   t5, 16
-    bne     t5, t8, 1b
-     addiu  t6, 8
-    beqz    t9, 3f
-     move   t4, t9
-2:
-    lbu     t0, 0(t6)
-    sb      t0, 0(t5)
-    sb      t0, 1(t5)
-    addiu   t4, -2
-    addiu   t6, 1
-    bgtz    t4, 2b
-     addiu  t5, 2
-3:
-    lw      t6, 0(t7)       // t6 = outptr[0]
-    lw      t5, 4(t7)       // t5 = outptr[1]
-    addu    t4, t6, a1      // t4 = new end address
-    beq     a1, t9, 5f
-     subu   t8, t4, t9
-4:
-    ulw     t0, 0(t6)
-    ulw     t1, 4(t6)
-    ulw     t2, 8(t6)
-    usw     t0, 0(t5)
-    ulw     t0, 12(t6)
-    usw     t1, 4(t5)
-    usw     t2, 8(t5)
-    usw     t0, 12(t5)
-    addiu   t6, 16
-    bne     t6, t8, 4b
-     addiu  t5, 16
-    beqz    t9, 6f
-     nop
-5:
-    lbu     t0, 0(t6)
-    sb      t0, 0(t5)
-    addiu   t6, 1
-    bne     t6, t4, 5b
-     addiu  t5, 1
-6:
-    addiu   t7, 8
-    addiu   a0, -2
-    bgtz    a0, 0b
-     addiu  a2, 4
-7:
-    j       ra
-     nop
-END(jsimd_h2v2_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - compptr->dcttable
- * a2     - output
- * a3     - range_limit
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu     sp, sp, -256
-    move      v0, sp
-    addiu     v1, zero, 8      // v1 = DCTSIZE = 8
-1:
-    lh        s4, 32(a0)       // s4 = inptr[16]
-    lh        s5, 64(a0)       // s5 = inptr[32]
-    lh        s6, 96(a0)       // s6 = inptr[48]
-    lh        t1, 112(a0)      // t1 = inptr[56]
-    lh        t7, 16(a0)       // t7 = inptr[8]
-    lh        t5, 80(a0)       // t5 = inptr[40]
-    lh        t3, 48(a0)       // t3 = inptr[24]
-    or        s4, s4, t1
-    or        s4, s4, t3
-    or        s4, s4, t5
-    or        s4, s4, t7
-    or        s4, s4, s5
-    or        s4, s4, s6
-    bnez      s4, 2f
-     addiu    v1, v1, -1
-    lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
-    lh        s6, 0(a0)        // inptr[DCTSIZE*0]
-    mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
-    sll       s5, s5, 2
-    sw        s5, 0(v0)
-    sw        s5, 32(v0)
-    sw        s5, 64(v0)
-    sw        s5, 96(v0)
-    sw        s5, 128(v0)
-    sw        s5, 160(v0)
-    sw        s5, 192(v0)
-    b         3f
-     sw       s5, 224(v0)
-2:
-    lh        t0, 112(a1)
-    lh        t2, 48(a1)
-    lh        t4, 80(a1)
-    lh        t6, 16(a1)
-    mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
-    mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
-    mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
-    mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
-    lh        t4, 32(a1)
-    lh        t5, 32(a0)
-    lh        t6, 96(a1)
-    lh        t7, 96(a0)
-    addu      s0, t0, t1       // z3 = tmp0 + tmp2
-    addu      s1, t1, t2       // z2 = tmp1 + tmp2
-    addu      s2, t2, t3       // z4 = tmp1 + tmp3
-    addu      s3, s0, s2       // z3 + z4
-    addiu     t9, zero, 9633   // FIX_1_175875602
-    mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
-    addu      t8, t0, t3       // z1 = tmp0 + tmp3
-    addiu     t9, zero, 2446   // FIX_0_298631336
-    mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
-    addiu     t9, zero, 16819  // FIX_2_053119869
-    mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
-    addiu     t9, zero, 25172  // FIX_3_072711026
-    mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
-    addiu     t9, zero, 12299  // FIX_1_501321110
-    mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
-    addiu     t9, zero, 16069  // FIX_1_961570560
-    mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
-    addiu     t9, zero, 3196   // FIX_0_390180644
-    mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
-    addiu     t9, zero, 7373   // FIX_0_899976223
-    mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
-    addiu     t9, zero, 20995  // FIX_2_562915447
-    mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
-    subu      s0, s3, s0       // z3 += z5
-    addu      t0, t0, s0       // tmp0 += z3
-    addu      t1, t1, s0       // tmp2 += z3
-    subu      s2, s3, s2       // z4 += z5
-    addu      t2, t2, s2       // tmp1 += z4
-    addu      t3, t3, s2       // tmp3 += z4
-    subu      t0, t0, t8       // tmp0 += z1
-    subu      t1, t1, s1       // tmp2 += z2
-    subu      t2, t2, s1       // tmp1 += z2
-    subu      t3, t3, t8       // tmp3 += z1
-    mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
-    addiu     t9, zero, 6270   // FIX_0_765366865
-    mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
-    lh        t4, 0(a1)
-    lh        t5, 0(a0)
-    lh        t6, 64(a1)
-    lh        t7, 64(a0)
-    mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
-    mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
-    mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
-    addiu     t9, zero, 4433   // FIX_0_541196100
-    addu      s3, s0, s1       // z2 + z3
-    mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
-    addiu     t9, zero, 15137  // FIX_1_847759065
-    mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
-    addu      t4, t5, t6
-    subu      t5, t5, t6
-    sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
-    sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
-    addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
-    subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
-    addu      s0, t4, t7
-    subu      s1, t4, t7
-    addu      s2, t5, t6
-    subu      s3, t5, t6
-    addu      t4, s0, t3
-    subu      s0, s0, t3
-    addu      t3, s2, t1
-    subu      s2, s2, t1
-    addu      t1, s3, t2
-    subu      s3, s3, t2
-    addu      t2, s1, t0
-    subu      s1, s1, t0
-    shra_r.w  t4, t4, 11
-    shra_r.w  t3, t3, 11
-    shra_r.w  t1, t1, 11
-    shra_r.w  t2, t2, 11
-    shra_r.w  s1, s1, 11
-    shra_r.w  s3, s3, 11
-    shra_r.w  s2, s2, 11
-    shra_r.w  s0, s0, 11
-    sw        t4, 0(v0)
-    sw        t3, 32(v0)
-    sw        t1, 64(v0)
-    sw        t2, 96(v0)
-    sw        s1, 128(v0)
-    sw        s3, 160(v0)
-    sw        s2, 192(v0)
-    sw        s0, 224(v0)
-3:
-    addiu     a1, a1, 2
-    addiu     a0, a0, 2
-    bgtz      v1, 1b
-     addiu    v0, v0, 4
-    move      v0, sp
-    addiu     v1, zero, 8
-4:
-    lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
-    lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
-    lw        t2, 0(v0)        // (JLONG) wsptr[0]
-    lw        t3, 16(v0)       // (JLONG) wsptr[4]
-    lw        s4, 4(v0)        // (JLONG) wsptr[1]
-    lw        s5, 12(v0)       // (JLONG) wsptr[3]
-    lw        s6, 20(v0)       // (JLONG) wsptr[5]
-    lw        s7, 28(v0)       // (JLONG) wsptr[7]
-    or        s4, s4, t0
-    or        s4, s4, t1
-    or        s4, s4, t3
-    or        s4, s4, s7
-    or        s4, s4, s5
-    or        s4, s4, s6
-    bnez      s4, 5f
-     addiu    v1, v1, -1
-    shra_r.w  s5, t2, 5
-    andi      s5, s5, 0x3ff
-    lbux      s5, s5(a3)
-    lw        s1, 0(a2)
-    replv.qb  s5, s5
-    usw       s5, 0(s1)
-    usw       s5, 4(s1)
-    b         6f
-     nop
-5:
-    addu      t4, t0, t1       // z2 + z3
-    addiu     t8, zero, 4433   // FIX_0_541196100
-    mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
-    addiu     t8, zero, 15137  // FIX_1_847759065
-    mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
-    addiu     t8, zero, 6270   // FIX_0_765366865
-    mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
-    addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
-    subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
-    sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
-    sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
-    subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
-    subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
-    addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
-    addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
-    subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
-    addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
-    lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
-    lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
-    lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
-    lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
-    addu      s0, t4, t6       // z3 = tmp0 + tmp2
-    addiu     t8, zero, 9633   // FIX_1_175875602
-    addu      s1, t5, t7       // z4 = tmp1 + tmp3
-    addu      s2, s0, s1       // z3 + z4
-    mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
-    addu      s3, t4, t7       // z1 = tmp0 + tmp3
-    addu      t9, t5, t6       // z2 = tmp1 + tmp2
-    addiu     t8, zero, 16069  // FIX_1_961570560
-    mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
-    addiu     t8, zero, 3196   // FIX_0_390180644
-    mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
-    addiu     t8, zero, 2446   // FIX_0_298631336
-    mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
-    addiu     t8, zero, 7373   // FIX_0_899976223
-    mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
-    addiu     t8, zero, 16819  // FIX_2_053119869
-    mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
-    addiu     t8, zero, 20995  // FIX_2_562915447
-    mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
-    addiu     t8, zero, 25172  // FIX_3_072711026
-    mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
-    addiu     t8, zero, 12299  // FIX_1_501321110
-    mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
-    subu      s0, s2, s0       // z3 += z5
-    subu      s1, s2, s1       // z4 += z5
-    addu      t4, t4, s0
-    subu      t4, t4, s3       // tmp0
-    addu      t5, t5, s1
-    subu      t5, t5, t9       // tmp1
-    addu      t6, t6, s0
-    subu      t6, t6, t9       // tmp2
-    addu      t7, t7, s1
-    subu      t7, t7, s3       // tmp3
-    addu      s0, t0, t7
-    subu      t0, t0, t7
-    addu      t7, t2, t6
-    subu      t2, t2, t6
-    addu      t6, t3, t5
-    subu      t3, t3, t5
-    addu      t5, t1, t4
-    subu      t1, t1, t4
-    shra_r.w  s0, s0, 18
-    shra_r.w  t7, t7, 18
-    shra_r.w  t6, t6, 18
-    shra_r.w  t5, t5, 18
-    shra_r.w  t1, t1, 18
-    shra_r.w  t3, t3, 18
-    shra_r.w  t2, t2, 18
-    shra_r.w  t0, t0, 18
-    andi      s0, s0, 0x3ff
-    andi      t7, t7, 0x3ff
-    andi      t6, t6, 0x3ff
-    andi      t5, t5, 0x3ff
-    andi      t1, t1, 0x3ff
-    andi      t3, t3, 0x3ff
-    andi      t2, t2, 0x3ff
-    andi      t0, t0, 0x3ff
-    lw        s1, 0(a2)
-    lbux      s0, s0(a3)
-    lbux      t7, t7(a3)
-    lbux      t6, t6(a3)
-    lbux      t5, t5(a3)
-    lbux      t1, t1(a3)
-    lbux      t3, t3(a3)
-    lbux      t2, t2(a3)
-    lbux      t0, t0(a3)
-    sb        s0, 0(s1)
-    sb        t7, 1(s1)
-    sb        t6, 2(s1)
-    sb        t5, 3(s1)
-    sb        t1, 4(s1)
-    sb        t3, 5(s1)
-    sb        t2, 6(s1)
-    sb        t0, 7(s1)
-6:
-    addiu     v0, v0, 32
-    bgtz      v1, 4b
-     addiu    a2, a2, 4
-    addiu     sp, sp, 256
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-
-END(jsimd_idct_islow_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
-/*
- * a0     - inptr
- * a1     - quantptr
- * a2     - wsptr
- * a3     - mips_idct_ifast_coefs
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu          t9, a0, 16            // end address
-    or             AT, a3, zero
-
-0:
-    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
-    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
-    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
-    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
-    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
-    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
-    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
-    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
-    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
-    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
-    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
-    or             s4, t1, t2
-    or             s5, t3, t4
-    bnez           s4, 1f
-     ins           t0, v0, 16, 16        // ... tmp0
-    bnez           s5, 1f
-     or            s6, t5, t6
-    or             s6, s6, t7
-    bnez           s6, 1f
-     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
-    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
-    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
-    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
-    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
-    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
-    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
-    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
-    addiu          a0, a0, 4
-    b              2f
-     addiu         a1, a1, 4
-
-1:
-    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
-    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
-    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
-    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
-    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
-    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
-    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
-    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
-    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
-    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
-    lw             t8, 4(AT)             // FIX(1.414213562)
-    ins            t2, v0, 16, 16        // ... tmp1
-    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
-    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
-    ins            t4, v1, 16, 16        // ... tmp2
-    addq.ph        s4, t0, t4            // tmp10
-    subq.ph        s5, t0, t4            // tmp11
-    ins            t6, v0, 16, 16        // ... tmp3
-    subq.ph        s6, t2, t6            // tmp12 ...
-    addq.ph        s7, t2, t6            // tmp13
-    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
-    addq.ph        t0, s4, s7            // tmp0
-    subq.ph        t6, s4, s7            // tmp3
-    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
-    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
-    shll_s.ph      s6, s6, 1             // x2
-    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
-    subq.ph        s6, s6, s7            // ... tmp12
-    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
-    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
-    ins            t1, v0, 16, 16        // ... tmp4
-    addq.ph        t2, s5, s6            // tmp1
-    subq.ph        t4, s5, s6            // tmp2
-    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
-    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
-    ins            t7, v1, 16, 16        // ... tmp7
-    addq.ph        s5, t1, t7            // z11
-    subq.ph        s6, t1, t7            // z12
-    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
-    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
-    ins            t5, v0, 16, 16        // ... tmp6
-    ins            t3, v1, 16, 16        // ... tmp5
-    addq.ph        s7, t5, t3            // z13
-    subq.ph        v0, t5, t3            // z10
-    addq.ph        t7, s5, s7            // tmp7
-    subq.ph        s5, s5, s7            // tmp11 ...
-    addq.ph        v1, v0, s6            // z5 ...
-    mulq_s.ph      s5, s5, t8            // ... tmp11
-    lw             t8, 8(AT)             // FIX(1.847759065)
-    lw             s4, 0(AT)             // FIX(1.082392200)
-    addq.ph        s0, t0, t7
-    subq.ph        s1, t0, t7
-    mulq_s.ph      v1, v1, t8            // ... z5
-    shll_s.ph      s5, s5, 1             // x2
-    lw             t8, 12(AT)            // FIX(-2.613125930)
-    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
-    shll_s.ph      v0, v0, 1             // x4
-    mulq_s.ph      v0, v0, t8            // tmp12 ...
-    mulq_s.ph      s4, s6, s4            // tmp10 ...
-    shll_s.ph      v1, v1, 1             // x2
-    addiu          a0, a0, 4
-    addiu          a1, a1, 4
-    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
-    shll_s.ph      s6, v0, 1             // x4
-    shll_s.ph      s4, s4, 1             // x2
-    addq.ph        s6, s6, v1            // ... tmp12
-    subq.ph        t5, s6, t7            // tmp6
-    subq.ph        s4, s4, v1            // ... tmp10
-    subq.ph        t3, s5, t5            // tmp5
-    addq.ph        s2, t2, t5
-    addq.ph        t1, s4, t3            // tmp4
-    subq.ph        s3, t2, t5
-    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
-    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
-    addq.ph        v0, t4, t3
-    subq.ph        v1, t4, t3
-    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
-    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
-    addq.ph        v0, t6, t1
-    subq.ph        v1, t6, t1
-    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
-    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
-
-2:
-    bne            a0, t9, 0b
-     addiu         a2, a2, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j              ra
-     nop
-
-END(jsimd_idct_ifast_cols_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
-/*
- * a0     - wsptr
- * a1     - output_buf
- * a2     - output_col
- * a3     - mips_idct_ifast_coefs
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
-    addiu         t9, a0, 128        // end address
-    lui           s8, 0x8080
-    ori           s8, s8, 0x8080
-
-0:
-    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
-    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
-    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
-    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
-    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
-    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
-    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
-    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
-    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
-    precrq.ph.w   t1, s0, t0         // B b
-    ins           t0, s0, 16, 16     // A a
-    bnez          t1, 1f
-     or           s0, t2, s2
-    bnez          s0, 1f
-     or           s0, t4, s4
-    bnez          s0, 1f
-     or           s0, t6, s6
-    bnez          s0, 1f
-     shll_s.ph    s0, t0, 2          // A a
-    lw            a3, 0(a1)
-    lw            AT, 4(a1)
-    precrq.ph.w   t0, s0, s0         // A A
-    ins           s0, s0, 16, 16     // a a
-    addu          a3, a3, a2
-    addu          AT, AT, a2
-    precrq.qb.ph  t0, t0, t0         // A A A A
-    precrq.qb.ph  s0, s0, s0         // a a a a
-    addu.qb       s0, s0, s8
-    addu.qb       t0, t0, s8
-    sw            s0, 0(a3)
-    sw            s0, 4(a3)
-    sw            t0, 0(AT)
-    sw            t0, 4(AT)
-    addiu         a0, a0, 32
-    bne           a0, t9, 0b
-     addiu        a1, a1, 8
-    b             2f
-     nop
-
-1:
-    precrq.ph.w   t3, s2, t2
-    ins           t2, s2, 16, 16
-    precrq.ph.w   t5, s4, t4
-    ins           t4, s4, 16, 16
-    precrq.ph.w   t7, s6, t6
-    ins           t6, s6, 16, 16
-    lw            t8, 4(AT)          // FIX(1.414213562)
-    addq.ph       s4, t0, t4         // tmp10
-    subq.ph       s5, t0, t4         // tmp11
-    subq.ph       s6, t2, t6         // tmp12 ...
-    addq.ph       s7, t2, t6         // tmp13
-    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
-    addq.ph       t0, s4, s7         // tmp0
-    subq.ph       t6, s4, s7         // tmp3
-    shll_s.ph     s6, s6, 1          // x2
-    subq.ph       s6, s6, s7         // ... tmp12
-    addq.ph       t2, s5, s6         // tmp1
-    subq.ph       t4, s5, s6         // tmp2
-    addq.ph       s5, t1, t7         // z11
-    subq.ph       s6, t1, t7         // z12
-    addq.ph       s7, t5, t3         // z13
-    subq.ph       v0, t5, t3         // z10
-    addq.ph       t7, s5, s7         // tmp7
-    subq.ph       s5, s5, s7         // tmp11 ...
-    addq.ph       v1, v0, s6         // z5 ...
-    mulq_s.ph     s5, s5, t8         // ... tmp11
-    lw            t8, 8(AT)          // FIX(1.847759065)
-    lw            s4, 0(AT)          // FIX(1.082392200)
-    addq.ph       s0, t0, t7         // tmp0 + tmp7
-    subq.ph       s7, t0, t7         // tmp0 - tmp7
-    mulq_s.ph     v1, v1, t8         // ... z5
-    lw            a3, 0(a1)
-    lw            t8, 12(AT)         // FIX(-2.613125930)
-    shll_s.ph     s5, s5, 1          // x2
-    addu          a3, a3, a2
-    shll_s.ph     v0, v0, 1          // x4
-    mulq_s.ph     v0, v0, t8         // tmp12 ...
-    mulq_s.ph     s4, s6, s4         // tmp10 ...
-    shll_s.ph     v1, v1, 1          // x2
-    addiu         a0, a0, 32
-    addiu         a1, a1, 8
-    shll_s.ph     s6, v0, 1          // x4
-    shll_s.ph     s4, s4, 1          // x2
-    addq.ph       s6, s6, v1         // ... tmp12
-    shll_s.ph     s0, s0, 2
-    subq.ph       t5, s6, t7         // tmp6
-    subq.ph       s4, s4, v1         // ... tmp10
-    subq.ph       t3, s5, t5         // tmp5
-    shll_s.ph     s7, s7, 2
-    addq.ph       t1, s4, t3         // tmp4
-    addq.ph       s1, t2, t5         // tmp1 + tmp6
-    subq.ph       s6, t2, t5         // tmp1 - tmp6
-    addq.ph       s2, t4, t3         // tmp2 + tmp5
-    subq.ph       s5, t4, t3         // tmp2 - tmp5
-    addq.ph       s4, t6, t1         // tmp3 + tmp4
-    subq.ph       s3, t6, t1         // tmp3 - tmp4
-    shll_s.ph     s1, s1, 2
-    shll_s.ph     s2, s2, 2
-    shll_s.ph     s3, s3, 2
-    shll_s.ph     s4, s4, 2
-    shll_s.ph     s5, s5, 2
-    shll_s.ph     s6, s6, 2
-    precrq.ph.w   t0, s1, s0         // B A
-    ins           s0, s1, 16, 16     // b a
-    precrq.ph.w   t2, s3, s2         // D C
-    ins           s2, s3, 16, 16     // d c
-    precrq.ph.w   t4, s5, s4         // F E
-    ins           s4, s5, 16, 16     // f e
-    precrq.ph.w   t6, s7, s6         // H G
-    ins           s6, s7, 16, 16     // h g
-    precrq.qb.ph  t0, t2, t0         // D C B A
-    precrq.qb.ph  s0, s2, s0         // d c b a
-    precrq.qb.ph  t4, t6, t4         // H G F E
-    precrq.qb.ph  s4, s6, s4         // h g f e
-    addu.qb       s0, s0, s8
-    addu.qb       s4, s4, s8
-    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
-    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
-    lw            a3, -4(a1)
-    addu.qb       t0, t0, s8
-    addu          a3, a3, a2
-    addu.qb       t4, t4, s8
-    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
-    bne           a0, t9, 0b
-     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
-
-2:
-
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
-    j             ra
-     nop
-
-END(jsimd_idct_ifast_rows_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
-/*
- * a0     - data
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
-    lui       t0, 6437
-    ori       t0, 2260
-    lui       t1, 9633
-    ori       t1, 11363
-    lui       t2, 0xd39e
-    ori       t2, 0xe6dc
-    lui       t3, 0xf72d
-    ori       t3, 9633
-    lui       t4, 2261
-    ori       t4, 9633
-    lui       t5, 0xd39e
-    ori       t5, 6437
-    lui       t6, 9633
-    ori       t6, 0xd39d
-    lui       t7, 0xe6dc
-    ori       t7, 2260
-    lui       t8, 4433
-    ori       t8, 10703
-    lui       t9, 0xd630
-    ori       t9, 4433
-    li        s8, 8
-    move      a1, a0
-1:
-    lw        s0, 0(a1)     // tmp0 = 1|0
-    lw        s1, 4(a1)     // tmp1 = 3|2
-    lw        s2, 8(a1)     // tmp2 = 5|4
-    lw        s3, 12(a1)    // tmp3 = 7|6
-    packrl.ph s1, s1, s1    // tmp1 = 2|3
-    packrl.ph s3, s3, s3    // tmp3 = 6|7
-    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
-    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
-    mult      $0, $0        // ac0  = 0
-    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
-    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
-    mult      $ac1, $0, $0  // ac1  = 0
-    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
-    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
-    mult      $ac2, $0, $0  // ac2  = 0
-    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
-    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
-    mult      $ac3, $0, $0  // ac3  = 0
-    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
-    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
-    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
-    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
-    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
-    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
-    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
-    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
-    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
-    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
-    sh        s0, 2(a1)
-    sh        s1, 6(a1)
-    sh        s2, 10(a1)
-    sh        s3, 14(a1)
-    mult      $0, $0        // ac0  = 0
-    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
-    mult      $ac1, $0, $0  // ac1  = 0
-    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
-    sra       s4, s5, 16    // tmp4 = t11
-    addiu     a1, a1, 16
-    addiu     s8, s8, -1
-    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
-    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
-    addu      s2, s5, s4    // tmp2 = t10 + t11
-    subu      s3, s5, s4    // tmp3 = t10 - t11
-    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
-    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
-    sh        s2, -16(a1)
-    sh        s3, -8(a1)
-    sh        s0, -12(a1)
-    bgtz      s8, 1b
-     sh       s1, -4(a1)
-    li        t0, 2260
-    li        t1, 11363
-    li        t2, 9633
-    li        t3, 6436
-    li        t4, 6437
-    li        t5, 2261
-    li        t6, 11362
-    li        t7, 2259
-    li        t8, 4433
-    li        t9, 10703
-    li        a1, 10704
-    li        s8, 8
-
-2:
-    lh        a2, 0(a0)     // 0
-    lh        a3, 16(a0)    // 8
-    lh        v0, 32(a0)    // 16
-    lh        v1, 48(a0)    // 24
-    lh        s4, 64(a0)    // 32
-    lh        s5, 80(a0)    // 40
-    lh        s6, 96(a0)    // 48
-    lh        s7, 112(a0)   // 56
-    addu      s2, v0, s5    // tmp2 = 16 + 40
-    subu      s5, v0, s5    // tmp5 = 16 - 40
-    addu      s3, v1, s4    // tmp3 = 24 + 32
-    subu      s4, v1, s4    // tmp4 = 24 - 32
-    addu      s0, a2, s7    // tmp0 =  0 + 56
-    subu      s7, a2, s7    // tmp7 =  0 - 56
-    addu      s1, a3, s6    // tmp1 =  8 + 48
-    subu      s6, a3, s6    // tmp6 =  8 - 48
-    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
-    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
-    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
-    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
-    mult      s7, t1        // ac0  = tmp7 * c1
-    madd      s4, t0        // ac0 += tmp4 * c0
-    madd      s5, t4        // ac0 += tmp5 * c4
-    madd      s6, t2        // ac0 += tmp6 * c2
-    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
-    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
-    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
-    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
-    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
-    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
-    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
-    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
-    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
-    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
-    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
-    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
-    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
-    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
-    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
-    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
-    addiu     s8, s8, -1
-    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
-    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
-    sh        s0, 16(a0)
-    sh        s1, 48(a0)
-    sh        s2, 80(a0)
-    sh        s3, 112(a0)
-    mult      v0, t8        // ac0  = tmp12 * c8
-    madd      v1, t9        // ac0 += tmp13 * c9
-    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
-    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
-    addiu     a0, a0, 2
-    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
-    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
-    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
-    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
-    sh        s4, -2(a0)
-    sh        s5, 62(a0)
-    sh        s6, 30(a0)
-    bgtz      s8, 2b
-     sh       s7, 94(a0)
-
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
-    jr       ra
-     nop
-
-END(jsimd_fdct_islow_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
-/*
- * a0     - data
- */
-    .set at
-    SAVE_REGS_ON_STACK 8, s0, s1
-    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
-    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
-    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
-    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
-
-    move         v0, a0
-    addiu        v1, v0, 128     // end address
-
-0:
-    lw           t0, 0(v0)       // tmp0 = 1|0
-    lw           t1, 4(v0)       // tmp1 = 3|2
-    lw           t2, 8(v0)       // tmp2 = 5|4
-    lw           t3, 12(v0)      // tmp3 = 7|6
-    packrl.ph    t1, t1, t1      // tmp1 = 2|3
-    packrl.ph    t3, t3, t3      // tmp3 = 6|7
-    subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
-    subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
-    addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
-    addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
-    addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
-    subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
-    sra          t4, t8, 16      // tmp4 = t11
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t9, s1
-    mult         $ac1, $0, $0    // ac1  = 0
-    dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
-    dpsx.w.ph    $ac1, t5, a3    // ac1 += t6*98 + t7*98
-    mult         $ac2, $0, $0    // ac2  = 0
-    dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
-    mult         $ac3, $0, $0    // ac3  = 0
-    dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
-    precrq.ph.w  t0, t5, t7      // t0 = t5|t6
-    addq.ph      t2, t8, t4      // tmp2 = t10 + t11
-    subq.ph      t3, t8, t4      // tmp3 = t10 - t11
-    extr.w       t4, $ac0, 8
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
-    extr.w       t0, $ac1, 8     // t0 = z5
-    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
-    extr.w       t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
-    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
-    add          t6, t1, t0      // t6 = z2
-    add          t7, t7, t0      // t7 = z4
-    subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
-    addq.ph      t8, t5, t8      // t9 = z11 = tmp7 + z3
-    addq.ph      t1, t0, t6      // t1 = z13 + z2
-    subq.ph      t6, t0, t6      // t6 = z13 - z2
-    addq.ph      t0, t8, t7      // t0 = z11 + z4
-    subq.ph      t7, t8, t7      // t7 = z11 - z4
-    addq.ph      t5, t4, t9
-    subq.ph      t4, t9, t4
-    sh           t2, 0(v0)
-    sh           t5, 4(v0)
-    sh           t3, 8(v0)
-    sh           t4, 12(v0)
-    sh           t1, 10(v0)
-    sh           t6, 6(v0)
-    sh           t0, 2(v0)
-    sh           t7, 14(v0)
-    addiu        v0, 16
-    bne          v1, v0, 0b
-     nop
-    move         v0, a0
-    addiu        v1, v0, 16
-
-1:
-    lh           t0, 0(v0)       // 0
-    lh           t1, 16(v0)      // 8
-    lh           t2, 32(v0)      // 16
-    lh           t3, 48(v0)      // 24
-    lh           t4, 64(v0)      // 32
-    lh           t5, 80(v0)      // 40
-    lh           t6, 96(v0)      // 48
-    lh           t7, 112(v0)     // 56
-    add          t8, t0, t7      // t8 = tmp0
-    sub          t7, t0, t7      // t7 = tmp7
-    add          t0, t1, t6      // t0 = tmp1
-    sub          t1, t1, t6      // t1 = tmp6
-    add          t6, t2, t5      // t6 = tmp2
-    sub          t5, t2, t5      // t5 = tmp5
-    add          t2, t3, t4      // t2 = tmp3
-    sub          t3, t3, t4      // t3 = tmp4
-    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
-    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
-    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
-    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
-    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
-    add          s0, t4, t2      // t8 = tmp10+tmp11
-    sub          t4, t4, t2      // t4 = tmp10-tmp11
-    sh           s0, 0(v0)
-    sh           t4, 64(v0)
-    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
-    addq.ph      t4, t8, t2      // t9 = tmp13 + z1
-    subq.ph      t8, t8, t2      // t2 = tmp13 - z1
-    sh           t4, 32(v0)
-    sh           t8, 96(v0)
-    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
-    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
-    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
-    andi         t4, a1, 0xffff
-    mul          s0, t1, t4
-    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
-    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
-    mult         $0, $0          // ac0  = 0
-    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
-    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
-    add          t2, t7, t8      // t2 = tmp7 + z5
-    sub          t7, t7, t8      // t7 = tmp7 - z5
-    andi         t4, a2, 0xffff
-    mul          t8, t3, t4
-    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
-    andi         t4, s1, 0xffff
-    mul          t6, t0, t4
-    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
-    add          t0, t6, t8      // t0 = z3 + z2
-    sub          t1, t6, t8      // t1 = z3 - z2
-    add          t3, t6, s0      // t3 = z3 + z4
-    sub          t4, t6, s0      // t4 = z3 - z4
-    sub          t5, t2, t1      // t5 = dataptr[5]
-    sub          t6, t7, t0      // t6 = dataptr[3]
-    add          t3, t2, t3      // t3 = dataptr[1]
-    add          t4, t7, t4      // t4 = dataptr[7]
-    sh           t5, 80(v0)
-    sh           t6, 48(v0)
-    sh           t3, 16(v0)
-    sh           t4, 112(v0)
-    addiu        v0, 2
-    bne          v0, v1, 1b
-     nop
-
-    RESTORE_REGS_FROM_STACK 8, s0, s1
-
-    j            ra
-     nop
-END(jsimd_fdct_ifast_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - divisors
- * a2     - workspace
- */
-
-    .set at
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2
-
-    addiu   v0, a2, 124  // v0 = workspace_end
-    lh      t0, 0(a2)
-    lh      t1, 0(a1)
-    lh      t2, 128(a1)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0, t3
-    lh      t4, 384(a1)
-    lh      t5, 130(a1)
-    lh      t6, 2(a2)
-    lh      t7, 2(a1)
-    lh      t8, 386(a1)
-
-1:
-    andi    t1, 0xffff
-    add     t9, t0, t2
-    andi    t9, 0xffff
-    mul     v1, t9, t1
-    sra     s0, t6, 15
-    sll     s0, s0, 1
-    addiu   s0, s0, 1
-    addiu   t9, t4, 16
-    srav    v1, v1, t9
-    mul     v1, v1, t3
-    mul     t6, t6, s0
-    andi    t7, 0xffff
-    addiu   a2, a2, 4
-    addiu   a1, a1, 4
-    add     s1, t6, t5
-    andi    s1, 0xffff
-    sh      v1, 0(a0)
-
-    mul     s2, s1, t7
-    addiu   s1, t8, 16
-    srav    s2, s2, s1
-    mul     s2,s2, s0
-    lh      t0, 0(a2)
-    lh      t1, 0(a1)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0, t3
-    lh      t2, 128(a1)
-    lh      t4, 384(a1)
-    lh      t5, 130(a1)
-    lh      t8, 386(a1)
-    lh      t6, 2(a2)
-    lh      t7, 2(a1)
-    sh      s2, 2(a0)
-    lh      t0, 0(a2)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0,t3
-    bne     a2, v0, 1b
-     addiu  a0, a0, 4
-
-    andi    t1, 0xffff
-    add     t9, t0, t2
-    andi    t9, 0xffff
-    mul     v1, t9, t1
-    sra     s0, t6, 15
-    sll     s0, s0, 1
-    addiu   s0, s0, 1
-    addiu   t9, t4, 16
-    srav    v1, v1, t9
-    mul     v1, v1, t3
-    mul     t6, t6, s0
-    andi    t7, 0xffff
-    sh      v1, 0(a0)
-    add     s1, t6, t5
-    andi    s1, 0xffff
-    mul     s2, s1, t7
-    addiu   s1, t8, 16
-    addiu   a2, a2, 4
-    addiu   a1, a1, 4
-    srav    s2, s2, s1
-    mul     s2, s2, s0
-    sh      s2, 2(a0)
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
-
-    j       ra
-     nop
-
-END(jsimd_quantize_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - divisors
- * a2     - workspace
- */
-
-    .set at
-
-    li         t1, 0x46800100     //integer representation 16384.5
-    mtc1       t1, f0
-    li         t0, 63
-0:
-    lwc1       f2, 0(a2)
-    lwc1       f10, 0(a1)
-    lwc1       f4, 4(a2)
-    lwc1       f12, 4(a1)
-    lwc1       f6, 8(a2)
-    lwc1       f14, 8(a1)
-    lwc1       f8, 12(a2)
-    lwc1       f16, 12(a1)
-    madd.s     f2, f0, f2, f10
-    madd.s     f4, f0, f4, f12
-    madd.s     f6, f0, f6, f14
-    madd.s     f8, f0, f8, f16
-    lwc1       f10, 16(a1)
-    lwc1       f12, 20(a1)
-    trunc.w.s  f2, f2
-    trunc.w.s  f4, f4
-    trunc.w.s  f6, f6
-    trunc.w.s  f8, f8
-    lwc1       f14, 24(a1)
-    lwc1       f16, 28(a1)
-    mfc1       t1, f2
-    mfc1       t2, f4
-    mfc1       t3, f6
-    mfc1       t4, f8
-    lwc1       f2, 16(a2)
-    lwc1       f4, 20(a2)
-    lwc1       f6, 24(a2)
-    lwc1       f8, 28(a2)
-    madd.s     f2, f0, f2, f10
-    madd.s     f4, f0, f4, f12
-    madd.s     f6, f0, f6, f14
-    madd.s     f8, f0, f8, f16
-    addiu      t1, t1, -16384
-    addiu      t2, t2, -16384
-    addiu      t3, t3, -16384
-    addiu      t4, t4, -16384
-    trunc.w.s  f2, f2
-    trunc.w.s  f4, f4
-    trunc.w.s  f6, f6
-    trunc.w.s  f8, f8
-    sh         t1, 0(a0)
-    sh         t2, 2(a0)
-    sh         t3, 4(a0)
-    sh         t4, 6(a0)
-    mfc1       t1, f2
-    mfc1       t2, f4
-    mfc1       t3, f6
-    mfc1       t4, f8
-    addiu      t0, t0, -8
-    addiu      a2, a2, 32
-    addiu      a1, a1, 32
-    addiu      t1, t1, -16384
-    addiu      t2, t2, -16384
-    addiu      t3, t3, -16384
-    addiu      t4, t4, -16384
-    sh         t1, 8(a0)
-    sh         t2, 10(a0)
-    sh         t3, 12(a0)
-    sh         t4, 14(a0)
-    bgez       t0, 0b
-     addiu     a0, a0, 16
-
-    j          ra
-     nop
-
-END(jsimd_quantize_float_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
-    addiu     sp, sp, -40
-    move      v0, sp
-    addiu     s2, zero, 29692
-    addiu     s3, zero, -10426
-    addiu     s4, zero, 6967
-    addiu     s5, zero, -5906
-    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
-    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
-    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
-    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
-    mul       t4, t5, t0
-    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
-    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
-    mul       t6, t6, t1
-    mul       t5, t5, t0
-    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
-    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
-    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
-    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
-    mul       t7, t7, t2
-    mult      zero, zero
-    mul       t8, t8, t3
-    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
-    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
-    ins       t6, t5, 16, 16    // t6 = t5|t6
-    sll       t4, t4, 15
-    dpa.w.ph  $ac0, t6, s0
-    lh        t1, 2(a1)
-    lh        t6, 2(a0)
-    ins       t8, t7, 16, 16    // t8 = t7|t8
-    dpa.w.ph  $ac0, t8, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 18(a1)
-    lh        t6, 18(a0)
-    lh        t2, 50(a1)
-    lh        t7, 50(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 82(a1)
-    lh        t2, 82(a0)
-    lh        t3, 114(a1)
-    lh        t4, 114(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 0(v0)
-    sw        t8, 20(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 6(a1)
-    lh        t6, 6(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 22(a1)
-    lh        t6, 22(a0)
-    lh        t2, 54(a1)
-    lh        t7, 54(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 86(a1)
-    lh        t2, 86(a0)
-    lh        t3, 118(a1)
-    lh        t4, 118(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 4(v0)
-    sw        t8, 24(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 10(a1)
-    lh        t6, 10(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 26(a1)
-    lh        t6, 26(a0)
-    lh        t2, 58(a1)
-    lh        t7, 58(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 90(a1)
-    lh        t2, 90(a0)
-    lh        t3, 122(a1)
-    lh        t4, 122(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 8(v0)
-    sw        t8, 28(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 14(a1)
-    lh        t6, 14(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 30(a1)
-    lh        t6, 30(a0)
-    lh        t2, 62(a1)
-    lh        t7, 62(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 94(a1)
-    lh        t2, 94(a0)
-    lh        t3, 126(a1)
-    lh        t4, 126(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 12(v0)
-    sw        t8, 32(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    lw        t9, 0(a2)
-    lw        t3, 0(v0)
-    lw        t7, 4(v0)
-    lw        t1, 8(v0)
-    addu      t9, t9, a3
-    sll       t3, t3, 15
-    subu      t8, t4, t0
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    shra_r.w  t8, t8, 13
-    sw        t0, 16(v0)
-    sw        t8, 36(v0)
-    lw        t5, 12(v0)
-    lw        t6, 16(v0)
-    mult      t7, s2
-    madd      t1, s3
-    madd      t5, s4
-    madd      t6, s5
-    lw        t5, 24(v0)
-    lw        t7, 28(v0)
-    mflo      t0, $ac0
-    lw        t8, 32(v0)
-    lw        t2, 36(v0)
-    mult      $ac1, t5, s2
-    madd      $ac1, t7, s3
-    madd      $ac1, t8, s4
-    madd      $ac1, t2, s5
-    addu      t1, t3, t0
-    subu      t6, t3, t0
-    shra_r.w  t1, t1, 20
-    shra_r.w  t6, t6, 20
-    mflo      t4, $ac1
-    shll_s.w  t1, t1, 24
-    shll_s.w  t6, t6, 24
-    sra       t1, t1, 24
-    sra       t6, t6, 24
-    addiu     t1, t1, 128
-    addiu     t6, t6, 128
-    lw        t0, 20(v0)
-    sb        t1, 0(t9)
-    sb        t6, 1(t9)
-    sll       t0, t0, 15
-    lw        t9, 4(a2)
-    addu      t1, t0, t4
-    subu      t6, t0, t4
-    addu      t9, t9, a3
-    shra_r.w  t1, t1, 20
-    shra_r.w  t6, t6, 20
-    shll_s.w  t1, t1, 24
-    shll_s.w  t6, t6, 24
-    sra       t1, t1, 24
-    sra       t6, t6, 24
-    addiu     t1, t1, 128
-    addiu     t6, t6, 128
-    sb        t1, 0(t9)
-    sb        t6, 1(t9)
-    addiu     sp, sp, 40
-
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
-    j         ra
-     nop
-
-END(jsimd_idct_2x2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
- */
-
-    .set at
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw        v1, 48(sp)
-    move      t0, a1
-    move      t1, v1
-    li        t9, 4
-    li        s0, 0x2e75f93e
-    li        s1, 0x21f9ba79
-    li        s2, 0xecc2efb0
-    li        s3, 0x52031ccd
-
-0:
-    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
-    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
-    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
-    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s5, 0(a0)         // quantptr[0]
-    li        s6, 15137
-    li        s7, 6270
-    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
-    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
-    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
-    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
-    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
-    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
-    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
-    subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
-    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
-    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
-    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
-    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
-    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
-    addu      t3, t2, t6        // tmp10 = tmp0 + z2
-    subu      t4, t2, t6        // tmp10 = tmp0 - z2
-    mult      $ac0, zero, zero
-    mult      $ac1, zero, zero
-    ins       t5, v0, 16, 16
-    ins       t7, t8, 16, 16
-    addiu     t9, t9, -1
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    mflo      s4, $ac0
-    mflo      s5, $ac1
-    addiu     a0, a0, 2
-    addiu     t1, t1, 4
-    addiu     t0, t0, 2
-    addu      t6, t4, s4
-    subu      t5, t4, s4
-    addu      s6, t3, s5
-    subu      s7, t3, s5
-    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
-    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
-    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
-    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
-    sw        t6, 28(t1)
-    sw        t5, 60(t1)
-    sw        s6, -4(t1)
-    bgtz      t9, 0b
-     sw       s7, 92(t1)
-    // second loop three pass
-    li        t9, 3
-1:
-    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
-    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
-    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
-    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
-    li        s6, 15137
-    li        s7, 6270
-    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
-    mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
-    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
-    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
-    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
-    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
-    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
-    subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
-    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
-    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
-    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
-    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
-    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
-    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
-    addu      t3, t2, v0        // tmp10 = tmp0 + z2
-    subu      t4, t2, v0        // tmp10 = tmp0 - z2
-    mult      $ac0, zero, zero
-    mult      $ac1, zero, zero
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    mflo      t5, $ac0
-    mflo      t6, $ac1
-    addiu     t9, t9, -1
-    addiu     t0, t0, 2
-    addiu     a0, a0, 2
-    addiu     t1, t1, 4
-    addu      s5, t4, t5
-    subu      s4, t4, t5
-    addu      s6, t3, t6
-    subu      s7, t3, t6
-    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
-    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
-    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
-    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
-    sw        s5, 32(t1)
-    sw        s4, 64(t1)
-    sw        s6, 0(t1)
-    bgtz      t9, 1b
-     sw       s7, 96(t1)
-    move      t1, v1
-    li        s4, 15137
-    lw        s6, 8(t1)         // wsptr[2]
-    li        s5, 6270
-    lw        s7, 24(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 0(t1)         // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 28(t1)        // wsptr[7]
-    lh        t6, 20(t1)        // wsptr[5]
-    lh        t7, 12(t1)        // wsptr[3]
-    lh        t8, 4(t1)         // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 0(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    // 2
-    li        s4, 15137
-    lw        s6, 40(t1)        // wsptr[2]
-    li        s5, 6270
-    lw        s7, 56(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 32(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 60(t1)        // wsptr[7]
-    lh        t6, 52(t1)        // wsptr[5]
-    lh        t7, 44(t1)        // wsptr[3]
-    lh        t8, 36(t1)        // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
-    sll       s4, t9, 2
-    lw        v0, 4(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    // 3
-    li        s4, 15137
-    lw        s6, 72(t1)        // wsptr[2]
-    li        s5, 6270
-    lw        s7, 88(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 64(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 92(t1)        // wsptr[7]
-    lh        t6, 84(t1)        // wsptr[5]
-    lh        t7, 76(t1)        // wsptr[3]
-    lh        t8, 68(t1)        // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 8(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    li        s4, 15137
-    lw        s6, 104(t1)       // wsptr[2]
-    li        s5, 6270
-    lw        s7, 120(t1)       // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 96(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
-    lh        t5, 124(t1)       // wsptr[7]
-    lh        t6, 116(t1)       // wsptr[5]
-    lh        t7, 108(t1)       // wsptr[3]
-    lh        t8, 100(t1)       // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2;
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2;
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 12(a2)        // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-END(jsimd_idct_4x4_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu     sp, sp, -144
-    move      v0, sp
-    addiu     v1, v0, 24
-    addiu     t9, zero, 5793
-    addiu     s0, zero, 10033
-    addiu     s1, zero, 2998
-
-1:
-    lh        s2, 0(a0)   // q0 = quantptr[ 0]
-    lh        s3, 32(a0)  // q1 = quantptr[16]
-    lh        s4, 64(a0)  // q2 = quantptr[32]
-    lh        t2, 64(a1)  // tmp2 = inptr[32]
-    lh        t1, 32(a1)  // tmp1 = inptr[16]
-    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
-    mul       t2, t2, s4  // tmp2 = tmp2 * q2
-    mul       t1, t1, s3  // tmp1 = tmp1 * q1
-    mul       t0, t0, s2  // tmp0 = tmp0 * q0
-    lh        t6, 16(a1)  // z1 = inptr[ 8]
-    lh        t8, 80(a1)  // z3 = inptr[40]
-    lh        t7, 48(a1)  // z2 = inptr[24]
-    lh        s2, 16(a0)  // q0 = quantptr[ 8]
-    lh        s4, 80(a0)  // q2 = quantptr[40]
-    lh        s3, 48(a0)  // q1 = quantptr[24]
-    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
-    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
-    sll       t0, t0, 13  // tmp0 = tmp0 << 13
-    mul       t6, t6, s2  // z1 = z1 * q0
-    mul       t8, t8, s4  // z3 = z3 * q2
-    mul       t7, t7, s3  // z2 = z2 * q1
-    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
-    sll       t2, t2, 1   // tmp2 = tmp2 << 2
-    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
-    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
-    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
-    addu      t1, t6, t8  // tmp1 = z1 + z3
-    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
-    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
-    subu      t2, t6, t8  // tmp2 = z1 - z3
-    subu      t2, t2, t7  // tmp2 = tmp2 - z2
-    sll       t2, t2, 2   // tmp2 = tmp2 << 2
-    addu      t0, t6, t7  // tmp0 = z1 + z2
-    sll       t0, t0, 13  // tmp0 = tmp0 << 13
-    subu      s2, t8, t7  // q0 = z3 - z2
-    sll       s2, s2, 13  // q0 = q0 << 13
-    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
-    addu      t1, s2, t1  // tmp1 = q0 + tmp1
-    addu      s2, t4, t2  // q0 = tmp11 + tmp2
-    subu      s3, t4, t2  // q1 = tmp11 - tmp2
-    addu      t6, t3, t0  // z1 = tmp10 + tmp0
-    subu      t7, t3, t0  // z2 = tmp10 - tmp0
-    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
-    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
-    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
-    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
-    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
-    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
-    sw        s2, 24(v0)
-    sw        s3, 96(v0)
-    sw        t6, 0(v0)
-    sw        t7, 120(v0)
-    sw        t4, 48(v0)
-    sw        t5, 72(v0)
-    addiu     v0, v0, 4
-    addiu     a1, a1, 2
-    bne       v0, v1, 1b
-     addiu    a0, a0, 2
-
-    /* Pass 2: process 6 rows from work array, store into output array. */
-    move      v0, sp
-    addiu     v1, v0, 144
-
-2:
-    lw        t0, 0(v0)
-    lw        t2, 16(v0)
-    lw        s5, 0(a2)
-    addiu     t0, t0, 16
-    sll       t0, t0, 13
-    mul       t3, t2, t9
-    lw        t6, 4(v0)
-    lw        t8, 20(v0)
-    lw        t7, 12(v0)
-    addu      s5, s5, a3
-    addu      s6, t6, t8
-    mul       s6, s6, s1
-    addu      t1, t0, t3
-    subu      t4, t0, t3
-    subu      t4, t4, t3
-    lw        t3, 8(v0)
-    mul       t0, t3, s0
-    addu      s7, t6, t7
-    sll       s7, s7, 13
-    addu      s7, s6, s7
-    subu      t2, t8, t7
-    sll       t2, t2, 13
-    addu      t2, s6, t2
-    subu      s6, t6, t7
-    subu      s6, s6, t8
-    sll       s6, s6, 13
-    addu      t3, t1, t0
-    subu      t5, t1, t0
-    addu      t6, t3, s7
-    subu      t3, t3, s7
-    addu      t7, t4, s6
-    subu      t4, t4, s6
-    addu      t8, t5, t2
-    subu      t5, t5, t2
-    shll_s.w  t6, t6, 6
-    shll_s.w  t3, t3, 6
-    shll_s.w  t7, t7, 6
-    shll_s.w  t4, t4, 6
-    shll_s.w  t8, t8, 6
-    shll_s.w  t5, t5, 6
-    sra       t6, t6, 24
-    addiu     t6, t6, 128
-    sra       t3, t3, 24
-    addiu     t3, t3, 128
-    sb        t6, 0(s5)
-    sra       t7, t7, 24
-    addiu     t7, t7, 128
-    sb        t3, 5(s5)
-    sra       t4, t4, 24
-    addiu     t4, t4, 128
-    sb        t7, 1(s5)
-    sra       t8, t8, 24
-    addiu     t8, t8, 128
-    sb        t4, 4(s5)
-    addiu     v0, v0, 24
-    sra       t5, t5, 24
-    addiu     t5, t5, 128
-    sb        t8, 2(s5)
-    addiu     a2, a2,  4
-    bne       v0, v1, 2b
-     sb       t5, 3(s5)
-
-    addiu     sp, sp, 144
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-
-END(jsimd_idct_6x6_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - workspace
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    li         a3, 8
-
-1:
-    // odd part
-    lh         t0, 48(a1)
-    lh         t1, 48(a0)
-    lh         t2, 16(a1)
-    lh         t3, 16(a0)
-    lh         t4, 80(a1)
-    lh         t5, 80(a0)
-    lh         t6, 112(a1)
-    lh         t7, 112(a0)
-    mul        t0, t0, t1    // z2
-    mul        t1, t2, t3    // z1
-    mul        t2, t4, t5    // z3
-    mul        t3, t6, t7    // z4
-    li         t4, 10703     // FIX(1.306562965)
-    li         t5, 4433      // FIX_0_541196100
-    li         t6, 7053      // FIX(0.860918669)
-    mul        t4, t0,t4     // tmp11
-    mul        t5, t0,t5     // -tmp14
-    addu       t7, t1,t2     // tmp10
-    addu       t8, t7,t3     // tmp10 + z4
-    mul        t6, t6, t8    // tmp15
-    li         t8, 2139      // FIX(0.261052384)
-    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
-    li         t7, 2295      // FIX(0.280143716)
-    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
-    addu       t9, t2, t3    // z3 + z4
-    li         s0, 8565      // FIX(1.045510580)
-    mul        t9, t9, s0    // -tmp13
-    li         s0, 12112     // FIX(1.478575242)
-    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242)
-    li         s1, 12998     // FIX(1.586706681)
-    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
-    li         s2, 5540      // FIX(0.676326758)
-    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
-    li         s3, 16244     // FIX(1.982889723)
-    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
-    subu       t1, t1, t3    // z1-=z4
-    subu       t0, t0, t2    // z2-=z3
-    addu       t2, t0, t1    // z1+z2
-    li         t3, 4433      // FIX_0_541196100
-    mul        t2, t2, t3    // z3
-    li         t3, 6270      // FIX_0_765366865
-    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
-    li         t3, 15137     // FIX_0_765366865
-    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
-    addu       t8, t6, t8    // tmp12
-    addu       t3, t8, t4    // tmp12 + tmp11
-    addu       t3, t3, t7    // tmp10
-    subu       t8, t8, t9    // tmp12 + tmp13
-    addu       s0, t5, s0
-    subu       t8, t8, s0    // tmp12
-    subu       t9, t6, t9
-    subu       s1, s1, t4
-    addu       t9, t9, s1    // tmp13
-    subu       t6, t6, t5
-    subu       t6, t6, s2
-    subu       t6, t6, s3    // tmp15
-    // even part start
-    lh         t4, 64(a1)
-    lh         t5, 64(a0)
-    lh         t7, 32(a1)
-    lh         s0, 32(a0)
-    lh         s1, 0(a1)
-    lh         s2, 0(a0)
-    lh         s3, 96(a1)
-    lh         v0, 96(a0)
-    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
-    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
-    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
-    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
-    // odd part end
-    addu       t1, t2, t1    // tmp11
-    subu       t0, t2, t0    // tmp14
-    // update counter and pointers
-    addiu      a3, a3, -1
-    addiu      a0, a0, 2
-    addiu      a1, a1, 2
-    // even part rest
-    li         s1, 10033
-    li         s2, 11190
-    mul        t4, t4, s1    // z4
-    mul        s1, t5, s2    // z4
-    sll        t5, t5, 13    // z1
-    sll        t7, t7, 13
-    addiu      t7, t7, 1024  // z3
-    sll        s0, s0, 13    // z2
-    addu       s2, t7, t4    // tmp10
-    subu       t4, t7, t4    // tmp11
-    subu       s3, t5, s0    // tmp12
-    addu       t2, t7, s3    // tmp21
-    subu       s3, t7, s3    // tmp24
-    addu       t7, s1, s0    // tmp12
-    addu       v0, s2, t7    // tmp20
-    subu       s2, s2, t7    // tmp25
-    subu       s1, s1, t5    // z4 - z1
-    subu       s1, s1, s0    // tmp12
-    addu       s0, t4, s1    // tmp22
-    subu       t4, t4, s1    // tmp23
-    // final output stage
-    addu       t5, v0, t3
-    subu       v0, v0, t3
-    addu       t3, t2, t1
-    subu       t2, t2, t1
-    addu       t1, s0, t8
-    subu       s0, s0, t8
-    addu       t8, t4, t9
-    subu       t4, t4, t9
-    addu       t9, s3, t0
-    subu       s3, s3, t0
-    addu       t0, s2, t6
-    subu       s2, s2, t6
-    sra        t5, t5, 11
-    sra        t3, t3, 11
-    sra        t1, t1, 11
-    sra        t8, t8, 11
-    sra        t9, t9, 11
-    sra        t0, t0, 11
-    sra        s2, s2, 11
-    sra        s3, s3, 11
-    sra        t4, t4, 11
-    sra        s0, s0, 11
-    sra        t2, t2, 11
-    sra        v0, v0, 11
-    sw         t5, 0(a2)
-    sw         t3, 32(a2)
-    sw         t1, 64(a2)
-    sw         t8, 96(a2)
-    sw         t9, 128(a2)
-    sw         t0, 160(a2)
-    sw         s2, 192(a2)
-    sw         s3, 224(a2)
-    sw         t4, 256(a2)
-    sw         s0, 288(a2)
-    sw         t2, 320(a2)
-    sw         v0, 352(a2)
-    bgtz       a3, 1b
-     addiu     a2, a2, 4
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j          ra
-     nop
-
-END(jsimd_idct_12x12_pass1_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
-/*
- * a0     - workspace
- * a1     - output
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    li        a3, 12
-
-1:
-    // Odd part
-    lw        t0, 12(a0)
-    lw        t1, 4(a0)
-    lw        t2, 20(a0)
-    lw        t3, 28(a0)
-    li        t4, 10703     // FIX(1.306562965)
-    li        t5, 4433      // FIX_0_541196100
-    mul       t4, t0, t4    // tmp11
-    mul       t5, t0, t5    // -tmp14
-    addu      t6, t1, t2    // tmp10
-    li        t7, 2139      // FIX(0.261052384)
-    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
-    addu      t6, t6, t3    // tmp10 + z4
-    li        t8, 7053      // FIX(0.860918669)
-    mul       t6, t6, t8    // tmp15
-    li        t8, 2295      // FIX(0.280143716)
-    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
-    addu      t9, t2, t3    // z3 + z4
-    li        s0, 8565      // FIX(1.045510580)
-    mul       t9, t9, s0    // -tmp13
-    li        s0, 12112     // FIX(1.478575242)
-    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
-    li        s1, 12998     // FIX(1.586706681)
-    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
-    li        s2, 5540      // FIX(0.676326758)
-    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
-    li        s3, 16244     // FIX(1.982889723)
-    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
-    subu      t1, t1, t3    // z1 -= z4
-    subu      t0, t0, t2    // z2 -= z3
-    addu      t2, t1, t0    // z1 + z2
-    li        t3, 4433      // FIX_0_541196100
-    mul       t2, t2, t3    // z3
-    li        t3, 6270      // FIX_0_765366865
-    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
-    li        t3, 15137     // FIX_1_847759065
-    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
-    addu      t3, t6, t7    // tmp12
-    addu      t7, t3, t4
-    addu      t7, t7, t8    // tmp10
-    subu      t3, t3, t9
-    subu      t3, t3, t5
-    subu      t3, t3, s0    // tmp12
-    subu      t9, t6, t9
-    subu      t9, t9, t4
-    addu      t9, t9, s1    // tmp13
-    subu      t6, t6, t5
-    subu      t6, t6, s2
-    subu      t6, t6, s3    // tmp15
-    addu      t1, t2, t1    // tmp11
-    subu      t0, t2, t0    // tmp14
-    // even part
-    lw        t2, 16(a0)    // z4
-    lw        t4, 8(a0)     // z1
-    lw        t5, 0(a0)     // z3
-    lw        t8, 24(a0)    // z2
-    li        s0, 10033     // FIX(1.224744871)
-    li        s1, 11190     // FIX(1.366025404)
-    mul       t2, t2, s0    // z4
-    mul       s0, t4, s1    // z4
-    addiu     t5, t5, 0x10
-    sll       t5, t5, 13    // z3
-    sll       t4, t4, 13    // z1
-    sll       t8, t8, 13    // z2
-    subu      s1, t4, t8    // tmp12
-    addu      s2, t5, t2    // tmp10
-    subu      t2, t5, t2    // tmp11
-    addu      s3, t5, s1    // tmp21
-    subu      s1, t5, s1    // tmp24
-    addu      t5, s0, t8    // tmp12
-    addu      v0, s2, t5    // tmp20
-    subu      t5, s2, t5    // tmp25
-    subu      t4, s0, t4
-    subu      t4, t4, t8    // tmp12
-    addu      t8, t2, t4    // tmp22
-    subu      t2, t2, t4    // tmp23
-    // increment counter and pointers
-    addiu     a3, a3, -1
-    addiu     a0, a0, 32
-    // Final stage
-    addu      t4, v0, t7
-    subu      v0, v0, t7
-    addu      t7, s3, t1
-    subu      s3, s3, t1
-    addu      t1, t8, t3
-    subu      t8, t8, t3
-    addu      t3, t2, t9
-    subu      t2, t2, t9
-    addu      t9, s1, t0
-    subu      s1, s1, t0
-    addu      t0, t5, t6
-    subu      t5, t5, t6
-    sll       t4, t4, 4
-    sll       t7, t7, 4
-    sll       t1, t1, 4
-    sll       t3, t3, 4
-    sll       t9, t9, 4
-    sll       t0, t0, 4
-    sll       t5, t5, 4
-    sll       s1, s1, 4
-    sll       t2, t2, 4
-    sll       t8, t8, 4
-    sll       s3, s3, 4
-    sll       v0, v0, 4
-    shll_s.w  t4, t4, 2
-    shll_s.w  t7, t7, 2
-    shll_s.w  t1, t1, 2
-    shll_s.w  t3, t3, 2
-    shll_s.w  t9, t9, 2
-    shll_s.w  t0, t0, 2
-    shll_s.w  t5, t5, 2
-    shll_s.w  s1, s1, 2
-    shll_s.w  t2, t2, 2
-    shll_s.w  t8, t8, 2
-    shll_s.w  s3, s3, 2
-    shll_s.w  v0, v0, 2
-    srl       t4, t4, 24
-    srl       t7, t7, 24
-    srl       t1, t1, 24
-    srl       t3, t3, 24
-    srl       t9, t9, 24
-    srl       t0, t0, 24
-    srl       t5, t5, 24
-    srl       s1, s1, 24
-    srl       t2, t2, 24
-    srl       t8, t8, 24
-    srl       s3, s3, 24
-    srl       v0, v0, 24
-    lw        t6, 0(a1)
-    addiu     t4, t4, 0x80
-    addiu     t7, t7, 0x80
-    addiu     t1, t1, 0x80
-    addiu     t3, t3, 0x80
-    addiu     t9, t9, 0x80
-    addiu     t0, t0, 0x80
-    addiu     t5, t5, 0x80
-    addiu     s1, s1, 0x80
-    addiu     t2, t2, 0x80
-    addiu     t8, t8, 0x80
-    addiu     s3, s3, 0x80
-    addiu     v0, v0, 0x80
-    sb        t4, 0(t6)
-    sb        t7, 1(t6)
-    sb        t1, 2(t6)
-    sb        t3, 3(t6)
-    sb        t9, 4(t6)
-    sb        t0, 5(t6)
-    sb        t5, 6(t6)
-    sb        s1, 7(t6)
-    sb        t2, 8(t6)
-    sb        t8, 9(t6)
-    sb        s3, 10(t6)
-    sb        v0, 11(t6)
-    bgtz      a3, 1b
-     addiu    a1, a1, 4
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    jr        ra
-     nop
-
-END(jsimd_idct_12x12_pass2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
-/*
- * a0     - sample_data
- * a1     - start_col
- * a2     - workspace
- */
-
-    lw             t0, 0(a0)
-    li             t7, 0xff80ff80
-    addu           t0, t0, a1
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    lw             t0, 4(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 0(a2)
-    usw            t4, 4(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 8(a2)
-    usw            t6, 12(a2)
-
-    lw             t0, 8(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 16(a2)
-    usw            t4, 20(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 24(a2)
-    usw            t6, 28(a2)
-
-    lw             t0, 12(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 32(a2)
-    usw            t4, 36(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 40(a2)
-    usw            t6, 44(a2)
-
-    lw             t0, 16(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 48(a2)
-    usw            t4, 52(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 56(a2)
-    usw            t6, 60(a2)
-
-    lw             t0, 20(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 64(a2)
-    usw            t4, 68(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 72(a2)
-    usw            t6, 76(a2)
-
-    lw             t0, 24(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 80(a2)
-    usw            t4, 84(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 88(a2)
-    usw            t6, 92(a2)
-
-    lw             t0, 28(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 96(a2)
-    usw            t4, 100(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 104(a2)
-    usw            t6, 108(a2)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 112(a2)
-    usw            t4, 116(a2)
-    usw            t5, 120(a2)
-    usw            t6, 124(a2)
-
-    j              ra
-     nop
-
-END(jsimd_convsamp_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
-/*
- * a0     - sample_data
- * a1     - start_col
- * a2     - workspace
- */
-
-    .set at
-
-    lw       t0, 0(a0)
-    addu     t0, t0, a1
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 4(a0)
-    swc1     f2, 0(a2)
-    swc1     f4, 4(a2)
-    swc1     f6, 8(a2)
-    addu     t0, t0, a1
-    swc1     f8, 12(a2)
-    swc1     f10, 16(a2)
-    swc1     f12, 20(a2)
-    swc1     f14, 24(a2)
-    swc1     f16, 28(a2)
-    //elemr 1
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 8(a0)
-    swc1     f2, 32(a2)
-    swc1     f4, 36(a2)
-    swc1     f6, 40(a2)
-    addu     t0, t0, a1
-    swc1     f8, 44(a2)
-    swc1     f10, 48(a2)
-    swc1     f12, 52(a2)
-    swc1     f14, 56(a2)
-    swc1     f16, 60(a2)
-    //elemr 2
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 12(a0)
-    swc1     f2, 64(a2)
-    swc1     f4, 68(a2)
-    swc1     f6, 72(a2)
-    addu     t0, t0, a1
-    swc1     f8, 76(a2)
-    swc1     f10, 80(a2)
-    swc1     f12, 84(a2)
-    swc1     f14, 88(a2)
-    swc1     f16, 92(a2)
-    //elemr 3
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 16(a0)
-    swc1     f2, 96(a2)
-    swc1     f4, 100(a2)
-    swc1     f6, 104(a2)
-    addu     t0, t0, a1
-    swc1     f8, 108(a2)
-    swc1     f10, 112(a2)
-    swc1     f12, 116(a2)
-    swc1     f14, 120(a2)
-    swc1     f16, 124(a2)
-    //elemr 4
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 20(a0)
-    swc1     f2, 128(a2)
-    swc1     f4, 132(a2)
-    swc1     f6, 136(a2)
-    addu     t0, t0, a1
-    swc1     f8, 140(a2)
-    swc1     f10, 144(a2)
-    swc1     f12, 148(a2)
-    swc1     f14, 152(a2)
-    swc1     f16, 156(a2)
-    //elemr 5
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 24(a0)
-    swc1     f2, 160(a2)
-    swc1     f4, 164(a2)
-    swc1     f6, 168(a2)
-    addu     t0, t0, a1
-    swc1     f8, 172(a2)
-    swc1     f10, 176(a2)
-    swc1     f12, 180(a2)
-    swc1     f14, 184(a2)
-    swc1     f16, 188(a2)
-    //elemr 6
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 28(a0)
-    swc1     f2, 192(a2)
-    swc1     f4, 196(a2)
-    swc1     f6, 200(a2)
-    addu     t0, t0, a1
-    swc1     f8, 204(a2)
-    swc1     f10, 208(a2)
-    swc1     f12, 212(a2)
-    swc1     f14, 216(a2)
-    swc1     f16, 220(a2)
-    //elemr 7
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    swc1     f2, 224(a2)
-    swc1     f4, 228(a2)
-    swc1     f6, 232(a2)
-    swc1     f8, 236(a2)
-    swc1     f10, 240(a2)
-    swc1     f12, 244(a2)
-    swc1     f14, 248(a2)
-    swc1     f16, 252(a2)
-
-    j        ra
-     nop
-
-END(jsimd_convsamp_float_mips_dspr2)
-
-/*****************************************************************************/
diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h
deleted file mode 100644
index 499e34b..0000000
--- a/simd/jsimd_mips_dspr2_asm.h
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
- *           Darko Laus       (darko.laus@imgtec.com)
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define zero $0
-#define AT   $1
-#define v0   $2
-#define v1   $3
-#define a0   $4
-#define a1   $5
-#define a2   $6
-#define a3   $7
-#define t0   $8
-#define t1   $9
-#define t2   $10
-#define t3   $11
-#define t4   $12
-#define t5   $13
-#define t6   $14
-#define t7   $15
-#define s0   $16
-#define s1   $17
-#define s2   $18
-#define s3   $19
-#define s4   $20
-#define s5   $21
-#define s6   $22
-#define s7   $23
-#define t8   $24
-#define t9   $25
-#define k0   $26
-#define k1   $27
-#define gp   $28
-#define sp   $29
-#define fp   $30
-#define s8   $30
-#define ra   $31
-
-#define f0   $f0
-#define f1   $f1
-#define f2   $f2
-#define f3   $f3
-#define f4   $f4
-#define f5   $f5
-#define f6   $f6
-#define f7   $f7
-#define f8   $f8
-#define f9   $f9
-#define f10  $f10
-#define f11  $f11
-#define f12  $f12
-#define f13  $f13
-#define f14  $f14
-#define f15  $f15
-#define f16  $f16
-#define f17  $f17
-#define f18  $f18
-#define f19  $f19
-#define f20  $f20
-#define f21  $f21
-#define f22  $f22
-#define f23  $f23
-#define f24  $f24
-#define f25  $f25
-#define f26  $f26
-#define f27  $f27
-#define f28  $f28
-#define f29  $f29
-#define f30  $f30
-#define f31  $f31
-
-/*
- * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
- */
-#define LEAF_MIPS32R2(symbol)                           \
-                .globl  symbol;                         \
-                .align  2;                              \
-                .type   symbol, @function;              \
-                .ent    symbol, 0;                      \
-symbol:         .frame  sp, 0, ra;                      \
-                .set    push;                           \
-                .set    arch=mips32r2;                  \
-                .set    noreorder;                      \
-                .set    noat;
-
-/*
- * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
- */
-#define LEAF_MIPS_DSPR2(symbol)                         \
-LEAF_MIPS32R2(symbol)                                   \
-                .set    dspr2;
-
-/*
- * END - mark end of function
- */
-#define END(function)                                   \
-                .set    pop;                            \
-                .end    function;                       \
-                .size   function,.-function
-
-/*
- * Checks if stack offset is big enough for storing/restoring regs_num
- * number of register to/from stack. Stack offset must be greater than
- * or equal to the number of bytes needed for storing registers (regs_num*4).
- * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
- * preserved for input arguments of the functions, already stored in a0-a3),
- * stack size can be further optimized by utilizing this space.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
-
-/*
- * Saves set of registers on stack. Maximum number of registers that
- * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
- * Stack offset is number of bytes that are added to stack pointer (sp)
- * before registers are pushed in order to provide enough space on stack
- * (offset must be multiple of 4, and must be big enough, as described by
- * CHECK_STACK_OFFSET macro). This macro is intended to be used in
- * combination with RESTORE_REGS_FROM_STACK macro. Example:
- *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
-                          r2  = 0, r3  = 0, r4  = 0, \
-                          r5  = 0, r6  = 0, r7  = 0, \
-                          r8  = 0, r9  = 0, r10 = 0, \
-                          r11 = 0, r12 = 0, r13 = 0, \
-                          r14 = 0
-    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
-    .error "Stack offset must be pozitive and multiple of 4."
-    .endif
-    .if \stack_offset != 0
-    addiu           sp, sp, -\stack_offset
-    .endif
-    sw              \r1, 0(sp)
-    .if \r2 != 0
-    sw              \r2, 4(sp)
-    .endif
-    .if \r3 != 0
-    sw              \r3, 8(sp)
-    .endif
-    .if \r4 != 0
-    sw              \r4, 12(sp)
-    .endif
-    .if \r5 != 0
-    CHECK_STACK_OFFSET 5, \stack_offset
-    sw              \r5, 16(sp)
-    .endif
-    .if \r6 != 0
-    CHECK_STACK_OFFSET 6, \stack_offset
-    sw              \r6, 20(sp)
-    .endif
-    .if \r7 != 0
-    CHECK_STACK_OFFSET 7, \stack_offset
-    sw              \r7, 24(sp)
-    .endif
-    .if \r8 != 0
-    CHECK_STACK_OFFSET 8, \stack_offset
-    sw              \r8, 28(sp)
-    .endif
-    .if \r9 != 0
-    CHECK_STACK_OFFSET 9, \stack_offset
-    sw              \r9, 32(sp)
-    .endif
-    .if \r10 != 0
-    CHECK_STACK_OFFSET 10, \stack_offset
-    sw              \r10, 36(sp)
-    .endif
-    .if \r11 != 0
-    CHECK_STACK_OFFSET 11, \stack_offset
-    sw              \r11, 40(sp)
-    .endif
-    .if \r12 != 0
-    CHECK_STACK_OFFSET 12, \stack_offset
-    sw              \r12, 44(sp)
-    .endif
-    .if \r13 != 0
-    CHECK_STACK_OFFSET 13, \stack_offset
-    sw              \r13, 48(sp)
-    .endif
-    .if \r14 != 0
-    CHECK_STACK_OFFSET 14, \stack_offset
-    sw              \r14, 52(sp)
-    .endif
-.endm
-
-/*
- * Restores set of registers from stack. Maximum number of registers that
- * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
- * Stack offset is number of bytes that are added to stack pointer (sp)
- * after registers are restored (offset must be multiple of 4, and must
- * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
- * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
- * Example:
- *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
-                               r2  = 0, r3  = 0, r4  = 0, \
-                               r5  = 0, r6  = 0, r7  = 0, \
-                               r8  = 0, r9  = 0, r10 = 0, \
-                               r11 = 0, r12 = 0, r13 = 0, \
-                               r14 = 0
-    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
-    .error "Stack offset must be pozitive and multiple of 4."
-    .endif
-    lw              \r1, 0(sp)
-    .if \r2 != 0
-    lw              \r2, 4(sp)
-    .endif
-    .if \r3 != 0
-    lw              \r3, 8(sp)
-    .endif
-    .if \r4 != 0
-    lw              \r4, 12(sp)
-    .endif
-    .if \r5 != 0
-    CHECK_STACK_OFFSET 5, \stack_offset
-    lw              \r5, 16(sp)
-    .endif
-    .if \r6 != 0
-    CHECK_STACK_OFFSET 6, \stack_offset
-    lw              \r6, 20(sp)
-    .endif
-    .if \r7 != 0
-    CHECK_STACK_OFFSET 7, \stack_offset
-    lw              \r7, 24(sp)
-    .endif
-    .if \r8 != 0
-    CHECK_STACK_OFFSET 8, \stack_offset
-    lw              \r8, 28(sp)
-    .endif
-    .if \r9 != 0
-    CHECK_STACK_OFFSET 9, \stack_offset
-    lw              \r9, 32(sp)
-    .endif
-    .if \r10 != 0
-    CHECK_STACK_OFFSET 10, \stack_offset
-    lw              \r10, 36(sp)
-    .endif
-    .if \r11 != 0
-    CHECK_STACK_OFFSET 11, \stack_offset
-    lw              \r11, 40(sp)
-    .endif
-    .if \r12 != 0
-    CHECK_STACK_OFFSET 12, \stack_offset
-    lw              \r12, 44(sp)
-    .endif
-    .if \r13 != 0
-    CHECK_STACK_OFFSET 13, \stack_offset
-    lw              \r13, 48(sp)
-    .endif
-    .if \r14 != 0
-    CHECK_STACK_OFFSET 14, \stack_offset
-    lw              \r14, 52(sp)
-    .endif
-    .if \stack_offset != 0
-    addiu           sp, sp, \stack_offset
-    .endif
-.endm
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
deleted file mode 100644
index 47dd746..0000000
--- a/simd/jsimd_powerpc.c
+++ /dev/null
@@ -1,852 +0,0 @@
-/*
- * jsimd_powerpc.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * PowerPC architecture.
- */
-
-#ifdef __amigaos4__
-/* This must be defined first as it re-defines GLOBAL otherwise */
-#include <proto/exec.h>
-#endif
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#if defined(__OpenBSD__)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
-  char *p;
-  if (*feature == 0)
-    return 0;
-  if (strncmp(buffer, "cpu", 3) != 0)
-    return 0;
-  buffer += 3;
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'feature' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, feature))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(feature);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-  simd_support = 0;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_feature(buffer, "altivec"))
-        simd_support |= JSIMD_ALTIVEC;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#elif defined(__amigaos4__)
-  uint32 altivec = 0;
-#elif defined(__OpenBSD__)
-  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
-  int altivec;
-  size_t len = sizeof(altivec);
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__ALTIVEC__) || defined(__APPLE__)
-  simd_support |= JSIMD_ALTIVEC;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#elif defined(__amigaos4__)
-  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
-  if(altivec == VECTORTYPE_ALTIVEC)
-    simd_support |= JSIMD_ALTIVEC;
-#elif defined(__OpenBSD__)
-  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
-    simd_support |= JSIMD_ALTIVEC;
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEALTIVEC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ALTIVEC;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_extrgb_ycc_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_extrgbx_ycc_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_extbgr_ycc_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_extbgrx_ycc_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_extxbgr_ycc_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_extxrgb_ycc_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_rgb_ycc_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_extrgb_gray_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_extrgbx_gray_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_extbgr_gray_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_extbgrx_gray_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_extxbgr_gray_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_extxrgb_gray_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_rgb_gray_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_ycc_extrgb_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_ycc_extrgbx_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_ycc_extbgr_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_ycc_extbgrx_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_ycc_extxbgr_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_ycc_extxrgb_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_ycc_rgb_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
-                                compptr->v_samp_factor,
-                                compptr->width_in_blocks,
-                                input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
-                                compptr->v_samp_factor,
-                                compptr->width_in_blocks,
-                                input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
-                              input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
-                              input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
-                                    compptr->downsampled_width, input_data,
-                                    output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
-                                    compptr->downsampled_width, input_data,
-                                    output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
-      break;
-    default:
-      altivecfct=jsimd_h2v2_merged_upsample_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
-      break;
-    default:
-      altivecfct=jsimd_h2v1_merged_upsample_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_altivec(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_altivec(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return NULL;
-}
diff --git a/simd/jsimd_x86_64.c b/simd/jsimd_x86_64.c
deleted file mode 100644
index a62bcdb..0000000
--- a/simd/jsimd_x86_64.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/*
- * jsimd_x86_64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = JSIMD_SSE2 | JSIMD_SSE;
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_ycc_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_ycc_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_rgb_ycc_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_gray_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_gray_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_gray_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_gray_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_gray_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_gray_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_rgb_gray_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_ycc_extrgb_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_ycc_extbgr_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_ycc_rgb_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                           input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                           input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
-      break;
-    default:
-      sse2fct=jsimd_h2v2_merged_upsample_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
-      break;
-    default:
-      sse2fct=jsimd_h2v1_merged_upsample_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-  jsimd_fdct_float_sse(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-  if (sizeof(FLOAT_MULT_TYPE) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
-      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h
index d2b499f..7ff7e29 100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@ -19,79 +19,79 @@
 ; -- jpeglib.h
 ;
 
-%define _cpp_protection_DCTSIZE DCTSIZE
-%define _cpp_protection_DCTSIZE2 DCTSIZE2
+%define _cpp_protection_DCTSIZE   DCTSIZE
+%define _cpp_protection_DCTSIZE2  DCTSIZE2
 
 ;
 ; -- jmorecfg.h
 ;
 
-%define _cpp_protection_RGB_RED RGB_RED
-%define _cpp_protection_RGB_GREEN RGB_GREEN
-%define _cpp_protection_RGB_BLUE RGB_BLUE
-%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
+%define _cpp_protection_RGB_RED             RGB_RED
+%define _cpp_protection_RGB_GREEN           RGB_GREEN
+%define _cpp_protection_RGB_BLUE            RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE       RGB_PIXELSIZE
 
-%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
-%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
-%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
-%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define _cpp_protection_EXT_RGB_RED         EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN       EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE        EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
 
-%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
-%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
-%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
-%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define _cpp_protection_EXT_RGBX_RED        EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN      EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE       EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE  EXT_RGBX_PIXELSIZE
 
-%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
-%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
-%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
-%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
+%define _cpp_protection_EXT_BGR_RED         EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN       EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE        EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE   EXT_BGR_PIXELSIZE
 
-%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
-%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
-%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
-%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define _cpp_protection_EXT_BGRX_RED        EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN      EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE       EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE  EXT_BGRX_PIXELSIZE
 
-%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
-%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
-%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
-%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define _cpp_protection_EXT_XBGR_RED        EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN      EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE       EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE  EXT_XBGR_PIXELSIZE
 
-%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
-%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
-%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
-%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define _cpp_protection_EXT_XRGB_RED        EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN      EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE       EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
 
-%define RGBX_FILLER_0XFF        1
+%define RGBX_FILLER_0XFF  1
 
 ; Representation of a single sample (pixel element value).
 ; On this SIMD implementation, this must be 'unsigned char'.
 ;
 
-%define JSAMPLE                 byte          ; unsigned char
-%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
+%define JSAMPLE            byte            ; unsigned char
+%define SIZEOF_JSAMPLE     SIZEOF_BYTE     ; sizeof(JSAMPLE)
 
-%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
+%define _cpp_protection_CENTERJSAMPLE  CENTERJSAMPLE
 
 ; Representation of a DCT frequency coefficient.
 ; On this SIMD implementation, this must be 'short'.
 ;
-%define JCOEF                   word          ; short
-%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
+%define JCOEF              word            ; short
+%define SIZEOF_JCOEF       SIZEOF_WORD     ; sizeof(JCOEF)
 
 ; Datatype used for image dimensions.
 ; On this SIMD implementation, this must be 'unsigned int'.
 ;
-%define JDIMENSION              dword         ; unsigned int
-%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
+%define JDIMENSION         dword           ; unsigned int
+%define SIZEOF_JDIMENSION  SIZEOF_DWORD    ; sizeof(JDIMENSION)
 
-%define JSAMPROW                POINTER       ; JSAMPLE *     (jpeglib.h)
-%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
-%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
-%define JCOEFPTR                POINTER       ; JCOEF *       (jpeglib.h)
-%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
-%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
-%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
-%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+%define JSAMPROW           POINTER         ; JSAMPLE *     (jpeglib.h)
+%define JSAMPARRAY         POINTER         ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE         POINTER         ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR           POINTER         ; JCOEF *       (jpeglib.h)
+%define SIZEOF_JSAMPROW    SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY  SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE  SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR    SIZEOF_POINTER  ; sizeof(JCOEFPTR)
 
 ;
 ; -- jdct.h
@@ -101,30 +101,31 @@
 ; the DCT is to be performed in-place in that buffer.
 ; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
 ;
-%define DCTELEM                 word          ; short
-%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
+%define DCTELEM                 word         ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD  ; sizeof(DCTELEM)
 
-%define FAST_FLOAT              FP32            ; float
-%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
+%define FAST_FLOAT              FP32         ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32  ; sizeof(FAST_FLOAT)
 
 ; To maximize parallelism, Type MULTIPLIER is changed to short.
 ;
-%define ISLOW_MULT_TYPE         word          ; must be short
-%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
+%define ISLOW_MULT_TYPE         word         ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD  ; sizeof(ISLOW_MULT_TYPE)
 
-%define IFAST_MULT_TYPE         word          ; must be short
-%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
-%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
+%define IFAST_MULT_TYPE         word         ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD  ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2            ; fractional bits in scale factors
 
-%define FLOAT_MULT_TYPE         FP32          ; must be float
-%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
+%define FLOAT_MULT_TYPE         FP32         ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32  ; sizeof(FLOAT_MULT_TYPE)
 
 ;
 ; -- jsimd.h
 ;
 
-%define _cpp_protection_JSIMD_NONE JSIMD_NONE
-%define _cpp_protection_JSIMD_MMX JSIMD_MMX
-%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
-%define _cpp_protection_JSIMD_SSE JSIMD_SSE
-%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
+%define _cpp_protection_JSIMD_NONE   JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX    JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW  JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE    JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2   JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2   JSIMD_AVX2
diff --git a/simd/jsimdcpu.asm b/simd/jsimdcpu.asm
deleted file mode 100644
index 599083b..0000000
--- a/simd/jsimdcpu.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jsimdcpu.asm - SIMD instruction support check
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Check if the CPU supports SIMD instructions
-;
-; GLOBAL(unsigned int)
-; jpeg_simd_cpu_support (void)
-;
-
-        align   16
-        global  EXTN(jpeg_simd_cpu_support)
-
-EXTN(jpeg_simd_cpu_support):
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-        push    edi
-
-        xor     edi,edi                 ; simd support flag
-
-        pushfd
-        pop     eax
-        mov     edx,eax
-        xor     eax, 1<<21              ; flip ID bit in EFLAGS
-        push    eax
-        popfd
-        pushfd
-        pop     eax
-        xor     eax,edx
-        jz      short .return           ; CPUID is not supported
-
-        ; Check for MMX instruction support
-        xor     eax,eax
-        cpuid
-        test    eax,eax
-        jz      short .return
-
-        xor     eax,eax
-        inc     eax
-        cpuid
-        mov     eax,edx                 ; eax = Standard feature flags
-
-        test    eax, 1<<23              ; bit23:MMX
-        jz      short .no_mmx
-        or      edi, byte JSIMD_MMX
-.no_mmx:
-        test    eax, 1<<25              ; bit25:SSE
-        jz      short .no_sse
-        or      edi, byte JSIMD_SSE
-.no_sse:
-        test    eax, 1<<26              ; bit26:SSE2
-        jz      short .no_sse2
-        or      edi, byte JSIMD_SSE2
-.no_sse2:
-
-        ; Check for 3DNow! instruction support
-        mov     eax, 0x80000000
-        cpuid
-        cmp     eax, 0x80000000
-        jbe     short .return
-
-        mov     eax, 0x80000001
-        cpuid
-        mov     eax,edx                 ; eax = Extended feature flags
-
-        test    eax, 1<<31              ; bit31:3DNow!(vendor independent)
-        jz      short .no_3dnow
-        or      edi, byte JSIMD_3DNOW
-.no_3dnow:
-
-.return:
-        mov     eax,edi
-
-        pop     edi
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc
deleted file mode 100644
index f28db60..0000000
--- a/simd/jsimdext.inc
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; jsimdext.inc - common declarations
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
-;
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-;
-; This software is provided 'as-is', without any express or implied
-; warranty.  In no event will the authors be held liable for any damages
-; arising from the use of this software.
-;
-; Permission is granted to anyone to use this software for any purpose,
-; including commercial applications, and to alter it and redistribute it
-; freely, subject to the following restrictions:
-;
-; 1. The origin of this software must not be misrepresented; you must not
-;    claim that you wrote the original software. If you use this software
-;    in a product, an acknowledgment in the product documentation would be
-;    appreciated but is not required.
-; 2. Altered source versions must be plainly marked as such, and must not be
-;    misrepresented as being the original software.
-; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
-
-; ==========================================================================
-;  System-dependent configurations
-
-%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
-; * Microsoft Visual C++
-; * MinGW (Minimalist GNU for Windows)
-; * CygWin
-; * LCC-Win32
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT    .text  align=16
-%define SEG_CONST   .rdata align=16
-%else
-%define SEG_TEXT    .text  align=16 public use32 class=CODE
-%define SEG_CONST   .rdata align=16 public use32 class=CONST
-%endif
-
-%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
-; * Microsoft Visual C++
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT    .text  align=16
-%define SEG_CONST   .rdata align=16
-%else
-%define SEG_TEXT    .text  align=16 public use64 class=CODE
-%define SEG_CONST   .rdata align=16 public use64 class=CONST
-%endif
-%define EXTN(name)  name                        ; foo() -> foo
-
-%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
-; * Borland C++ (Win32)
-
-; -- segment definition --
-;
-%define SEG_TEXT    _text  align=16 public use32 class=CODE
-%define SEG_CONST   _data  align=16 public use32 class=DATA
-
-%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
-; * Linux
-; * *BSD family Unix using elf format
-; * Unix System V, including Solaris x86, UnixWare and SCO Unix
-
-; mark stack as non-executable
-section .note.GNU-stack noalloc noexec nowrite progbits
-
-; -- segment definition --
-;
-%ifdef __x86_64__
-%define SEG_TEXT    .text   progbits align=16
-%define SEG_CONST   .rodata progbits align=16
-%else
-%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
-%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
-%endif
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
-%define EXTN(name)  name                        ; foo() -> foo
-
-%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
-; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
-; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text
-%define SEG_CONST   .data
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC
-
-%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
-; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
-%define SEG_CONST   .rodata align=16
-
-; The generation of position-independent code (PIC) is the default on Darwin.
-;
-%define PIC
-%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
-
-%else           ; ----(Other case)----------------------
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text
-%define SEG_CONST   .data
-
-%endif  ; ----------------------------------------------
-
-; ==========================================================================
-
-; --------------------------------------------------------------------------
-;  Common types
-;
-%ifdef __x86_64__
-%define POINTER                 qword           ; general pointer type
-%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
-%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%else
-%define POINTER                 dword           ; general pointer type
-%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
-%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%endif
-
-%define INT                     dword           ; signed integer type
-%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
-%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
-
-%define FP32                    dword           ; IEEE754 single
-%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
-%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
-
-%define MMWORD                  qword           ; int64  (MMX register)
-%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
-%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
-
-; NASM is buggy and doesn't properly handle operand sizes for SSE
-; instructions, so for now we have to define XMMWORD as blank.
-%define XMMWORD                                 ; int128 (SSE register)
-%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
-%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
-
-; Similar hacks for when we load a dword or MMWORD into an xmm# register
-%define XMM_DWORD
-%define XMM_MMWORD
-
-%define SIZEOF_BYTE             1               ; sizeof(BYTE)
-%define SIZEOF_WORD             2               ; sizeof(WORD)
-%define SIZEOF_DWORD            4               ; sizeof(DWORD)
-%define SIZEOF_QWORD            8               ; sizeof(QWORD)
-%define SIZEOF_OWORD            16              ; sizeof(OWORD)
-
-%define BYTE_BIT                8               ; CHAR_BIT in C
-%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
-
-; --------------------------------------------------------------------------
-;  External Symbol Name
-;
-%ifndef EXTN
-%define EXTN(name)   _ %+ name          ; foo() -> _foo
-%endif
-
-; --------------------------------------------------------------------------
-;  Macros for position-independent code (PIC) support
-;
-%ifndef GOT_SYMBOL
-%undef PIC
-%endif
-
-%ifdef PIC ; -------------------------------------------
-
-%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
-
-; At present, nasm doesn't seem to support PIC generation for Mach-O.
-; The PIC support code below is a little tricky.
-
-        SECTION SEG_CONST
-const_base:
-
-%define GOTOFF(got,sym) (got) + (sym) - const_base
-
-%imacro get_GOT 1
-        ; NOTE: this macro destroys ecx resister.
-        call    %%geteip
-        add     ecx, byte (%%ref - $)
-        jmp     short %%adjust
-%%geteip:
-        mov     ecx, POINTER [esp]
-        ret
-%%adjust:
-        push    ebp
-        xor     ebp,ebp         ; ebp = 0
-%ifidni %1,ebx  ; (%1 == ebx)
-        ; db 0x8D,0x9C + jmp near const_base =
-        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
-        db      0x8D,0x9C               ; 8D,9C
-        jmp     near const_base         ; E9,(const_base-%%ref)
-%%ref:
-%else  ; (%1 != ebx)
-        ; db 0x8D,0x8C + jmp near const_base =
-        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
-        db      0x8D,0x8C               ; 8D,8C
-        jmp     near const_base         ; E9,(const_base-%%ref)
-%%ref:  mov     %1, ecx
-%endif ; (%1 == ebx)
-        pop     ebp
-%endmacro
-
-%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------
-
-%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
-
-%imacro get_GOT 1
-        extern  GOT_SYMBOL
-        call    %%geteip
-        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
-        jmp     short %%done
-%%geteip:
-        mov     %1, POINTER [esp]
-        ret
-%%done:
-%endmacro
-
-%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------
-
-%imacro pushpic 1.nolist
-        push    %1
-%endmacro
-%imacro poppic  1.nolist
-        pop     %1
-%endmacro
-%imacro movpic  2.nolist
-        mov     %1,%2
-%endmacro
-
-%else   ; !PIC -----------------------------------------
-
-%define GOTOFF(got,sym) (sym)
-
-%imacro get_GOT 1.nolist
-%endmacro
-%imacro pushpic 1.nolist
-%endmacro
-%imacro poppic  1.nolist
-%endmacro
-%imacro movpic  2.nolist
-%endmacro
-
-%endif  ;  PIC -----------------------------------------
-
-; --------------------------------------------------------------------------
-;  Align the next instruction on {2,4,8,16,..}-byte boundary.
-;  ".balign n,,m" in GNU as
-;
-%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
-%define FILLB(b,n)  (($$-(b)) & ((n)-1))
-
-%imacro alignx 1-2.nolist 0xFFFF
-%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
-               db 0x90                               ; nop
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
-               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
-               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
-               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
-               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
-               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
-               db 0x8B,0xED                          ; mov ebp,ebp
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
-               db 0x90                               ; nop
-%endmacro
-
-; Align the next data on {2,4,8,16,..}-byte boundary.
-;
-%imacro alignz 1.nolist
-        align %1, db 0          ; filling zeros
-%endmacro
-
-%ifdef __x86_64__
-
-%ifdef WIN64
-
-%imacro collect_args 0
-        push r12
-        push r13
-        push r14
-        push r15
-        mov r10, rcx
-        mov r11, rdx
-        mov r12, r8
-        mov r13, r9
-        mov r14, [rax+48]
-        mov r15, [rax+56]
-        push rsi
-        push rdi
-        sub     rsp, SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp], xmm6
-        sub     rsp, SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp], xmm7
-%endmacro
-
-%imacro uncollect_args 0
-        movaps  xmm7, XMMWORD [rsp]
-        add     rsp, SIZEOF_XMMWORD
-        movaps  xmm6, XMMWORD [rsp]
-        add     rsp, SIZEOF_XMMWORD
-        pop rdi
-        pop rsi
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-%endmacro
-
-%else
-
-%imacro collect_args 0
-        push r10
-        push r11
-        push r12
-        push r13
-        push r14
-        push r15
-        mov r10, rdi
-        mov r11, rsi
-        mov r12, rdx
-        mov r13, rcx
-        mov r14, r8
-        mov r15, r9
-%endmacro
-
-%imacro uncollect_args 0
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-        pop r11
-        pop r10
-%endmacro
-
-%endif
-
-%endif
-
-; --------------------------------------------------------------------------
-;  Defines picked up from the C headers
-;
-%include "jsimdcfg.inc"
-
-; --------------------------------------------------------------------------
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
new file mode 100644
index 0000000..8aeab52
--- /dev/null
+++ b/simd/loongson/jccolext-mmi.c
@@ -0,0 +1,469 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0  /* mmA/mmB: RED==0 -> mm0/1, GREEN==0 -> mm2/3, BLUE==0 -> mm4/5 */
+#define  mmA  mm0
+#define  mmB  mm1
+#elif RGB_GREEN == 0
+#define  mmA  mm2
+#define  mmB  mm3
+#elif RGB_BLUE == 0
+#define  mmA  mm4
+#define  mmB  mm5
+#else             /* none of the three RGB_* macros is 0 -> mm6/mm7 */
+#define  mmA  mm6
+#define  mmB  mm7
+#endif
+/* mmC/mmD: same selection, keyed on which RGB_* macro equals 1. */
+#if RGB_RED == 1
+#define  mmC  mm0
+#define  mmD  mm1
+#elif RGB_GREEN == 1
+#define  mmC  mm2
+#define  mmD  mm3
+#elif RGB_BLUE == 1
+#define  mmC  mm4
+#define  mmD  mm5
+#else
+#define  mmC  mm6
+#define  mmD  mm7
+#endif
+/* mmE/mmF: same selection, keyed on which RGB_* macro equals 2. */
+#if RGB_RED == 2
+#define  mmE  mm0
+#define  mmF  mm1
+#elif RGB_GREEN == 2
+#define  mmE  mm2
+#define  mmF  mm3
+#elif RGB_BLUE == 2
+#define  mmE  mm4
+#define  mmF  mm5
+#else
+#define  mmE  mm6
+#define  mmF  mm7
+#endif
+/* mmG/mmH: same selection, keyed on which RGB_* macro equals 3. */
+#if RGB_RED == 3
+#define  mmG  mm0
+#define  mmH  mm1
+#elif RGB_GREEN == 3
+#define  mmG  mm2
+#define  mmH  mm3
+#elif RGB_BLUE == 3
+#define  mmG  mm4
+#define  mmH  mm5
+#else
+#define  mmG  mm6
+#define  mmH  mm7
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+  __m64 wk[8];  /* 8 slots: wk[0]..wk[7] are all written and read below */
+  __m64 Y_BG, Cb_RG, Cr_BG;
+  /* RGB -> YCC: one input row per outer pass, 8 pixels per inner pass. */
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {  /* short tail: piecewise loads done in asm */
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li     $8, 1\r\n"
+            "move   $9, %3\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 1f\r\n"
+            "nop    \r\n"
+            "subu   $9, $9, 1\r\n"
+            "xor    $12, $12, $12\r\n"
+            "move   $13, %5\r\n"
+            "dadd   $13, $13, $9\r\n"
+            "lbu    $12, 0($13)\r\n"
+
+            "1:     \r\n"
+            "li     $8, 2\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 2f\r\n"
+            "nop    \r\n"
+            "subu   $9, $9, 2\r\n"
+            "xor    $11, $11, $11\r\n"
+            "move   $13, %5\r\n"
+            "dadd   $13, $13, $9\r\n"
+            "lhu    $11, 0($13)\r\n"
+            "sll    $12, $12, 16\r\n"
+            "or     $12, $12, $11\r\n"
+
+            "2:     \r\n"
+            "dmtc1  $12, %0\r\n"
+            "li     $8, 4\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 3f\r\n"
+            "nop    \r\n"
+            "subu   $9, $9, 4\r\n"
+            "move   $13, %5\r\n"
+            "dadd   $13, $13, $9\r\n"
+            "lwu    $14, 0($13)\r\n"
+            "dmtc1  $14, %1\r\n"
+            "dsll32 $12, $12, 0\r\n"
+            "or     $12, $12, $14\r\n"
+            "dmtc1  $12, %0\r\n"
+
+            "3:     \r\n"
+            "li     $8, 8\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 4f\r\n"
+            "nop    \r\n"
+            "mov.s  %1, %0\r\n"
+            "ldc1   %0, 0(%5)\r\n"
+            "li     $9, 8\r\n"
+            "j      5f\r\n"
+            "nop    \r\n"
+
+            "4:     \r\n"
+            "li     $8, 16\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 5f\r\n"
+            "nop    \r\n"
+            "mov.s  %2, %0\r\n"
+            "ldc1   %0, 0(%5)\r\n"
+            "ldc1   %1, 8(%5)\r\n"
+
+            "5:     \r\n"
+            "nop    \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        mmA = _mm_load_si64((__m64 *)&inptr[0]);
+        mmG = _mm_load_si64((__m64 *)&inptr[8]);
+        mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = mmA;
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+      mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = mmA;
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+      mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = mmA;
+      mmA = _mm_loadlo_pi8_f(mmA);
+      mmC = _mm_loadhi_pi8_f(mmC);
+
+      mmB = mmE;
+      mmE = _mm_loadlo_pi8_f(mmE);
+      mmB = _mm_loadhi_pi8_f(mmB);
+
+      mmF = mmD;
+      mmD = _mm_loadlo_pi8_f(mmD);
+      mmF = _mm_loadhi_pi8_f(mmF);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {  /* short tail: piecewise loads done in asm */
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li     $8, 1\r\n"
+            "move   $9, %4\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 1f\r\n"
+            "nop    \r\n"
+            "subu   $9, $9, 1\r\n"
+            "dsll   $11, $9, 2\r\n"
+            "move   $13, %5\r\n"
+            "daddu  $13, $13, $11\r\n"
+            "lwc1   %0, 0($13)\r\n"
+
+            "1:     \r\n"
+            "li     $8, 2\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 2f\r\n"
+            "nop    \r\n"
+            "subu   $9, $9, 2\r\n"
+            "dsll   $11, $9, 2\r\n"
+            "move   $13, %5\r\n"
+            "daddu  $13, $13, $11\r\n"
+            "mov.s  %1, %0\r\n"
+            "ldc1   %0, 0($13)\r\n"
+
+            "2:     \r\n"
+            "li     $8, 4\r\n"
+            "and    $10, $9, $8\r\n"
+            "beqz   $10, 3f\r\n"
+            "nop    \r\n"
+            "mov.s  %2, %0\r\n"
+            "mov.s  %3, %1\r\n"
+            "ldc1   %0, 0(%5)\r\n"
+            "ldc1   %1, 8(%5)\r\n"
+
+            "3:     \r\n"
+            "nop    \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        mmA = _mm_load_si64((__m64 *)&inptr[0]);
+        mmF = _mm_load_si64((__m64 *)&inptr[8]);
+        mmD = _mm_load_si64((__m64 *)&inptr[16]);
+        mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = mmA;
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+      mmB = _mm_unpackhi_pi8(mmB, mmF);
+
+      mmG = mmD;
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+      mmG = _mm_unpackhi_pi8(mmG, mmC);
+
+      mmE = mmA;
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+      mmE = _mm_unpackhi_pi16(mmE, mmD);
+
+      mmH = mmB;
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+      mmH = _mm_unpackhi_pi16(mmH, mmG);
+
+      mmC = mmA;
+      mmA = _mm_loadlo_pi8_f(mmA);
+      mmC = _mm_loadhi_pi8_f(mmC);
+
+      mmD = mmB;
+      mmB = _mm_loadlo_pi8_f(mmB);
+      mmD = _mm_loadhi_pi8_f(mmD);
+
+      mmG = mmE;
+      mmE = _mm_loadlo_pi8_f(mmE);
+      mmG = _mm_loadhi_pi8_f(mmG);
+
+      mmF = mmH;
+      mmF = _mm_unpacklo_pi8(mmF, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+      /* Save mm0, mm1, mm4, mm5; they are reloaded from wk[] further down. */
+      wk[0] = mm0;
+      wk[1] = mm1;
+      wk[2] = mm4;
+      wk[3] = mm5;
+
+      mm6 = mm1;
+      mm1 = _mm_unpacklo_pi16(mm1, mm3);
+      mm6 = _mm_unpackhi_pi16(mm6, mm3);
+      mm7 = mm1;
+      mm4 = mm6;
+      mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
+      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
+      mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
+      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
+
+      wk[4] = mm1;
+      wk[5] = mm6;
+
+      mm1 = _mm_loadlo_pi16_f(mm5);
+      mm6 = _mm_loadhi_pi16_f(mm5);
+      mm1 = _mm_srli_pi32(mm1, 1);
+      mm6 = _mm_srli_pi32(mm6, 1);
+
+      mm5 = PD_ONEHALFM1_CJ;
+      mm7 = _mm_add_pi32(mm7, mm1);
+      mm4 = _mm_add_pi32(mm4, mm6);
+      mm7 = _mm_add_pi32(mm7, mm5);
+      mm4 = _mm_add_pi32(mm4, mm5);
+      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
+      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
+      mm7 = _mm_packs_pi32(mm7, mm4);
+
+      mm1 = wk[2];
+      mm6 = mm0;
+      mm0 = _mm_unpacklo_pi16(mm0, mm2);
+      mm6 = _mm_unpackhi_pi16(mm6, mm2);
+      mm5 = mm0;
+      mm4 = mm6;
+      mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
+      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
+      mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
+      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
+
+      wk[6] = mm0;
+      wk[7] = mm6;
+      mm0 = _mm_loadlo_pi16_f(mm1);
+      mm6 = _mm_loadhi_pi16_f(mm1);
+      mm0 = _mm_srli_pi32(mm0, 1);
+      mm6 = _mm_srli_pi32(mm6, 1);
+
+      mm1 = PD_ONEHALFM1_CJ;
+      mm5 = _mm_add_pi32(mm5, mm0);
+      mm4 = _mm_add_pi32(mm4, mm6);
+      mm5 = _mm_add_pi32(mm5, mm1);
+      mm4 = _mm_add_pi32(mm4, mm1);
+      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
+      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
+      mm5 = _mm_packs_pi32(mm5, mm4);
+
+      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
+      mm5  = _mm_or_si64(mm5, mm7);
+      Cb_RG = mm5;
+
+      mm0 = wk[3];
+      mm6 = wk[2];
+      mm1 = wk[1];
+
+      mm4 = mm0;
+      mm0 = _mm_unpacklo_pi16(mm0, mm3);
+      mm4 = _mm_unpackhi_pi16(mm4, mm3);
+      mm7 = mm0;
+      mm5 = mm4;
+      mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
+      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
+      mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
+      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
+
+      mm3 = PD_ONEHALF;
+      mm0 = _mm_add_pi32(mm0, wk[4]);
+      mm4 = _mm_add_pi32(mm4, wk[5]);
+      mm0 = _mm_add_pi32(mm0, mm3);
+      mm4 = _mm_add_pi32(mm4, mm3);
+      mm0 = _mm_srli_pi32(mm0, SCALEBITS);
+      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
+      mm0 = _mm_packs_pi32(mm0, mm4);
+
+      mm3 = _mm_loadlo_pi16_f(mm1);
+      mm4 = _mm_loadhi_pi16_f(mm1);
+      mm3 = _mm_srli_pi32(mm3, 1);
+      mm4 = _mm_srli_pi32(mm4, 1);
+
+      mm1 = PD_ONEHALFM1_CJ;
+      mm7 = _mm_add_pi32(mm7, mm3);
+      mm5 = _mm_add_pi32(mm5, mm4);
+      mm7 = _mm_add_pi32(mm7, mm1);
+      mm5 = _mm_add_pi32(mm5, mm1);
+      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
+      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
+      mm7 = _mm_packs_pi32(mm7, mm5);
+
+      mm3 = wk[0];
+      mm4 = mm6;
+      mm6 = _mm_unpacklo_pi16(mm6, mm2);
+      mm4 = _mm_unpackhi_pi16(mm4, mm2);
+      mm1 = mm6;
+      mm5 = mm4;
+      mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
+      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
+      mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
+      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
+
+      mm2 = PD_ONEHALF;
+      mm6 = _mm_add_pi32(mm6, wk[6]);
+      mm4 = _mm_add_pi32(mm4, wk[7]);
+      mm6 = _mm_add_pi32(mm6, mm2);
+      mm4 = _mm_add_pi32(mm4, mm2);
+      mm6 = _mm_srli_pi32(mm6, SCALEBITS);
+      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
+      mm6 = _mm_packs_pi32(mm6, mm4);
+
+      mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
+      mm6 = _mm_or_si64(mm6, mm0);
+      Y_BG = mm6;
+
+      mm2 = _mm_loadlo_pi16_f(mm3);
+      mm4 = _mm_loadhi_pi16_f(mm3);
+      mm2 = _mm_srli_pi32(mm2, 1);
+      mm4 = _mm_srli_pi32(mm4, 1);
+
+      mm0 = PD_ONEHALFM1_CJ;
+      mm1 = _mm_add_pi32(mm1, mm2);
+      mm5 = _mm_add_pi32(mm5, mm4);
+      mm1 = _mm_add_pi32(mm1, mm0);
+      mm5 = _mm_add_pi32(mm5, mm0);
+      mm1 = _mm_srli_pi32(mm1, SCALEBITS);
+      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
+      mm1 = _mm_packs_pi32(mm1, mm5);
+
+      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
+      mm1 = _mm_or_si64(mm1, mm7);
+      Cr_BG = mm1;
+      /* Emit one 64-bit result to each of the three output rows. */
+      _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
+      _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
+      _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/loongson/jccolor-mmi.c b/simd/loongson/jccolor-mmi.c
new file mode 100644
index 0000000..3fc4b1e
--- /dev/null
+++ b/simd/loongson/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081 ((short) 5329)                /* FIX(0.08131) */
+#define F_0_114 ((short) 7471)                /* FIX(0.11400) */
+#define F_0_168 ((short)11059)                /* FIX(0.16874) */
+#define F_0_250 ((short)16384)                /* FIX(0.25000) */
+#define F_0_299 ((short)19595)                /* FIX(0.29900) */
+#define F_0_331 ((short)21709)                /* FIX(0.33126) */
+#define F_0_418 ((short)27439)                /* FIX(0.41869) */
+#define F_0_587 ((short)38470)                /* FIX(0.58700) */
+#define F_0_337 ((short)(F_0_587 - F_0_250))  /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {  /* subscripts into const_value[]; keep both in step */
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250,
+  index_PW_MF016_MF033,
+  index_PW_MF008_MF041,
+  index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {  /* rows follow enum const_index order */
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),      /* PW_F0299_F0337 */
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),      /* PW_F0114_F0250 */
+  _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),  /* PW_MF016_MF033 */
+  _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),  /* PW_MF008_MF041 */
+  _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+                   ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF      get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337  get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250  get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033  get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041  get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c
new file mode 100644
index 0000000..2f2d851
--- /dev/null
+++ b/simd/loongson/jcsample-mmi.c
@@ -0,0 +1,100 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol, bias;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
+  /* Averages 2x2 input patches: two input rows feed each output row. */
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = (1 << 17) + 1;                      /* 0x00020001 (bias pattern) */
+  mm7 = _mm_set1_pi32(bias);                 /* mm7={1, 2, 1, 2} */
+  mm6 = _mm_cmpeq_pi16(mm6, mm6);            /* self-compare: all lanes true */
+  mm6 = _mm_srli_pi16(mm6, BYTE_BIT);        /* mm6={0xFF 0x00 0xFF 0x00 ..} */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+    /* Two consecutive input rows are combined into one output row. */
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+      /* Fetch 16 bytes from each of the two input rows. */
+      mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
+      mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
+      /* Bytes 0..7 of each row: add the two bytes of every 16-bit lane. */
+      mm4 = mm0;
+      mm5 = mm1;
+      mm0 = _mm_and_si64(mm0, mm6);
+      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
+      mm1 = _mm_and_si64(mm1, mm6);
+      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
+      mm0 = _mm_add_pi16(mm0, mm4);
+      mm1 = _mm_add_pi16(mm1, mm5);
+      /* Same pairwise byte sums for bytes 8..15 of each row. */
+      mm4 = mm2;
+      mm5 = mm3;
+      mm2 = _mm_and_si64(mm2, mm6);
+      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
+      mm3 = _mm_and_si64(mm3, mm6);
+      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
+      mm2 = _mm_add_pi16(mm2, mm4);
+      mm3 = _mm_add_pi16(mm3, mm5);
+      /* Add the two rows, add the bias, then shift the 4-sample sum by 2. */
+      mm0 = _mm_add_pi16(mm0, mm1);
+      mm2 = _mm_add_pi16(mm2, mm3);
+      mm0 = _mm_add_pi16(mm0, mm7);
+      mm2 = _mm_add_pi16(mm2, mm7);
+      mm0 = _mm_srli_pi16(mm0, 2);
+      mm2 = _mm_srli_pi16(mm2, 2);
+
+      mm0 = _mm_packs_pu16(mm0, mm2);
+      /* Write the 8 output samples produced by this iteration. */
+      _mm_store_si64((__m64 *)&outptr[0], mm0);
+    }
+  }
+}
diff --git a/simd/jcsample.h b/simd/loongson/jcsample.h
similarity index 76%
rename from simd/jcsample.h
rename to simd/loongson/jcsample.h
index 2a50544..2ac4816 100644
--- a/simd/jcsample.h
+++ b/simd/loongson/jcsample.h
@@ -8,14 +8,14 @@
  */
 
 LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
-                   JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
   register int count;
   int row;
-  int numcols = (int) (output_cols - input_cols);
+  int numcols = (int)(output_cols - input_cols);
 
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c
new file mode 100644
index 0000000..ca0dc2d
--- /dev/null
+++ b/simd/loongson/jdcolext-mmi.c
@@ -0,0 +1,424 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0  /* mmA/mmB: RED==0 -> mm0/1, GREEN==0 -> mm2/3, BLUE==0 -> mm4/5 */
+#define  mmA  mm0
+#define  mmB  mm1
+#elif RGB_GREEN == 0
+#define  mmA  mm2
+#define  mmB  mm3
+#elif RGB_BLUE == 0
+#define  mmA  mm4
+#define  mmB  mm5
+#else             /* none of the three RGB_* macros is 0 -> mm6/mm7 */
+#define  mmA  mm6
+#define  mmB  mm7
+#endif
+/* mmC/mmD: same selection, keyed on which RGB_* macro equals 1. */
+#if RGB_RED == 1
+#define  mmC  mm0
+#define  mmD  mm1
+#elif RGB_GREEN == 1
+#define  mmC  mm2
+#define  mmD  mm3
+#elif RGB_BLUE == 1
+#define  mmC  mm4
+#define  mmD  mm5
+#else
+#define  mmC  mm6
+#define  mmD  mm7
+#endif
+/* mmE/mmF: same selection, keyed on which RGB_* macro equals 2. */
+#if RGB_RED == 2
+#define  mmE  mm0
+#define  mmF  mm1
+#elif RGB_GREEN == 2
+#define  mmE  mm2
+#define  mmF  mm3
+#elif RGB_BLUE == 2
+#define  mmE  mm4
+#define  mmF  mm5
+#else
+#define  mmE  mm6
+#define  mmF  mm7
+#endif
+/* mmG/mmH: same selection, keyed on which RGB_* macro equals 3. */
+#if RGB_RED == 3
+#define  mmG  mm0
+#define  mmH  mm1
+#elif RGB_GREEN == 3
+#define  mmG  mm2
+#define  mmH  mm3
+#elif RGB_BLUE == 3
+#define  mmG  mm4
+#define  mmH  mm5
+#else
+#define  mmG  mm6
+#define  mmH  mm7
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+  __m64 mm8, wk[2];
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      mm5 = _mm_load_si64((__m64 *)inptr1);
+      mm1 = _mm_load_si64((__m64 *)inptr2);
+      mm8 = _mm_load_si64((__m64 *)inptr0);
+      mm4 = 0;
+      mm7 = 0;
+      mm4 = _mm_cmpeq_pi16(mm4, mm4);
+      mm7 = _mm_cmpeq_pi16(mm7, mm7);
+      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
+      mm7 = _mm_slli_pi16(mm7, 7);      /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
+      mm0 = mm4;                        /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
+
+      mm4 = _mm_and_si64(mm4, mm5);           /* mm4=Cb(0246)=CbE */
+      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);     /* mm5=Cb(1357)=CbO */
+      mm0 = _mm_and_si64(mm0, mm1);           /* mm0=Cr(0246)=CrE */
+      mm1 = _mm_srli_pi16(mm1, BYTE_BIT);     /* mm1=Cr(1357)=CrO */
+      mm4 = _mm_add_pi16(mm4, mm7);
+      mm5 = _mm_add_pi16(mm5, mm7);
+      mm0 = _mm_add_pi16(mm0, mm7);
+      mm1 = _mm_add_pi16(mm1, mm7);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+
+      mm2 = mm4;                              /* mm2 = CbE */
+      mm3 = mm5;                              /* mm3 = CbO */
+      mm4 = _mm_add_pi16(mm4, mm4);           /* mm4 = 2*CbE */
+      mm5 = _mm_add_pi16(mm5, mm5);           /* mm5 = 2*CbO */
+      mm6 = mm0;                              /* mm6 = CrE */
+      mm7 = mm1;                              /* mm7 = CrO */
+      mm0 = _mm_add_pi16(mm0, mm0);           /* mm0 = 2*CrE */
+      mm1 = _mm_add_pi16(mm1, mm1);           /* mm1 = 2*CrO */
+
+      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);   /* mm4=(2*CbE * -FIX(0.22800) */
+      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);   /* mm5=(2*CbO * -FIX(0.22800) */
+      mm0 = _mm_mulhi_pi16(mm0, PW_F0402);    /* mm0=(2*CrE * FIX(0.40200)) */
+      mm1 = _mm_mulhi_pi16(mm1, PW_F0402);    /* mm1=(2*CrO * FIX(0.40200)) */
+
+      mm4 = _mm_add_pi16(mm4, PW_ONE);
+      mm5 = _mm_add_pi16(mm5, PW_ONE);
+      mm4 = _mm_srai_pi16(mm4, 1);            /* mm4=(CbE * -FIX(0.22800)) */
+      mm5 = _mm_srai_pi16(mm5, 1);            /* mm5=(CbO * -FIX(0.22800)) */
+      mm0 = _mm_add_pi16(mm0, PW_ONE);
+      mm1 = _mm_add_pi16(mm1, PW_ONE);
+      mm0 = _mm_srai_pi16(mm0, 1);            /* mm0=(CrE * FIX(0.40200)) */
+      mm1 = _mm_srai_pi16(mm1, 1);            /* mm1=(CrO * FIX(0.40200)) */
+
+      mm4 = _mm_add_pi16(mm4, mm2);
+      mm5 = _mm_add_pi16(mm5, mm3);
+      mm4 = _mm_add_pi16(mm4, mm2);       /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
+      mm5 = _mm_add_pi16(mm5, mm3);       /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
+      mm0 = _mm_add_pi16(mm0, mm6);       /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
+      mm1 = _mm_add_pi16(mm1, mm7);       /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
+
+      wk[0] = mm4;                            /* wk(0)=(B-Y)E */
+      wk[1] = mm5;                            /* wk(1)=(B-Y)O */
+
+      mm4 = mm2;
+      mm5 = mm3;
+      mm2 = _mm_unpacklo_pi16(mm2, mm6);
+      mm4 = _mm_unpackhi_pi16(mm4, mm6);
+      mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
+      mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
+      mm3 = _mm_unpacklo_pi16(mm3, mm7);
+      mm5 = _mm_unpackhi_pi16(mm5, mm7);
+      mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
+      mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
+
+      mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
+      mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
+      mm2 = _mm_srai_pi32(mm2, SCALEBITS);
+      mm4 = _mm_srai_pi32(mm4, SCALEBITS);
+      mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
+      mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
+      mm3 = _mm_srai_pi32(mm3, SCALEBITS);
+      mm5 = _mm_srai_pi32(mm5, SCALEBITS);
+
+      mm2 = _mm_packs_pi32(mm2, mm4);  /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      mm3 = _mm_packs_pi32(mm3, mm5);  /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      mm2 = _mm_sub_pi16(mm2, mm6);  /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      mm3 = _mm_sub_pi16(mm3, mm7);  /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      mm5 = mm8;                              /* mm5=Y(01234567) */
+
+      mm4 = _mm_cmpeq_pi16(mm4, mm4);
+      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);    /* mm4={0xFF 0x00 0xFF 0x00 ..} */
+      mm4 = _mm_and_si64(mm4, mm5);          /* mm4=Y(0246)=YE */
+      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);    /* mm5=Y(1357)=YO */
+
+      mm0 = _mm_add_pi16(mm0, mm4);      /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
+      mm1 = _mm_add_pi16(mm1, mm5);      /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
+      mm0 = _mm_packs_pu16(mm0, mm0);    /* mm0=(R0 R2 R4 R6 ** ** ** **) */
+      mm1 = _mm_packs_pu16(mm1, mm1);    /* mm1=(R1 R3 R5 R7 ** ** ** **) */
+
+      mm2 = _mm_add_pi16(mm2, mm4);      /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
+      mm3 = _mm_add_pi16(mm3, mm5);      /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
+      mm2 = _mm_packs_pu16(mm2, mm2);    /* mm2=(G0 G2 G4 G6 ** ** ** **) */
+      mm3 = _mm_packs_pu16(mm3, mm3);    /* mm3=(G1 G3 G5 G7 ** ** ** **) */
+
+      mm4 = _mm_add_pi16(mm4, wk[0]);    /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
+      mm5 = _mm_add_pi16(mm5, wk[1]);    /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
+      mm4 = _mm_packs_pu16(mm4, mm4);    /* mm4=(B0 B2 B4 B6 ** ** ** **) */
+      mm5 = _mm_packs_pu16(mm5, mm5);    /* mm5=(B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);     /* mmE=(20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);     /* mmD=(11 21 13 23 15 25 17 27) */
+
+      mmG = mmA;
+      mmH = mmA;
+      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 01 02 12 22 03) */
+      mmG = _mm_unpackhi_pi16(mmG, mmE);    /* mmG=(04 14 24 05 06 16 26 07) */
+
+      mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+
+      mmC = mmD;
+      mmB = mmD;
+      mmD = _mm_unpacklo_pi16(mmD, mmH);    /* mmD=(11 21 02 12 13 23 04 14) */
+      mmC = _mm_unpackhi_pi16(mmC, mmH);    /* mmC=(15 25 06 16 17 27 -- --) */
+
+      mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
+
+      mmF = mmE;
+      mmE = _mm_unpacklo_pi16(mmE, mmB);    /* mmE=(22 03 13 23 24 05 15 25) */
+      mmF = _mm_unpackhi_pi16(mmF, mmB);    /* mmF=(26 07 17 27 -- -- -- --) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);    /* mmA=(00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);    /* mmE=(22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);    /* mmC=(15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmE);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li      $8, 16\r\n"
+            "move    $9, %4\r\n"
+            "mov.s   $f4, %1\r\n"
+            "mov.s   $f6, %3\r\n"
+            "move    $10, %5\r\n"
+            "bltu    $9, $8, 1f\r\n"
+            "nop     \r\n"
+            "gssdlc1 $f4, 7($10)\r\n"
+            "gssdrc1 $f4, 0($10)\r\n"
+            "gssdlc1 $f6, 7+8($10)\r\n"
+            "gssdrc1 $f6, 8($10)\r\n"
+            "mov.s   $f4, %2\r\n"
+            "subu    $9, $9, 16\r\n"
+            "daddu   $10, $10, 16\r\n"
+            "b       2f\r\n"
+            "nop     \r\n"
+
+            "1:      \r\n"
+            "li      $8, 8\r\n"               /* st8 */
+            "bltu    $9, $8, 2f\r\n"
+            "nop     \r\n"
+            "gssdlc1 $f4, 7($10)\r\n"
+            "gssdrc1 $f4, ($10)\r\n"
+            "mov.s   $f4, %3\r\n"
+            "subu    $9, $9, 8\r\n"
+            "daddu   $10, $10, 8\r\n"
+
+            "2:      \r\n"
+            "li      $8, 4\r\n"               /* st4 */
+            "mfc1    $11, $f4\r\n"
+            "bltu    $9, $8, 3f\r\n"
+            "nop     \r\n"
+            "swl     $11, 3($10)\r\n"
+            "swr     $11, 0($10)\r\n"
+            "li      $8, 32\r\n"
+            "mtc1    $8, $f6\r\n"
+            "dsrl    $f4, $f4, $f6\r\n"
+            "mfc1    $11, $f4\r\n"
+            "subu    $9, $9, 4\r\n"
+            "daddu   $10, $10, 4\r\n"
+
+            "3:      \r\n"
+            "li      $8, 2\r\n"               /* st2 */
+            "bltu    $9, $8, 4f\r\n"
+            "nop     \r\n"
+            "ush     $11, 0($10)\r\n"
+            "srl     $11, 16\r\n"
+            "subu    $9, $9, 2\r\n"
+            "daddu   $10, $10, 2\r\n"
+
+            "4:      \r\n"
+            "li      $8, 1\r\n"               /* st1 */
+            "bltu    $9, $8, 5f\r\n"
+            "nop     \r\n"
+            "sb      $11, 0($10)\r\n"
+
+            "5:      \r\n"
+            "nop     \r\n"                    /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      mm6 = _mm_cmpeq_pi8(mm6, mm6);
+      mm7 = _mm_cmpeq_pi8(mm7, mm7);
+#else
+      mm6 = _mm_xor_si64(mm6, mm6);
+      mm7 = _mm_xor_si64(mm7, mm7);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);     /* mmE=(20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);     /* mmB=(01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);     /* mmF=(21 31 23 33 25 35 27 37) */
+
+      mmC = mmA;
+      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 30 02 12 22 32) */
+      mmC = _mm_unpackhi_pi16(mmC, mmE);    /* mmC=(04 14 24 34 06 16 26 36) */
+      mmG = mmB;
+      mmB = _mm_unpacklo_pi16(mmB, mmF);    /* mmB=(01 11 21 31 03 13 23 33) */
+      mmG = _mm_unpackhi_pi16(mmG, mmF);    /* mmG=(05 15 25 35 07 17 27 37) */
+
+      mmD = mmA;
+      mmA = _mm_unpacklo_pi32(mmA, mmB);    /* mmA=(00 10 20 30 01 11 21 31) */
+      mmD = _mm_unpackhi_pi32(mmD, mmB);    /* mmD=(02 12 22 32 03 13 23 33) */
+      mmH = mmC;
+      mmC = _mm_unpacklo_pi32(mmC, mmG);    /* mmC=(04 14 24 34 05 15 25 35) */
+      mmH = _mm_unpackhi_pi32(mmH, mmG);    /* mmH=(06 16 26 36 07 17 27 37) */
+
+      if (num_cols >= 8) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmD);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li      $8, 4\r\n"
+            "move    $9, %6\r\n"
+            "move    $10, %7\r\n"
+            "mov.s   $f4, %2\r\n"
+            "mov.s   $f6, %4\r\n"
+            "bltu    $9, $8, 1f\r\n"
+            "nop     \r\n"
+            "gssdlc1 $f4, 7($10)\r\n"
+            "gssdrc1 $f4, ($10)\r\n"
+            "gssdlc1 $f6, 7+8($10)\r\n"
+            "gssdrc1 $f6, 8($10)\r\n"
+            "mov.s   $f4, %3\r\n"
+            "mov.s   $f6, %5\r\n"
+            "subu    $9, $9, 4\r\n"
+            "daddu   $10, $10, 16\r\n"
+
+            "1:      \r\n"
+            "li      $8, 2\r\n"               /* st8 */
+            "bltu    $9, $8, 2f\r\n"
+            "nop     \r\n"
+            "gssdlc1 $f4, 7($10)\r\n"
+            "gssdrc1 $f4, 0($10)\r\n"
+            "mov.s   $f4, $f6\r\n"
+            "subu    $9, $9, 2\r\n"
+            "daddu   $10, $10, 8\r\n"
+
+            "2:      \r\n"
+            "li      $8, 1\r\n"               /* st4 */
+            "bltu    $9, $8, 3f\r\n"
+            "nop     \r\n"
+            "gsswlc1 $f4, 3($10)\r\n"
+            "gsswrc1 $f4, 0($10)\r\n"
+
+            "3:      \r\n"
+            "li      %1, 0\r\n"               /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA  /* release the mmA..mmH names used by the interleave code above */
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/loongson/jdcolor-mmi.c b/simd/loongson/jdcolor-mmi.c
new file mode 100644
index 0000000..75b2fdf
--- /dev/null
+++ b/simd/loongson/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+/* YCC->RGB factors in 16-bit fixed point: FIX(x) = x * 65536, rounded. */
+#define F_0_344 ((short)22554)  /* FIX(0.34414) */
+#define F_0_402 ((short)26345)  /* FIX(1.40200) - FIX(1), so it fits in a short */
+#define F_0_285 ((short)18734)  /* FIX(1) - FIX(0.71414), so it fits in a short */
+#define F_0_228 ((short)14942)  /* FIX(2) - FIX(1.77200), so it fits in a short */
+
+enum const_index {  /* slots of const_value[] below, listed in the same order */
+  index_PW_ONE,            /* four 16-bit words of 1 */
+  index_PW_F0402,          /* four words of F_0_402 */
+  index_PW_MF0228,         /* four words of -F_0_228 */
+  index_PW_MF0344_F0285,   /* alternating -F_0_344 and F_0_285 words */
+  index_PD_ONEHALF         /* two 32-bit words of 1 << (SCALEBITS - 1) */
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),                              /* PW_ONE */
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),      /* PW_F0402 */
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),  /* PW_MF0228 */
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),    /* PW_MF0344_F0285 */
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+/* get_const_value() is defined elsewhere; presumably it reads const_value[]. */
+#define PW_ONE          get_const_value(index_PW_ONE)
+#define PW_F0402        get_const_value(index_PW_F0402)
+#define PW_MF0228       get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285 get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF      get_const_value(index_PD_ONEHALF)  /* rounding term added before >> SCALEBITS */
+
+#define RGBX_FILLER_0XFF 1  /* tested with #ifdef in the RGB_PIXELSIZE == 4 path (all-ones vs. zero) */
+
+/* jdcolext-mmi.c is included once per pixel layout; the #defines select the layout. */
+#include "jdcolext-mmi.c"  /* first pass: RGB_* settings already in effect */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED EXT_RGB_RED
+#define RGB_GREEN EXT_RGB_GREEN
+#define RGB_BLUE EXT_RGB_BLUE
+#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgb_convert_mmi  /* rename applied to the included code */
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_RGBX_RED
+#define RGB_GREEN EXT_RGBX_GREEN
+#define RGB_BLUE EXT_RGBX_BLUE
+#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGR_RED
+#define RGB_GREEN EXT_BGR_GREEN
+#define RGB_BLUE EXT_BGR_BLUE
+#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_BGRX_RED
+#define RGB_GREEN EXT_BGRX_GREEN
+#define RGB_BLUE EXT_BGRX_BLUE
+#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XBGR_RED
+#define RGB_GREEN EXT_XBGR_GREEN
+#define RGB_BLUE EXT_XBGR_BLUE
+#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED EXT_XRGB_RED
+#define RGB_GREEN EXT_XRGB_GREEN
+#define RGB_BLUE EXT_XRGB_BLUE
+#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c
new file mode 100644
index 0000000..1741451
--- /dev/null
+++ b/simd/loongson/jdsample-mmi.c
@@ -0,0 +1,245 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {  /* slots of const_value[] below, listed in the same order */
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(3, 3, 3, 3),  /* PW_THREE: multiplier for the centre sample */
+  _uint64_set_pi16(7, 7, 7, 7),  /* PW_SEVEN: bias added before >> 4 (odd outputs) */
+  _uint64_set_pi16(8, 8, 8, 8),  /* PW_EIGHT: bias added before >> 4 (even outputs) */
+};
+
+#define PW_THREE get_const_value(index_PW_THREE)
+#define PW_SEVEN get_const_value(index_PW_SEVEN)
+#define PW_EIGHT get_const_value(index_PW_EIGHT)
+
+/* Horizontal pass for row r: turns the 8 intermediates at outptr##r into 16 output bytes, in place. */
+#define PROCESS_ROW(r) { \
+  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
+  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
+  /* right neighbours of the low half (1..4), left neighbours of the high half (3..6) */ \
+  mm0 = mm7; \
+  mm4 = mm3; \
+  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                   /* mm0=( 1 2 3 -) */ \
+  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
+  mm5 = mm7; \
+  mm6 = mm3; \
+  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
+  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                   /* mm6=( - 4 5 6) */ \
+  \
+  mm0 = _mm_or_si64(mm0, mm4);                /* mm0=( 1 2 3 4) */ \
+  mm5 = _mm_or_si64(mm5, mm6);                /* mm5=( 3 4 5 6) */ \
+  /* outer neighbours: wk[r] and wk[r + 2] supply the samples from adjacent blocks */ \
+  mm1 = mm7; \
+  mm2 = mm3; \
+  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);     /* mm1=( - 0 1 2) */ \
+  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);     /* mm2=( 5 6 7 -) */ \
+  mm4 = mm3; \
+  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
+  \
+  mm1 = _mm_or_si64(mm1, wk[r]);              /* mm1=(-1 0 1 2) */ \
+  mm2 = _mm_or_si64(mm2, wk[r + 2]);          /* mm2=( 5 6 7 8) */ \
+  \
+  wk[r] = mm4;                                /* sample 7 is the next block's "-1" */ \
+  /* each output = (3 * centre + neighbour + bias) >> 4; bias 8 for even, 7 for odd */ \
+  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
+  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
+  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
+  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
+  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
+  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
+  \
+  mm1 = _mm_add_pi16(mm1, mm7); \
+  mm5 = _mm_add_pi16(mm5, mm3); \
+  mm1 = _mm_srli_pi16(mm1, 4);                /* mm1=OutrLE=( 0  2  4  6) */ \
+  mm5 = _mm_srli_pi16(mm5, 4);                /* mm5=OutrHE=( 8 10 12 14) */ \
+  mm0 = _mm_add_pi16(mm0, mm7); \
+  mm2 = _mm_add_pi16(mm2, mm3); \
+  mm0 = _mm_srli_pi16(mm0, 4);                /* mm0=OutrLO=( 1  3  5  7) */ \
+  mm2 = _mm_srli_pi16(mm2, 4);                /* mm2=OutrHO=( 9 11 13 15) */ \
+  /* interleave: odd outputs move to the high byte of each word, even stay low */ \
+  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
+  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
+  mm1 = _mm_or_si64(mm1, mm0);     /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
+  mm5 = _mm_or_si64(mm5, mm2);     /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##r, mm1); \
+  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
+}
+
+/*
+ * h2v2 "fancy" upsampling: 2:1 horizontally and vertically, 3:1 weighting.
+ * Every input row produces two output rows.  The vertical step forms
+ * row[-1] + 3 * row[0] (upper output row) and row[+1] + 3 * row[0] (lower
+ * output row) as 16-bit intermediates parked in the output buffers;
+ * PROCESS_ROW then applies the same 3:1 weighting horizontally and shifts
+ * right by 4.  Rows input_data[inrow - 1] and [inrow + 1] must be readable;
+ * if downsampled_width is not a multiple of 8, the sample just past the end
+ * of each of the three input rows is overwritten.
+ */
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol;
+  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
+  __m64 wk[4], mm_tmp;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      /* Copy the last sample of each input row into the slot just past the
+       * end, so the final partial block has a replicated right neighbour.
+       * Plain C stores replace the former inline asm, which hard-coded
+       * 64-bit daddu for pointer math and gave the compiler memory
+       * constraints for bytes other than the ones it actually wrote. */
+      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
+      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
+      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
+    }
+
+    /* process the first column block */
+    mm0 = _mm_load_si64((__m64 *)inptr0);     /* mm0 = row[ 0][0] */
+    mm1 = _mm_load_si64((__m64 *)inptr_1);    /* mm1 = row[-1][0] */
+    mm2 = _mm_load_si64((__m64 *)inptr1);     /* mm2 = row[+1][0] */
+
+    mm3 = _mm_xor_si64(mm3, mm3);             /* mm3 = (all 0's) */
+    mm4 = mm0;
+    mm0 = _mm_unpacklo_pi8(mm0, mm3);         /* mm0 = row[ 0][0]( 0 1 2 3) */
+    mm4 = _mm_unpackhi_pi8(mm4, mm3);         /* mm4 = row[ 0][0]( 4 5 6 7) */
+    mm5 = mm1;
+    mm1 = _mm_unpacklo_pi8(mm1, mm3);         /* mm1 = row[-1][0]( 0 1 2 3) */
+    mm5 = _mm_unpackhi_pi8(mm5, mm3);         /* mm5 = row[-1][0]( 4 5 6 7) */
+    mm6 = mm2;
+    mm2 = _mm_unpacklo_pi8(mm2, mm3);         /* mm2 = row[+1][0]( 0 1 2 3) */
+    mm6 = _mm_unpackhi_pi8(mm6, mm3);         /* mm6 = row[+1][0]( 4 5 6 7) */
+
+    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
+    mm4 = _mm_mullo_pi16(mm4, PW_THREE);
+
+    mm7 = _mm_cmpeq_pi8(mm7, mm7);            /* build a low-word-only mask */
+    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+    mm1 = _mm_add_pi16(mm1, mm0);             /* mm1=Int0L=( 0 1 2 3) */
+    mm5 = _mm_add_pi16(mm5, mm4);             /* mm5=Int0H=( 4 5 6 7) */
+    mm2 = _mm_add_pi16(mm2, mm0);             /* mm2=Int1L=( 0 1 2 3) */
+    mm6 = _mm_add_pi16(mm6, mm4);             /* mm6=Int1H=( 4 5 6 7) */
+
+    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
+    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
+    _mm_store_si64((__m64 *)outptr1, mm2);
+    _mm_store_si64((__m64 *)outptr1 + 1, mm6);
+
+    mm1 = _mm_and_si64(mm1, mm7);             /* mm1=( 0 - - -) */
+    mm2 = _mm_and_si64(mm2, mm7);             /* mm2=( 0 - - -) */
+
+    wk[0] = mm1;                /* left edge: sample 0 stands in for sample -1 */
+    wk[1] = mm2;
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0 = row[ 0][1] */
+        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1 = row[-1][1] */
+        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2 = row[+1][1] */
+
+        mm3 = _mm_setzero_si64();             /* mm3 = (all 0's) */
+        mm4 = mm0;
+        mm0 = _mm_unpacklo_pi8(mm0, mm3);     /* mm0 = row[ 0][1]( 0 1 2 3) */
+        mm4 = _mm_unpackhi_pi8(mm4, mm3);     /* mm4 = row[ 0][1]( 4 5 6 7) */
+        mm5 = mm1;
+        mm1 = _mm_unpacklo_pi8(mm1, mm3);     /* mm1 = row[-1][1]( 0 1 2 3) */
+        mm5 = _mm_unpackhi_pi8(mm5, mm3);     /* mm5 = row[-1][1]( 4 5 6 7) */
+        mm6 = mm2;
+        mm2 = _mm_unpacklo_pi8(mm2, mm3);     /* mm2 = row[+1][1]( 0 1 2 3) */
+        mm6 = _mm_unpackhi_pi8(mm6, mm3);     /* mm6 = row[+1][1]( 4 5 6 7) */
+
+        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
+        mm4 = _mm_mullo_pi16(mm4, PW_THREE);
+
+        mm1 = _mm_add_pi16(mm1, mm0);         /* mm1 = Int0L = ( 0 1 2 3) */
+        mm5 = _mm_add_pi16(mm5, mm4);         /* mm5 = Int0H = ( 4 5 6 7) */
+        mm2 = _mm_add_pi16(mm2, mm0);         /* mm2 = Int1L = ( 0 1 2 3) */
+        mm6 = _mm_add_pi16(mm6, mm4);         /* mm6 = Int1H = ( 4 5 6 7) */
+
+        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
+        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
+        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
+        _mm_store_si64((__m64 *)outptr1 + 3, mm6);
+
+        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
+        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
+
+        wk[2] = mm1;            /* next block's sample 0 is this block's "8" */
+        wk[3] = mm2;
+      } else {
+        /* process the last column block */
+        mm1 = _mm_cmpeq_pi8(mm1, mm1);        /* build a high-word-only mask */
+        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+        mm2 = mm1;
+
+        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        mm1 = _mm_and_si64(mm1, mm_tmp);      /* mm1=( - - - 7) */
+        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        mm2 = _mm_and_si64(mm2, mm_tmp);      /* mm2=( - - - 7) */
+
+        wk[2] = mm1;            /* right edge: sample 7 stands in for sample 8 */
+        wk[3] = mm2;
+      }
+
+      /* process the upper row */
+      PROCESS_ROW(0)
+
+      /* process the lower row */
+      PROCESS_ROW(1)
+    }
+  }
+}
diff --git a/simd/loongson/jfdctint-mmi.c b/simd/loongson/jfdctint-mmi.c
new file mode 100644
index 0000000..2cfaeae
--- /dev/null
+++ b/simd/loongson/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+/* Fixed-point setup: FIX(x) = x * 2^CONST_BITS, rounded to the nearest integer. */
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)  /* shift selected by DO_FDCT_COMMON(1) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)  /* shift selected by DO_FDCT_COMMON(2) */
+
+#define FIX_0_298 ((short)2446)   /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196)   /* FIX(0.390180644) */
+#define FIX_0_541 ((short)4433)   /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270)   /* FIX(0.765366865) */
+#define FIX_0_899 ((short)7373)   /* FIX(0.899976223) */
+#define FIX_1_175 ((short)9633)   /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299)  /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137)  /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069)  /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819)  /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995)  /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172)  /* FIX(3.072711026) */
+enum const_index {  /* slots of const_value[] below, listed in the same order */
+  index_PW_F130_F054,    /* 1.306 and 0.541   (out2) */
+  index_PW_F054_MF130,   /* 0.541 and -1.306  (out6) */
+  index_PW_MF078_F117,   /* -0.786 and 1.176  (z3) */
+  index_PW_F117_F078,    /* 1.176 and 0.786   (z4) */
+  index_PW_MF060_MF089,  /* -0.601 and -0.900 (tmp4) */
+  index_PW_MF089_F060,   /* -0.900 and 0.601  (tmp7) */
+  index_PW_MF050_MF256,  /* -0.510 and -2.563 (tmp5) */
+  index_PW_MF256_F050,   /* -2.563 and 0.510  (tmp6) */
+  index_PD_DESCALE_P1,   /* rounding term 1 << (DESCALE_P1 - 1), 32-bit lanes */
+  index_PD_DESCALE_P2,   /* rounding term 1 << (DESCALE_P2 - 1), 32-bit lanes */
+  index_PW_DESCALE_P2X   /* 1 << (PASS1_BITS - 1) in each 16-bit word */
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+                   (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054   get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130  get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117  get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078   get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060  get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050  get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1  get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2  get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X get_const_value(index_PW_DESCALE_P2X)
+
+/* Multiply/descale stage of the forward DCT; PASS (1 or 2) picks DESCALE_P1 or DESCALE_P2. */
+#define DO_FDCT_COMMON(PASS) { \
+  __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+  __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+  __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  /* Reads tmp12, tmp13, tmp4..tmp7; writes out1..out3 and out5..out7 (caller's variables). */ \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * out2 = z1 + tmp13 * 0.765366865; \
+   * out6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+  tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+  \
+  out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+  out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+  out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+  out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_pi16(tmp4, tmp6); \
+  z4 = _mm_add_pi16(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * out7 = tmp4 + z1 + z3;  out5 = tmp5 + z2 + z4; \
+   * out3 = tmp6 + z2 + z3;  out1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * out7 = tmp4 + z3;  out5 = tmp5 + z4; \
+   * out3 = tmp6 + z3;  out1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+  tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+  \
+  tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+  tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+  tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+  tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+  \
+  out7l = _mm_add_pi32(tmp4l, z3l); \
+  out7h = _mm_add_pi32(tmp4h, z3h); \
+  out1l = _mm_add_pi32(tmp7l, z4l); \
+  out1h = _mm_add_pi32(tmp7h, z4h); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  \
+  tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+  tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+  \
+  tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+  tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+  tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+  tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+  \
+  out5l = _mm_add_pi32(tmp5l, z4l); \
+  out5h = _mm_add_pi32(tmp5h, z4h); \
+  out3l = _mm_add_pi32(tmp6l, z3l); \
+  out3h = _mm_add_pi32(tmp6h, z3h); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  __m64 tmp10, tmp11; \
+  /* Row pass on the 4 rows at dataptr; tmpN/outN/dataptr come from the expanding scope. */ \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  /* Even part */ \
+  /* out0/out4 need only adds; they are scaled up by PASS1_BITS with a left shift. */ \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+  out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+  /* NOTE(review): DO_FDCT_COMMON is assumed to consume tmp4-7, tmp12/13 and set out1-3, out5-7. */ \
+  DO_FDCT_COMMON(1) \
+  /* Store transposed: line N of this 4-line group gets outN in its low half, outN+4 in its high half. */ \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  __m64 tmp10, tmp11; \
+  /* Column pass on a 4-element-wide strip of all 8 lines at dataptr; tmpN/outN/dataptr come from the expanding scope. */ \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  /* Add PW_DESCALE_P2X (presumably the rounding bias; defined outside this view), then shift right by PASS1_BITS. */ \
+  out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+  out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(2) \
+  /* Write outN to line N of the strip (4 elements each). */ \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp12, tmp13;
+  DCTELEM *dataptr = data;
+  /* The locals above are referenced by name inside the DO_FDCT_* macros; data is transformed in place. */
+  /* Pass 1: process rows. */
+  /* Each expansion covers 4 rows; advancing by DCTSIZE * 4 selects the next 4. */
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+  /* Each expansion covers a 4-element-wide strip of every line; advancing by 4 selects the next strip. */
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/simd/loongson/jidctint-mmi.c b/simd/loongson/jidctint-mmi.c
new file mode 100644
index 0000000..6f7c33a
--- /dev/null
+++ b/simd/loongson/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS 13  /* fractional bits of the FIX_* constants: FIX(x) = round(x * 2^13) */
+#define PASS1_BITS 2   /* extra scale bits left in pass-1 results (see DESCALE_P1) */
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)      /* = 11: right shift used by DO_IDCT_COMMON(1) */
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)  /* = 18: right shift used by DO_IDCT_COMMON(2) */
+#define CENTERJSAMPLE 128  /* byte replicated in PB_CENTERJSAMP and added to the output samples */
+
+#define FIX_0_298 ((short)2446)  /* FIX(0.298631336) */
+#define FIX_0_390 ((short)3196)  /* FIX(0.390180644) */
+#define FIX_0_899 ((short)7373)  /* FIX(0.899976223) */
+#define FIX_0_541 ((short)4433)  /* FIX(0.541196100) */
+#define FIX_0_765 ((short)6270)  /* FIX(0.765366865) */
+#define FIX_1_175 ((short)9633)  /* FIX(1.175875602) */
+#define FIX_1_501 ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847 ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961 ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053 ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562 ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072 ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {  /* NOTE(review): presumably indexes const_value[] via get_const_value() (defined elsewhere); keep both in the same order */
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,   /* rounding bias for pass 1 */
+  index_PD_DESCALE_P2,   /* rounding bias for pass 2 */
+  index_PB_CENTERJSAMP   /* CENTERJSAMPLE in every byte */
+};
+
+static uint64_t const_value[] = {  /* one packed 64-bit constant per const_index entry, listed in enum order */
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),  /* PW_F130_F054 */
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),  /* PW_F054_MF130 */
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),  /* PW_MF078_F117 */
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),  /* PW_F117_F078 */
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),  /* PW_MF060_MF089 */
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),  /* PW_MF089_F060 */
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),  /* PW_MF050_MF256 */
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),  /* PW_MF256_F050 */
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),  /* PD_DESCALE_P1: half of 2^DESCALE_P1 */
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),  /* PD_DESCALE_P2: half of 2^DESCALE_P2 */
+  _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+                  CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)  /* PB_CENTERJSAMP */
+};
+
+#define PW_F130_F054   get_const_value(index_PW_F130_F054)   /* PW_: built from packed 16-bit values */
+#define PW_F054_MF130  get_const_value(index_PW_F054_MF130)  /* Fnnn / MFnnn: positive / negative factor, */
+#define PW_MF078_F117  get_const_value(index_PW_MF078_F117)  /* nnn = leading digits of its magnitude,    */
+#define PW_F117_F078   get_const_value(index_PW_F117_F078)   /* e.g. F130 = 0.541 + 0.765, MF089 = -0.899 */
+#define PW_MF060_MF089 get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060  get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256 get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050  get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1  get_const_value(index_PD_DESCALE_P1)  /* PD_: built from packed 32-bit values */
+#define PD_DESCALE_P2  get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP get_const_value(index_PB_CENTERJSAMP) /* PB_: built from packed bytes */
+
+
+#define test_m32_zero(mm32) (!(*(uint32_t *)&(mm32)))  /* 1 if the 32 bits read from lvalue mm32 are all 0; arg parenthesized so & applies to the whole argument */
+#define test_m64_zero(mm64) (!(*(uint64_t *)&(mm64)))  /* 1 if the 64 bits read from lvalue mm64 are all 0; NOTE(review): reads through a cast pointer - confirm aliasing is OK for __m32/__m64 */
+
+
+#define DO_IDCT_COMMON(PASS) { \
+  __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+  /* Odd-part math plus the final output stage; tmp0-3, tmp10l..tmp13h and out0-7 come from the expanding scope. */ \
+  z3 = _mm_add_pi16(tmp0, tmp2); \
+  z4 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+  tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+  /* The tmpNl/tmpNh written below are this macro's own locals; they shadow the pass macro's same-named variables. */ \
+  tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+  tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+  tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+  tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+  \
+  tmp0l = _mm_add_pi32(tmp0l, z3l); \
+  tmp0h = _mm_add_pi32(tmp0h, z3h); \
+  tmp3l = _mm_add_pi32(tmp3l, z4l); \
+  tmp3h = _mm_add_pi32(tmp3h, z4h); \
+  \
+  tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+  tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+  \
+  tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+  tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+  tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+  tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+  \
+  tmp1l = _mm_add_pi32(tmp1l, z4l); \
+  tmp1h = _mm_add_pi32(tmp1h, z4h); \
+  tmp2l = _mm_add_pi32(tmp2l, z3l); \
+  tmp2h = _mm_add_pi32(tmp2h, z3h); \
+  \
+  /* Final output stage */ \
+  /* Each pair below: even term +/- odd term, plus rounding bias, arithmetic shift by the pass descale, pack to 16 bits. */ \
+  out0l = _mm_add_pi32(tmp10l, tmp3l); \
+  out0h = _mm_add_pi32(tmp10h, tmp3h); \
+  out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+  out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+  \
+  out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+  out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+  out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+  out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out0 = _mm_packs_pi32(out0l, out0h); \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  \
+  out1l = _mm_add_pi32(tmp11l, tmp2l); \
+  out1h = _mm_add_pi32(tmp11h, tmp2h); \
+  out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+  out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  out2l = _mm_add_pi32(tmp12l, tmp1l); \
+  out2h = _mm_add_pi32(tmp12h, tmp1h); \
+  out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+  out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  \
+  out3l = _mm_add_pi32(tmp13l, tmp0l); \
+  out3h = _mm_add_pi32(tmp13h, tmp0h); \
+  \
+  out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+  out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+  out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+  out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+  out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+  \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+  out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 z23, z2, z3, z23l, z23h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  __m32 col0a, col1a, mm0; \
+  /* Column pass over the 4 columns at inptr/quantptr; results are written transposed to 4 lines at wsptr. */ \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  /* Cheap pre-test: the full AC-zero check runs only if the 32-bit loads from lines 1 and 2 are both zero. */ \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    /* OR together lines 1-7 (everything except line 0) of these 4 columns. */ \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l); \
+      dcval = _mm_slli_pi16(dcval, PASS1_BITS);  /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      /* Skip the full transform; label nextcolumn<iter> must be defined by the code expanding this macro. */ \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  /* Multiply each coefficient by its quantptr entry (low 16 bits of the product). */ \
+  z2 = _mm_mullo_pi16(col2l, quant2l); \
+  z3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  z23l = _mm_unpacklo_pi16(z2, z3); \
+  z23h = _mm_unpackhi_pi16(z2, z3); \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z2 = _mm_mullo_pi16(col0l, quant0l); \
+  z3 = _mm_mullo_pi16(col4l, quant4l); \
+  /* NOTE(review): _mm_loadlo/hi_pi16_f presumably widen to 32 bits with the value in the upper half, so the shift leaves value << CONST_BITS - confirm in jsimd_mmi.h. */ \
+  z23 = _mm_add_pi16(z2, z3); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(z2, z3); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  /* tmp0-3 are the expanding function's variables; DO_IDCT_COMMON reads them as its odd-part inputs. */ \
+  tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+  tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+  \
+  DO_IDCT_COMMON(1) \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  /* Each of the 4 input columns becomes one 8-element workspace line (low half, then high half). */ \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 z23, z23l, z23h; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+  __m64 col0, col1, col2, col3; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  /* Row pass: reads a 4-element-wide strip of all 8 workspace lines at wsptr and emits 4 output rows of 8 bytes. */ \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  z23l = _mm_unpacklo_pi16(row2l, row6l); \
+  z23h = _mm_unpackhi_pi16(row2l, row6l); \
+  \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z23 = _mm_add_pi16(row0l, row4l); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(row0l, row4l); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  /* Unlike pass 1, the workspace values are used directly (no quantptr multiply here). */ \
+  tmp0 = row7l; \
+  tmp1 = row5l; \
+  tmp2 = row3l; \
+  tmp3 = row1l; \
+  \
+  DO_IDCT_COMMON(2) \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  /* Pack the 16-bit results down to bytes, two result vectors per register. */ \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  /* Byte-wise add of CENTERJSAMPLE (PB_CENTERJSAMP holds it in all 8 bytes). */ \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  /* Output row ctr + k receives 8 bytes starting at offset output_col. */ \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+  /* tmpN/outN, inptr, quantptr and wsptr are referenced by name inside the DO_IDCT_* macros. */
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+  /* Two expansions of 4 columns each; nextcolumnN is the jump target of the macro's all-AC-zero shortcut. */
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+  /* Two expansions of 4 output rows each; the macro argument is the first output_buf row index. */
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c
new file mode 100644
index 0000000..f9a3f81
--- /dev/null
+++ b/simd/loongson/jquanti-mmi.c
@@ -0,0 +1,130 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
+  mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  mm0 = mm2; \
+  mm1 = mm3; \
+  \
+  mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1));   /* -1 if value < 0, */ \
+                                              /* 0 otherwise */ \
+  mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
+  \
+  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
+  mm1 = _mm_xor_si64(mm1, mm3); \
+  mm0 = _mm_sub_pi16(mm0, mm2); \
+  mm1 = _mm_sub_pi16(mm1, mm3); \
+  \
+  corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  mm0 = _mm_add_pi16(mm0, corr0);             /* correction + roundfactor */ \
+  mm1 = _mm_add_pi16(mm1, corr1); \
+  \
+  mm4 = mm0; \
+  mm5 = mm1; \
+  \
+  recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  mm0 = _mm_mulhi_pi16(mm0, recip0); \
+  mm1 = _mm_mulhi_pi16(mm1, recip1); \
+  \
+  mm0 = _mm_add_pi16(mm0, mm4);  /* reciprocal is always negative */ \
+  mm1 = _mm_add_pi16(mm1, mm5);  /* (MSB=1), so we always need to add the */ \
+                                 /* initial value (input value is never */ \
+                                 /* negative as we inverted it at the */ \
+                                 /* start of this routine) */ \
+  \
+  scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  mm6 = scale0; \
+  mm7 = scale1; \
+  mm4 = mm0; \
+  mm5 = mm1; \
+  \
+  mm0 = _mm_mulhi_pi16(mm0, mm6); \
+  mm1 = _mm_mulhi_pi16(mm1, mm7); \
+  \
+  mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1));   /* determine if scale... */ \
+                                              /* is negative */ \
+  mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
+  \
+  mm6 = _mm_and_si64(mm6, mm4);               /* and add input if it is */ \
+  mm7 = _mm_and_si64(mm7, mm5); \
+  mm0 = _mm_add_pi16(mm0, mm6); \
+  mm1 = _mm_add_pi16(mm1, mm7); \
+  \
+  mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1));   /* then check if... */ \
+  mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1));   /* negative input */ \
+  \
+  mm4 = _mm_and_si64(mm4, scale0);            /* and add scale if it is */ \
+  mm5 = _mm_and_si64(mm5, scale1); \
+  mm0 = _mm_add_pi16(mm0, mm4); \
+  mm1 = _mm_add_pi16(mm1, mm5); \
+  \
+  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
+  mm1 = _mm_xor_si64(mm1, mm3); \
+  mm0 = _mm_sub_pi16(mm0, mm2); \
+  mm1 = _mm_sub_pi16(mm1, mm3); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
+  _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
+  __m64 corr0, corr1, recip0, recip1, scale0, scale1;
+  /* Each DO_QUANT() handles 8 values and advances all pointers by DCTSIZE. */
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/simd/loongson/jsimd.c b/simd/loongson/jsimd.c
new file mode 100644
index 0000000..0f5c0e8
--- /dev/null
+++ b/simd/loongson/jsimd.c
@@ -0,0 +1,579 @@
+/*
+ * jsimd_loongson.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * Loongson architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+static unsigned int simd_support = ~0;
+
+/*
+ * Check what SIMD accelerations are supported and cache the result in
+ * simd_support, whose initial value ~0 means "not yet probed".
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)      /* already probed; reuse the cached mask */
+    return;
+
+  simd_support = JSIMD_MMI;     /* overwrite the sentinel (|= would keep ~0) */
+
+  /* JSIMD_FORCENONE=1 in the environment clears every capability bit */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_ycc_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_ycc_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_ycc_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_ycc_extrgb_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_ycc_extrgbx_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_ycc_extbgr_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_ycc_extbgrx_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_ycc_extxbgr_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_ycc_extxrgb_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_ycc_rgb_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
diff --git a/simd/loongson/jsimd_mmi.h b/simd/loongson/jsimd_mmi.h
new file mode 100644
index 0000000..0f71f75
--- /dev/null
+++ b/simd/loongson/jsimd_mmi.h
@@ -0,0 +1,57 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+
+#define SIZEOF_MMWORD 8
+#define BYTE_BIT 8
+#define WORD_BIT 16
+#define SCALEBITS 16
+/* Pack 8/4/2 scalars into one uint64_t; the first argument lands in the top bits. */
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+  (((uint64_t)(uint8_t)(a) << 56) | \
+   ((uint64_t)(uint8_t)(b) << 48) | \
+   ((uint64_t)(uint8_t)(c) << 40) | \
+   ((uint64_t)(uint8_t)(d) << 32) | \
+   ((uint64_t)(uint8_t)(e) << 24) | \
+   ((uint64_t)(uint8_t)(f) << 16) | \
+   ((uint64_t)(uint8_t)(g) << 8)  | \
+   ((uint64_t)(uint8_t)(h)))
+#define _uint64_set_pi16(a, b, c, d) (((uint64_t)(uint16_t)(a) << 48) | \
+                                      ((uint64_t)(uint16_t)(b) << 32) | \
+                                      ((uint64_t)(uint16_t)(c) << 16) | \
+                                      ((uint64_t)(uint16_t)(d)))
+#define _uint64_set_pi32(a, b) (((uint64_t)(uint32_t)(a) << 32) | \
+                                ((uint64_t)(uint32_t)(b)))
+
+#define get_const_value(index) (*(__m64 *)&const_value[index])
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h
new file mode 100644
index 0000000..4aea763
--- /dev/null
+++ b/simd/loongson/loongson-mmintrin.h
@@ -0,0 +1,1307 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+   load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)  /* returns a vector whose 64 bits are all clear */
+{
+  return 0.0;  /* +0.0 is the all-zero bit pattern (IEEE 754 double) */
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__b6 << 24) |
+                ((uint32_t)__b4 << 16) |
+                ((uint32_t)__b2 << 8) |
+                (uint32_t)__b0;
+  uint32_t hi = ((uint32_t)__b7 << 24) |
+                ((uint32_t)__b5 << 16) |
+                ((uint32_t)__b3 << 8) |
+                (uint32_t)__b1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklbh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklhw %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  } else if (__i1 == __i0) {
+    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+    __m64 ret;
+
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+       );
+
+    return ret;
+  } else {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  }
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+  __m64 ret;
+
+  asm("sll    $8, %1, 8\n\t"      /* $8 = b << 8 */
+      "or     $8, %1, $8\n\t"     /* $8 = b | (b << 8); input %1 is not written */
+      "mtc1   $8, %0\n\t"         /* move the byte pair into the FP register */
+      "mtc1   $0, $f0\n\t"        /* $f0 = 0, used as the shuffle selector */
+      "pshufh %0, %0, $f0\n\t"    /* presumably broadcasts halfword 0 to all lanes */
+      : "=f" (ret)
+      : "r" (__b0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+  __m64 ret;
+
+  asm("mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__h0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+  return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+  return _mm_set_pi8(__h7, __h6, __h5, __h4,
+                     __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+  return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaddhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+  int ret;
+
+  asm("pmovmskb %0, %1\n\t"
+      : "=r" (ret)
+      : "y" (__m1)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhuh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmullh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmuluw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psadbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pasubub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("biadd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("and %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("andn %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+  __m32 ret;  /* NOTE(review): 32-bit (float) temp, but the function returns __m64 */
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;  /* NOTE(review): implicit float->double conversion; confirm intent */
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "or" with all three operands in FP registers ("f"). */
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "xor" with all three operands in FP registers ("f"). */
+  asm("xor %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psllh" gets an FP operand. */
+  asm("psllh  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psllw" gets an FP operand. */
+  asm("psllw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "dsll" gets an FP operand. */
+  asm("dsll  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psrlh" gets an FP operand. */
+  asm("psrlh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psrlw" gets an FP operand. */
+  asm("psrlw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "dsrl" gets an FP operand. */
+  asm("dsrl  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psrah" gets an FP operand. */
+  asm("psrah %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "psraw" gets an FP operand. */
+  asm("psraw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+  /* __count's bits are reinterpreted as __m64 so "dsra" gets an FP operand. */
+  asm("dsra %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+  return *(__m64 *)&x;  /* reinterprets x's storage; no value conversion */
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+  return *(uint64_t *)&x;  /* reinterprets x's storage; no value conversion */
+}
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpeqb": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpeqb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpeqh": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpeqh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpeqw": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpeqw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpgtb": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpgtb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpgth": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpgth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpgtw": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpgtw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpltb": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpltb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmplth": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmplth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "pcmpltw": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("pcmpltw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "packsshb": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("packsshb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "packsswh": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Same "packsswh" instruction and operand types as _mm_packs_pi32(). */
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "packushb": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("packushb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+  __m64 ret;
+  /* __pos's bits are reinterpreted as __m64 so "pextrh" gets an FP operand. */
+  asm("pextrh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__pos)
+     );
+
+  return ret;
+}
+
+/*
+ * Insert via one of the four "pinsrh_N" instructions, chosen by __pos.
+ * Operands: %0 = result, %1 = __m1, %2 = __m2.  For __pos outside 0..3 no
+ * instruction is emitted and __m1 is returned unchanged (the result was
+ * previously left uninitialized in that case).
+ */
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+  __m64 ret;
+  /* The position is encoded in the mnemonic, so no operand carries __pos;
+     the former unused "i" (__pos) input only added a constant-operand
+     requirement on the caller. */
+  switch (__pos) {
+  case 0:
+    asm("pinsrh_0 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+       );
+    break;
+  case 1:
+    asm("pinsrh_1 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+       );
+    break;
+  case 2:
+    asm("pinsrh_2 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+       );
+    break;
+  case 3:
+    asm("pinsrh_3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2)
+       );
+    break;
+  default:
+    ret = __m1;
+    break;
+  }
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+  __m64 ret;
+  /* __n's bits are reinterpreted as __m64 so "pshufh" gets an FP operand. */
+  asm("pshufh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__n)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpckhbh": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Same "punpckhbh" instruction and operand types as _mm_unpackhi_pi8(). */
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpckhhw": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Same "punpckhhw" instruction and operand types as _mm_unpackhi_pi16(). */
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpckhwd": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpckhwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpcklbh": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpcklbh" with both inputs passed as full __m64 values. */
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpcklbh"; the first input is __m32, the second is __m64. */
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpcklhw": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Same "punpcklhw" instruction and operand types as _mm_unpacklo_pi16(). */
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Emits one "punpcklwd": %0 is ret, %1 is __m1, %2 is __m2 (FP registers). */
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+  /* Same "punpcklwd" instruction and operand types as _mm_unpacklo_pi32(). */
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+  src = _mm_packs_pu16(src, _mm_setzero_si64());
+  /* src was packed against a zero operand above; "swc1" stores it to *dest. */
+  asm("swc1 %1, %0\n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7+%0\n\t"  /* addresses offset 7 from *dest */
+      "gssdrc1 %1, %0\n\t"    /* addresses *dest itself */
+      : "=m" (*dest)          /* %0: destination memory operand */
+      : "f" (src)             /* %1: value, in an FP register */
+      : "memory"
+     );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+  __m32 ret;
+  /* Emits one "lwc1" that loads *src (memory operand) into an FP register. */
+  asm("lwc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+     );
+  /* NOTE(review): ret is __m32 but the declared return type is __m64. */
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+  __m64 ret;
+  /* Emits one "ldc1" that loads *src (memory operand) into an FP register. */
+  asm("ldc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{  /* reads *src as __m32 and unpacks it against a zero operand */
+  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{  /* unpacks src (first operand) against a zero second operand */
+  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{  /* unpacks src (first operand) against a zero second operand */
+  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{  /* unpacks src (first operand) against a zero second operand */
+  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{  /* zero is the FIRST operand here and src the second */
+  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{  /* unpacks src (first operand) against a zero second operand */
+  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{  /* zero is the FIRST operand here and src the second */
+  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{  /* shuffles pixel with selector _MM_SHUFFLE(3, 3, 3, 3) */
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{  /* shuffles pixel with selector _MM_SHUFFLE(0, 0, 0, 0) */
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif  /* __LOONGSON_MMINTRIN_H__ */
diff --git a/simd/mips/jsimd.c b/simd/mips/jsimd.c
new file mode 100644
index 0000000..a9c7ebd
--- /dev/null
+++ b/simd/mips/jsimd.c
@@ -0,0 +1,1084 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+/* Scan /proc/cpuinfo for a line containing search_string.  Resets
+ * simd_support to 0; on a match sets JSIMD_DSPR2 and returns 1, else 0. */
+LOCAL(int)
+parse_proc_cpuinfo(const char *search_string)
+{
+  char line_buf[256];
+  int matched = 0;
+  FILE *cpuinfo;
+
+  simd_support = 0;
+
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo == NULL)
+    return 0;                   /* file unavailable: treat as "not found" */
+  while (!matched && fgets(line_buf, sizeof(line_buf), cpuinfo) != NULL)
+    matched = (strstr(line_buf, search_string) != NULL);
+  fclose(cpuinfo);
+
+  if (matched)
+    simd_support |= JSIMD_DSPR2;
+  return matched;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.  Detection runs only while
+ * simd_support still holds the ~0 "not yet probed" value.
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  if (!parse_proc_cpuinfo("MIPS 74K"))
+    return;  /* NOTE: the environment overrides below are skipped here */
+#endif
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_DSPR2;
+  env = getenv("JSIMD_FORCENONE");  /* checked last, so it wins over FORCEDSPR2 */
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+static const int mips_idct_ifast_coefs[4] = {  /* 16-bit value in both halves */
+  0x45404540,           /* FIX( 1.082392200 / 2) =  17734 = 0x4546 */
+  0x5A805A80,           /* FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
+  0x76407640,           /* FIX( 1.847759065 / 2) =  30274 = 0x7642 */
+  0xAC60AC60            /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};  /* NOTE(review): stored halfwords differ from the FIX() values in low bits */
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+typedef struct {
+  struct jpeg_upsampler pub;
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  upsample1_ptr methods[MAX_COMPONENTS];
+  int next_row_out;
+  JDIMENSION rows_to_go;
+  int rowgroup_height[MAX_COMPONENTS];
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+/* jsimd_int_upsample() casts cinfo->upsample to this pointer type. */
+typedef my_upsampler *my_upsample_ptr;
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes,
+ * RGB_PIXELSIZE is 3 or 4, and init_simd() left the JSIMD_DSPR2 bit set in
+ * simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes,
+ * RGB_PIXELSIZE is 3 or 4, and init_simd() left the JSIMD_DSPR2 bit set in
+ * simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes,
+ * RGB_PIXELSIZE is 3 or 4, and init_simd() left the JSIMD_DSPR2 bit set in
+ * simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (RGB_PIXELSIZE != 3 && RGB_PIXELSIZE != 4)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;  /* never available; jsimd_ycc_rgb565_convert() is an empty stub */
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/* Pick the DSPr2 RGB->YCC routine matching cinfo->in_color_space and call it
+ * with cinfo->image_width plus the caller's buffers and row arguments. */
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  /* JCS_EXT_RGB and every unlisted color space use the extrgb routine */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_ycc_convert_dspr2;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Pick the DSPr2 RGB->gray routine matching cinfo->in_color_space and call
+ * it with cinfo->image_width plus the caller's buffers and row arguments. */
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  /* JCS_EXT_RGB and every unlisted color space use the extrgb routine */
+  void (*convert) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int) =
+    jsimd_extrgb_gray_convert_dspr2;
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+/* Pick the DSPr2 YCC->RGB routine matching cinfo->out_color_space and call
+ * it with cinfo->output_width plus the caller's buffers and row arguments. */
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  /* JCS_EXT_RGB and every unlisted color space use the extrgb routine */
+  void (*convert) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int) =
+    jsimd_ycc_extrgb_convert_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    convert = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    convert = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    convert = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    convert = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    convert = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  convert(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{  /* empty stub: does nothing; jsimd_can_ycc_rgb565() returns 0 */
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{  /* forwards to the DSPr2 routine, adding image_width and num_components */
+  jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+                             output_row, num_rows, cinfo->num_components);
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, DCTSIZE
+ * is 8, and init_simd() left the JSIMD_DSPR2 bit set in simd_support;
+ * returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* forwards selected cinfo/compptr fields and both buffers to DSPr2 code */
+  jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* the DSPr2 routine takes the buffers first, then the scalar settings */
+  jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+                                     compptr->v_samp_factor,
+                                     cinfo->max_v_samp_factor,
+                                     cinfo->smoothing_factor,
+                                     compptr->width_in_blocks,
+                                     cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{  /* forwards selected cinfo/compptr fields and both buffers to DSPr2 code */
+  jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* compptr is not used; only cinfo fields and the buffers are forwarded */
+  jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* compptr is not used; only cinfo fields and the buffers are forwarded */
+  jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+  /* Expansion factors are looked up by compptr->component_index. */
+  jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+                           upsample->v_expand[compptr->component_index],
+                           input_data, output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* width passed is compptr->downsampled_width, not cinfo->output_width */
+  jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* width passed is compptr->downsampled_width, not cinfo->output_width */
+  jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if samples are 8 bits wide, JDIMENSION is 4 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (BITS_IN_JSAMPLE != 8 || sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/* Pick the DSPr2 h2v2 merged-upsample routine for cinfo->out_color_space and
+ * call it with cinfo->output_width and cinfo->sample_range_limit. */
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* JCS_EXT_RGB and every unlisted color space use the extrgb routine */
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
+                    JSAMPLE *) = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    upsample = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+/* Pick the DSPr2 h2v1 merged-upsample routine for cinfo->out_color_space and
+ * call it with cinfo->output_width and cinfo->sample_range_limit. */
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  /* JCS_EXT_RGB and every unlisted color space use the extrgb routine */
+  void (*upsample) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
+                    JSAMPLE *) = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    upsample = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    upsample = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    upsample = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    upsample = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    upsample = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    break;
+  }
+
+  upsample(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, samples are 8 bits wide, JDIMENSION is
+ * 4 bytes, DCTELEM is 2 bytes, and init_simd() left the JSIMD_DSPR2 bit set
+ * in simd_support.
+ * Returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, JCOEF and ISLOW_MULT_TYPE are 2 bytes,
+ * samples are 8 bits wide, JDIMENSION is 4 bytes, and init_simd() left the
+ * JSIMD_DSPR2 bit set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  int build_ok;
+
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  build_ok = DCTSIZE == 8 &&
+             sizeof(JCOEF) == 2 &&
+             BITS_IN_JSAMPLE == 8 &&
+             sizeof(JDIMENSION) == 4 &&
+             sizeof(ISLOW_MULT_TYPE) == 2;
+
+  /* Run-time capability bit */
+  return (build_ok && (simd_support & JSIMD_DSPR2)) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{  /* all three arguments are passed through unchanged */
+  jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{  /* all three arguments are passed through unchanged */
+  jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, DCTELEM is 2 bytes, and init_simd() left
+ * the JSIMD_DSPR2 bit set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, DCTELEM is 2 bytes, and init_simd() left
+ * the JSIMD_DSPR2 bit set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (DCTSIZE != 8 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  /* Run-time capability bit */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;  /* never available; jsimd_fdct_float() is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_dspr2(data);  /* delegate to the DSPr2 routine */
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_dspr2(data);  /* delegate to the DSPr2 routine */
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{  /* empty stub: does nothing; jsimd_can_fdct_float() returns 0 */
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, JCOEF and DCTELEM are both 2 bytes, and
+ * init_simd() left the JSIMD_DSPR2 bit set in simd_support; returns 0
+ * otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(DCTELEM) != 2)
+    return 0;
+
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, JCOEF and ISLOW_MULT_TYPE are 2 bytes,
+ * samples are 8 bits wide, JDIMENSION is 4 bytes, and init_simd() left the
+ * JSIMD_DSPR2 bit set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  int build_ok;
+
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  build_ok = DCTSIZE == 8 &&
+             sizeof(JCOEF) == 2 &&
+             BITS_IN_JSAMPLE == 8 &&
+             sizeof(JDIMENSION) == 4 &&
+             sizeof(ISLOW_MULT_TYPE) == 2;
+
+  /* Run-time capability bit */
+  return (build_ok && (simd_support & JSIMD_DSPR2)) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{  /* all three arguments are passed through unchanged */
+  jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{  /* all three arguments are passed through unchanged */
+  jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+}
+
+/*
+ * Returns 1 only if DCTSIZE is 8, JCOEF and ISLOW_MULT_TYPE are 2 bytes,
+ * samples are 8 bits wide, JDIMENSION is 4 bytes, and init_simd() left the
+ * JSIMD_DSPR2 bit set in simd_support; returns 0 otherwise.
+ */
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  int build_ok;
+
+  init_simd();
+
+  /* Compile-time constraints the SIMD code was written for */
+  build_ok = DCTSIZE == 8 &&
+             sizeof(JCOEF) == 2 &&
+             BITS_IN_JSAMPLE == 8 &&
+             sizeof(JDIMENSION) == 4 &&
+             sizeof(ISLOW_MULT_TYPE) == 2;
+
+  /* Run-time capability bit */
+  return (build_ok && (simd_support & JSIMD_DSPR2)) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  /*
+   * Report whether jsimd_idct_4x4() may be used.
+   * Returns 1 if so, 0 otherwise.
+   */
+  init_simd();
+
+  /*
+   * The code is optimised for these values only: DCTSIZE and BITS_IN_JSAMPLE
+   * of 8, 2-byte JCOEF and ISLOW_MULT_TYPE, and 4-byte JDIMENSION.
+   */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Requirements hold; the result is the DSPr2 bit of simd_support. */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  /*
+   * Report whether jsimd_idct_6x6() may be used.
+   * Returns 1 if so, 0 otherwise.
+   */
+  init_simd();
+
+  /*
+   * The code is optimised for these values only: DCTSIZE and BITS_IN_JSAMPLE
+   * of 8, 2-byte JCOEF and ISLOW_MULT_TYPE, and 4-byte JDIMENSION.
+   */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Requirements hold; the result is the DSPr2 bit of simd_support. */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  /*
+   * Report whether jsimd_idct_12x12() may be used (1 if so, 0 otherwise).
+   */
+  init_simd();
+
+  /*
+   * The code is optimised for these values only: DCTSIZE and BITS_IN_JSAMPLE
+   * of 8, 2-byte JCOEF and ISLOW_MULT_TYPE, and 4-byte JDIMENSION.
+   */
+  if (BITS_IN_JSAMPLE != 8 || DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Requirements hold; the result is the DSPr2 bit of simd_support. */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* forwards to the DSPr2 routine; cinfo is not used by this wrapper */
+  jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* forwards to the DSPr2 routine; cinfo is not used by this wrapper */
+  int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+  jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+                       workspace);  /* scratch lives on this wrapper's stack */
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* forwards to the DSPr2 routine; cinfo is not used by this wrapper */
+  jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /*
+   * Runs the two DSPr2 12x12 IDCT passes on coef_block.  cinfo is unused.
+   *
+   * Pass 1 is given the coefficients, compptr->dct_table and workspace;
+   * pass 2 is given workspace plus a table of 12 destination addresses,
+   * one per output row, each already offset by output_col.  The addresses
+   * are handed over as ints, so this relies on a pointer fitting in an
+   * int on the target.
+   */
+  int workspace[96];
+  int output[12];
+  int row;
+
+  for (row = 0; row < 12; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  /*
+   * Report whether jsimd_idct_islow() may be used.
+   * Returns 1 if so, 0 otherwise.
+   */
+  init_simd();
+
+  /*
+   * The code is optimised for these values only: DCTSIZE and BITS_IN_JSAMPLE
+   * of 8, 2-byte JCOEF and ISLOW_MULT_TYPE, and 4-byte JDIMENSION.
+   */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  /* Requirements hold; the result is the DSPr2 bit of simd_support. */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  /*
+   * Report whether jsimd_idct_ifast() may be used.
+   * Returns 1 if so, 0 otherwise.
+   */
+  init_simd();
+
+  /*
+   * The code is optimised for these values only: DCTSIZE and BITS_IN_JSAMPLE
+   * of 8, IFAST_SCALE_BITS of 2, 2-byte JCOEF and IFAST_MULT_TYPE, and
+   * 4-byte JDIMENSION.  Any other configuration is rejected before the
+   * support flags are consulted.
+   */
+  if (DCTSIZE != 8 || BITS_IN_JSAMPLE != 8 || IFAST_SCALE_BITS != 2)
+    return 0;
+  if (sizeof(JCOEF) != 2 || sizeof(JDIMENSION) != 4 ||
+      sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+
+  /* Requirements hold; the result is the DSPr2 bit of simd_support. */
+  return (simd_support & JSIMD_DSPR2) ? 1 : 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;  /* always 0: jsimd_idct_float() in this file is an empty stub */
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /*
+   * Hands coef_block and compptr->dct_table to jsimd_idct_islow_dspr2()
+   * together with IDCT_range_limit(cinfo) and a table of the 8 destination
+   * addresses (output_buf[row] + output_col), passed as ints.
+   */
+  int output[8];
+  int row;
+
+  for (row = 0; row < 8; row++)
+    output[row] = (int)(output_buf[row] + output_col);
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  /* Two-pass DSPr2 IDCT driver; cinfo is not used by this wrapper. */
+  DCTELEM workspace[DCTSIZE2];  /* carries data from pass 1 to pass 2 */
+
+  /*
+   * Pass 1: process columns from coef_block, with compptr->dct_table viewed
+   * as the IFAST_MULT_TYPE multiplier table, and store into workspace.
+   */
+  jsimd_idct_ifast_cols_dspr2(coef_block,
+                              (IFAST_MULT_TYPE *)compptr->dct_table,
+                              workspace, mips_idct_ifast_coefs);
+
+  /*
+   * Pass 2: process rows from workspace and store into output_buf.  The
+   * results must be descaled by a factor of 8 == 2**3 here, and the
+   * PASS1_BITS scaling undone as well.
+   */
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* empty stub: no argument is used; jsimd_can_idct_float() returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;  /* always 0: jsimd_huff_encode_one_block() here is a stub */
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;  /* stub: no argument is used and nothing is written */
+}
diff --git a/simd/mips/jsimd_dspr2.S b/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000..2ec543e
--- /dev/null
+++ b/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4471 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ *                          All Rights Reserved.
+ * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
+ *           Darko Laus       <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0     = cinfo->image_width      a1     = input_buf
+ * a2     = output_buf              a3     = output_row
+ * 16(sp) = num_rows                20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression: for each row and each component ci, bytes
+ * ci, ci + num_components, ... of the input row go to output_buf[ci][row].
+ * NOTE(review): the 4-bytes-per-pass loops (3: and 6:) run at least once, so
+ * an image_width below 4 would overrun -- confirm callers guarantee >= 4.
+ */
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    lw          t9, 24(sp)      // t9 = num_rows
+    lw          s0, 28(sp)      // s0 = cinfo->num_components
+    andi        t0, a0, 3       // t0 = cinfo->image_width & 3
+    beqz        t0, 4f          // no residual
+     nop
+0:                              // row loop used when t0 != 0
+    addiu       t9, t9, -1      // --num_rows
+    bltz        t9, 7f          // no rows left
+     li         t1, 0           // (delay slot) t1 = ci = 0
+1:                              // component loop, t1 = ci
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
+    lw          t2, 0(a1)       // t2 = inptr = *input_buf
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
+    addu        t2, t2, t1      // inptr += ci
+    addu        s1, t5, a0      // s1 = outptr + image_width (end of row)
+    addu        t6, t5, t0      // t6 = outptr + residual byte count
+2:                              // residual bytes, one per pass
+    lbu         t3, 0(t2)
+    addiu       t5, t5, 1
+    sb          t3, -1(t5)
+    bne         t6, t5, 2b
+     addu       t2, t2, s0      // (delay slot) inptr += num_components
+3:                              // remaining bytes, four per pass
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0      // inptr += 4 * num_components
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 3b      // until outptr reaches the end of the row
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1       // ++ci
+    bne         t1, s0, 1b      // until ci == num_components
+     nop
+    addiu       a1, a1, 4       // step input_buf to the next row pointer
+    bgez        t9, 0b
+     addiu      a3, a3, 1       // (delay slot) ++output_row
+    b           7f
+     nop
+4:                              // row loop used when t0 == 0
+    addiu       t9, t9, -1      // --num_rows
+    bltz        t9, 7f          // no rows left
+     li         t1, 0           // (delay slot) t1 = ci = 0
+5:                              // component loop, t1 = ci
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      // t5 = outptr = output_buf[ci]
+    lw          t2, 0(a1)       // t2 = inptr = *input_buf
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      // t5 = outptr = output_buf[ci][output_row]
+    addu        t2, t2, t1      // inptr += ci
+    addu        s1, t5, a0      // s1 = outptr + image_width (end of row)
+    addu        t6, t5, t0      // t6 is not read on this path (t0 == 0)
+6:                              // whole row, four bytes per pass
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0      // inptr += 4 * num_components
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 6b      // until outptr reaches the end of the row
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1       // ++ci
+    bne         t1, s0, 5b      // until ci == num_components
+     nop
+    addiu       a1, a1, 4       // step input_buf to the next row pointer
+    bgez        t9, 4b
+     addiu      a3, a3, 1       // (delay slot) ++output_row
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)     // load the R, G and B bytes of the
+    lbu         \g, \g_offs(\inptr)     // pixel at inptr, using the byte
+    lbu         \b, \b_offs(\inptr)     // offsets given to the outer macro
+    addiu       \inptr, \pixel_size     // step inptr to the next pixel
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0 = cinfo->image_width, a1 = input_buf, a2 = output_buf, a3 = output_row,
+ * 16(sp) = num_rows
+ *
+ * For each of num_rows rows, converts image_width pixels of *input_buf into
+ * one byte per pixel in each of output_buf[0..2][output_row].
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          t7, 48(sp)      // t7 = num_rows
+    li          s0, 0x4c8b      // FIX(0.29900)
+    li          s1, 0x9646      // FIX(0.58700)
+    li          s2, 0x1d2f      // FIX(0.11400)
+    li          s3, 0xffffd4cd  // -FIX(0.16874)
+    li          s4, 0xffffab33  // -FIX(0.33126)
+    li          s5, 0x8000      // FIX(0.50000)
+    li          s6, 0xffff94d1  // -FIX(0.41869)
+    li          s7, 0xffffeb2f  // -FIX(0.08131)
+    li          t8, 0x807fff    // CBCR_OFFSET + ONE_HALF-1
+
+0:
+    addiu       t7, -1          // --num_rows
+    lw          t6, 0(a1)       // t6 = input_buf[0]
+    lw          t0, 0(a2)       // t0 = output_buf[0]
+    lw          t1, 4(a2)       // t1 = output_buf[1]
+    lw          t2, 8(a2)       // t2 = output_buf[2]
+    sll         t3, a3, 2       // t3 = output_row * 4 (row-pointer offset)
+    lwx         t0, t3(t0)      // t0 = output_buf[0][output_row]
+    lwx         t1, t3(t1)      // t1 = output_buf[1][output_row]
+    lwx         t2, t3(t2)      // t2 = output_buf[2][output_row]
+
+    addu        t9, t2, a0      // t9 = end address
+    addiu       a3, 1           // ++output_row
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo        s5, $ac0        // LO(ac0) = 0x8000
+    mtlo        t8, $ac1        // LO(ac1) = 0x807fff
+    mtlo        t8, $ac2        // LO(ac2) = 0x807fff
+    maddu       $ac0, s2, t5    // ac0 += FIX(0.11400) * b
+    maddu       $ac1, s5, t5    // ac1 += FIX(0.50000) * b
+    maddu       $ac2, s5, t3    // ac2 += FIX(0.50000) * r
+    maddu       $ac0, s0, t3    // ac0 += FIX(0.29900) * r
+    maddu       $ac1, s3, t3    // ac1 += -FIX(0.16874) * r
+    maddu       $ac2, s6, t4    // ac2 += -FIX(0.41869) * g
+    maddu       $ac0, s1, t4    // ac0 += FIX(0.58700) * g
+    maddu       $ac1, s4, t4    // ac1 += -FIX(0.33126) * g
+    maddu       $ac2, s7, t5    // ac2 += -FIX(0.08131) * b
+    extr.w      t3, $ac0, 16
+    extr.w      t4, $ac1, 16
+    extr.w      t5, $ac2, 16
+    sb          t3, 0(t0)       // ac0 result -> output_buf[0] row
+    sb          t4, 0(t1)       // ac1 result -> output_buf[1] row
+    sb          t5, 0(t2)       // ac2 result -> output_buf[2] row
+    addiu       t0, 1
+    addiu       t2, 1
+    bne         t2, t9, 1b      // until the output_buf[2] row is full
+     addiu      t1, 1
+    bgtz        t7, 0b          // more rows to do
+     addiu      a1, 4           // (delay slot) next input row pointer
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r_offs(\outptr)     // scratch0 -> R byte
+    sb          \scratch1, \g_offs(\outptr)     // scratch1 -> G byte
+    sb          \scratch2, \b_offs(\outptr)     // scratch2 -> B byte
+.if (\pixel_size == 4)
+    li          t0, 0xFF                        // clobbers t0
+    sb          t0, \a_offs(\outptr)            // fourth byte = 0xFF
+.endif
+    addiu       \outptr, \pixel_size            // step to the next pixel
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0 = cinfo->image_width, a1 = input_buf, a2 = input_row, a3 = output_buf,
+ * 16(sp) = num_rows
+ *
+ * For each of num_rows rows, reads one byte per pixel from each of
+ * input_buf[0..2][input_row] and writes image_width pixels to *output_buf.
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s1, 48(sp)      // s1 = num_rows
+    li          t3, 0x8000      // added to the green sum before its >> 16
+    li          t4, 0x166e9     // FIX(1.40200)
+    li          t5, 0x1c5a2     // FIX(1.77200)
+    li          t6, 0xffff492e  // -FIX(0.71414)
+    li          t7, 0xffffa7e6  // -FIX(0.34414)
+    repl.ph     t8, 128         // t8 = 128 in both halfwords
+
+0:
+    lw          s0, 0(a3)       // s0 = outptr = *output_buf
+    lw          t0, 0(a1)       // t0 = input_buf[0]
+    lw          t1, 4(a1)       // t1 = input_buf[1]
+    lw          t2, 8(a1)       // t2 = input_buf[2]
+    sll         s5, a2, 2       // s5 = input_row * 4 (row-pointer offset)
+    addiu       s1, -1          // --num_rows
+    lwx         s2, s5(t0)      // s2 = input_buf[0][input_row]
+    lwx         s3, s5(t1)      // s3 = input_buf[1][input_row]
+    lwx         s4, s5(t2)      // s4 = input_buf[2][input_row]
+    addu        t9, s2, a0      // t9 = end address of the input_buf[0] row
+    addiu       a2, 1           // ++input_row
+
+1:
+    lbu         s7, 0(s4)       // cr
+    lbu         s6, 0(s3)       // cb
+    lbu         s5, 0(s2)       // y
+    addiu       s2, 1
+    addiu       s4, 1
+    addiu       s7, -128        // cr - 128
+    addiu       s6, -128        // cb - 128
+    mul         t2, t7, s6      // -FIX(0.34414) * (cb - 128)
+    mul         t0, t6, s7      // Crgtab[cr]
+    sll         s7, 15
+    mulq_rs.w   t1, t4, s7      // Crrtab[cr]
+    sll         s6, 15
+    addu        t2, t3          // Cbgtab[cb]
+    addu        t2, t0
+
+    mulq_rs.w   t0, t5, s6      // Cbbtab[cb]
+    sra         t2, 16
+    addu        t1, s5          // add y
+    addu        t2, s5          // add y
+    ins         t2, t1, 16, 16  // pack t1 into the upper halfword of t2
+    subu.ph     t2, t2, t8
+    addu        t0, s5          // add y
+    shll_s.ph   t2, t2, 8
+    subu        t0, 128
+    shra.ph     t2, t2, 8
+    shll_s.w    t0, t0, 24
+    addu.ph     t2, t2, t8      // clip & store
+    sra         t0, t0, 24
+    sra         t1, t2, 16      // t1 = upper halfword of t2
+    addiu       t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne         s2, t9, 1b      // until the input_buf[0] row is consumed
+     addiu      s3, 1
+    bgtz        s1, 0b          // more rows to do
+     addiu      a3, 4           // (delay slot) next output row pointer
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
+                                              r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)     // load the R, G and B bytes of the
+    lbu         \g, \g_offs(\inptr)     // pixel at inptr, using the byte
+    lbu         \b, \b_offs(\inptr)     // offsets given to the outer macro
+    addiu       \inptr, \pixel_size     // step inptr to the next pixel
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0 = cinfo->image_width, a1 = input_buf, a2 = output_buf, a3 = output_row,
+ * 16(sp) = num_rows
+ *
+ * For each of num_rows rows, converts image_width pixels of *input_buf into
+ * one byte per pixel in output_buf[0][output_row].
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li          s0, 0x4c8b      // s0 = FIX(0.29900)
+    li          s1, 0x9646      // s1 = FIX(0.58700)
+    li          s2, 0x1d2f      // s2 = FIX(0.11400)
+    li          s7, 0x8000      // s7 = FIX(0.50000)
+    lw          s6, 48(sp)      // s6 = num_rows
+    andi        t7, a0, 3       // t7 = image_width & 3
+
+0:
+    addiu       s6, -1          // --num_rows
+    lw          t0, 0(a1)       // t0 = inptr = *input_buf
+    lw          t1, 0(a2)       // t1 = output_buf[0]
+    sll         t3, a3, 2       // t3 = output_row * 4 (row-pointer offset)
+    lwx         t1, t3(t1)      // t1 = outptr = output_buf[0][output_row]
+    addiu       a3, 1           // ++output_row
+    addu        t9, t1, a0      // t9 = outptr + image_width (end of row)
+    subu        t8, t9, t7      // t8 = end of the four-pixel groups
+    beq         t1, t8, 2f      // fewer than four pixels in the row
+     nop
+
+1:                              // four pixels per pass
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0        // LO(ac0) = 0x8000
+    maddu       $ac0, s2, t5    // ac0 += FIX(0.11400) * b
+    maddu       $ac0, s1, t4    // ac0 += FIX(0.58700) * g
+    maddu       $ac0, s0, t3    // ac0 += FIX(0.29900) * r
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t6, $ac0, 16    // t6 = first pixel of the group
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    extr.w      t2, $ac1, 16    // t2 = second pixel of the group
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t5, $ac0, 16    // t5 = third pixel of the group
+    sb          t6, 0(t1)
+    sb          t2, 1(t1)
+    extr.w      t3, $ac1, 16    // t3 = fourth pixel of the group
+    addiu       t1, 4
+    sb          t5, -2(t1)
+    sb          t3, -1(t1)
+    bne         t1, t8, 1b
+     nop
+
+2:
+    beqz        t7, 4f          // no residual pixels
+     nop
+
+3:                              // residual pixels, one per pass
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    extr.w      t6, $ac0, 16
+    sb          t6, 0(t1)
+    addiu       t1, 1
+    bne         t1, t9, 3b      // until outptr reaches the end of the row
+     nop
+
+4:
+    bgtz        s6, 0b          // more rows to do
+     addiu      a1, 4           // (delay slot) next input row pointer
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)    // first pixel: R, G, B
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)    // second pixel: R, G, B
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          \scratch0, 0xFF                 // clobbers scratch0
+    sb          \scratch0, \a1_offs(\outptr)    // fourth byte of each pixel
+    sb          \scratch0, \a2_offs(\outptr)    // is set to 0xFF
+.endif
+    addiu       \outptr, \pixel_size            // step past both pixels
+.endm
+
+.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)    // single pixel: R, G, B
+    sb          \scratch1, \g1_offs(\outptr)    // (outptr is not advanced)
+    sb          \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+    li          t0, 0xFF                        // clobbers t0
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0 = cinfo->output_width, a1 = input_buf, a2 = in_row_group_ctr,
+ * a3 = output_buf, 16(sp) = cinfo->sample_range_limit
+ *
+ * Produces two output rows (output_buf[0] and output_buf[1]) from two rows of
+ * input_buf[0] and one row each of input_buf[1] and input_buf[2].
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    lw          t9, 56(sp)      // cinfo->sample_range_limit
+    lw          v0, 0(a1)       // v0 = input_buf[0]
+    lw          v1, 4(a1)       // v1 = input_buf[1]
+    lw          t0, 8(a1)       // t0 = input_buf[2]
+    sll         t1, a2, 3       // t1 = in_row_group_ctr * 2 * 4
+    addiu       t2, t1, 4       // t2 = (in_row_group_ctr * 2 + 1) * 4
+    sll         t3, a2, 2       // t3 = in_row_group_ctr * 4
+    lw          t4, 0(a3)       // t4 = output_buf[0]
+    lwx         t1, t1(v0)      // t1 = input_buf[0][in_row_group_ctr*2]
+    lwx         t2, t2(v0)      // t2 = input_buf[0][in_row_group_ctr*2 + 1]
+    lwx         t5, t3(v1)      // t5 = input_buf[1][in_row_group_ctr]
+    lwx         t6, t3(t0)      // t6 = input_buf[2][in_row_group_ctr]
+    lw          t7, 4(a3)       // t7 = output_buf[1]
+    li          s1, 0xe6ea
+    addiu       t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
+    addiu       s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
+    addiu       s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
+    xori        s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
+    srl         t3, a0, 1       // t3 = output_width / 2 (pixel pairs)
+    blez        t3, 2f
+     addu       t0, t5, t3      // t0 = end address
+ 1:
+    lbu         t3, 0(t5)       // t3 = cb
+    lbu         s3, 0(t6)       // s3 = cr
+    addiu       t5, t5, 1
+    addiu       t3, t3, -128    // (cb - 128)
+    addiu       s3, s3, -128    // (cr - 128)
+    mult        $ac1, s1, t3    // ac1 = s1 * (cb - 128)
+    madd        $ac1, s2, s3    // ac1 += s2 * (cr - 128)
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+    extr_r.w    s5, $ac1, 16    // s5 = green offset taken from ac1
+    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+    lbu         v0, 0(t1)       // v0 = y, first input row, first pixel
+    addiu       t6, t6, 1
+    addiu       t1, t1, 2
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         AT, 0(t3)       // look the three sums up in the table at t9
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)       // ra is scratch here (it is in the save list)
+    lbu         v0, -1(t1)      // v0 = y, first input row, second pixel
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)       // v0 = y, second input row, first pixel
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, 1(t2)       // v0 = y, second input row, second pixel
+    addiu       t2, t2, 2
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+    bne         t0, t5, 1b      // until the input_buf[1] row pointer hits t0
+     nop
+2:
+    andi        t0, a0, 1       // odd output_width leaves one pixel per row
+    beqz        t0, 4f
+     lbu        t3, 0(t5)       // (delay slot) t3 = cb
+    lbu         s3, 0(t6)       // s3 = cr
+    addiu       t3, t3, -128    // (cb - 128)
+    addiu       s3, s3, -128    // (cr - 128)
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    lbu         v0, 0(t1)       // v0 = y, first input row
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s4, t8, s3      // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
+    mulq_rs.w   s6, s0, t3      // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)       // v0 = y, second input row
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+    addu        t3, v0, s4      // y+cred
+    addu        s3, v0, s5      // y+cgreen
+    addu        v1, v0, s6      // y+cblue
+    addu        t3, t9, t3      // y+cred
+    addu        s3, t9, s3      // y+cgreen
+    addu        v1, t9, v1      // y+cblue
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+    sb          t0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    li          t0, 0xe6ea
+    lw          t1, 0(a1)         // t1 = input_buf[0]
+    lw          t2, 4(a1)         // t2 = input_buf[1]
+    lw          t3, 8(a1)         // t3 = input_buf[2]
+    lw          t8, 56(sp)        // t8 = range_limit
+    addiu       s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
+    addiu       s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
+    addiu       s0, t0, 0x9916    // s0 = 0x8000
+    addiu       s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
+    xori        s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
+    srl         t0, a0, 1
+    sll         t4, a2, 2
+    lwx         s5, t4(t1)      // s5 = inptr0
+    lwx         s6, t4(t2)      // s6 = inptr1
+    lwx         s7, t4(t3)      // s7 = inptr2
+    lw          t7, 0(a3)       // t7 = outptr
+    blez        t0, 2f
+     addu       t9, s6, t0      // t9 = end address
+1:
+    lbu         t2, 0(s6)       // t2 = cb
+    lbu         t0, 0(s7)       // t0 = cr
+    lbu         t1, 0(s5)       // t1 = y
+    addiu       t2, t2, -128    // t2 = cb - 128
+    addiu       t0, t0, -128    // t0 = cr - 128
+    mult        $ac1, s4, t2
+    madd        $ac1, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
+    extr_r.w    t5, $ac1, 16
+    mulq_rs.w   t6, s2, t2      // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
+    addiu       s7, s7, 1
+    addiu       s6, s6, 1
+    addu        t2, t1, t0      // t2 = y + cred
+    addu        t3, t1, t5      // t3 = y + cgreen
+    addu        t4, t1, t6      // t4 = y + cblue
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t1, 1(s5)
+    lbu         v0, 0(t2)
+    lbu         v1, 0(t3)
+    lbu         ra, 0(t4)
+    addu        t2, t1, t0
+    addu        t3, t1, t5
+    addu        t4, t1, t6
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+    bne         t9, s6, 1b
+     addiu      s5, s5, 2
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     nop
+3:
+    lbu         t2, 0(s6)
+    lbu         t0, 0(s7)
+    lbu         t1, 0(s5)
+    addiu       t2, t2, -128    // (cb - 128)
+    addiu       t0, t0, -128    // (cr - 128)
+    mul         t3, s4, t2
+    mul         t4, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      // (C1*cr + ONE_HALF)>> SCALEBITS
+    mulq_rs.w   t6, s2, t2      // (C2*cb + ONE_HALF)>> SCALEBITS
+    addu        t3, t3, s0
+    addu        t3, t4, t3
+    sra         t5, t3, 16      // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
+    addu        t2, t1, t0      // y + cred
+    addu        t3, t1, t5      // y + cgreen
+    addu        t4, t1, t6      // y + cblue
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6  // NOTE(review): pix looks like bytes per output pixel pair (6 or 8) - confirm in macro body
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6  // NOTE(review): A1 = 3 equals B2 here; presumably A1/A2 are unused when pix is 6 - confirm
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/* colsum = 3 * inptr0 + inptr1; outputs are (3 * this + neighbour colsum + 8 or 7) >> 4.
+ * a0 = cinfo->max_v_samp_factor (output rows to write; two are produced per input row)
+ * a1 = downsampled_width (input columns; width 1 is not handled - TODO confirm callers pass >= 2)
+ * a2 = input_data (row pointers; entries at -4 and +4 bytes are read as the neighbour rows)
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li            s4, 0           // s4 = byte offset of the current output row pointer
+    lw            s2, 0(a3)       // s2 = *output_data_ptr
+0:
+    li            t9, 2           // t9 = output rows still to make from this input row
+    lw            s1, -4(a2)      // s1 = inptr1 (row pointer stored just before the current one)
+
+1:
+    lw            s0, 0(a2)       // s0 = inptr0
+    lwx           s3, s4(s2)      // s3 = outptr for this output row
+    addiu         s5, a1, -2      // s5 = downsampled_width - 2
+    srl           t4, s5, 1
+    sll           t4, t4, 1       // t4 = (width - 2) with the low bit cleared
+    lbu           t0, 0(s0)
+    lbu           t1, 1(s0)
+    lbu           t2, 0(s1)
+    lbu           t3, 1(s1)
+    addiu         s0, 2
+    addiu         s1, 2
+    addu          t8, s0, t4      // t8 = end address
+    andi          s5, s5, 1       // s5 = residual
+    sll           t4, t0, 1
+    sll           t6, t1, 1
+    addu          t0, t0, t4      // t0 = (*inptr0++) * 3
+    addu          t1, t1, t6      // t1 = (*inptr0++) * 3
+    addu          t7, t0, t2      // t7 = thiscolsum
+    addu          t6, t1, t3      // t6 = nextcolsum
+    sll           t0, t7, 2       // t0 = thiscolsum * 4
+    subu          t1, t0, t7      // t1 = thiscolsum * 3
+    shra_r.w      t0, t0, 4       // t0 = (thiscolsum * 4 + 8) >> 4
+    addiu         t1, 7
+    addu          t1, t1, t6
+    srl           t1, t1, 4       // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
+    sb            t0, 0(s3)
+    sb            t1, 1(s3)
+    beq           t8, s0, 22f     // no column pairs to process (width is 2 or 3)
+     addiu        s3, 2
+2:
+    lh            t0, 0(s0)       // t0 = A3|A2
+    lh            t2, 0(s1)       // t2 = B3|B2
+    addiu         s0, 2
+    addiu         s1, 2
+    preceu.ph.qbr t0, t0          // t0 = 0|A3|0|A2
+    preceu.ph.qbr t2, t2          // t2 = 0|B3|0|B2
+    shll.ph       t1, t0, 1
+    sll           t3, t6, 1
+    addu.ph       t0, t1, t0      // t0 = A3*3|A2*3
+    addu          t3, t3, t6      // t3 = this * 3
+    addu.ph       t0, t0, t2      // t0 = next2|next1
+    addu          t1, t3, t7      // t1 = this*3 + last
+    andi          t7, t0, 0xFFFF  // t7 = next1
+    sll           t2, t7, 1
+    addu          t2, t7, t2      // t2 = next1*3
+    addu          t4, t2, t6      // t4 = next1*3 + this (t6 still holds this)
+    srl           t6, t0, 16      // t6 = next2
+    shra_r.w      t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
+    addu          t0, t3, t7
+    addiu         t0, 7
+    srl           t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
+    shra_r.w      t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
+    addu          t2, t2, t6
+    addiu         t2, 7
+    srl           t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    sb            t4, 2(s3)
+    sb            t2, 3(s3)
+    bne           t8, s0, 2b      // on exit t7 = last colsum, t6 = this colsum
+     addiu        s3, 4
+22:
+    beqz          s5, 4f          // no odd column left over
+     addu         t8, s0, s5      // t8 = end address of the residual loop
+3:
+    lbu           t0, 0(s0)
+    lbu           t2, 0(s1)
+    addiu         s0, 1
+    addiu         s1, 1
+    sll           t3, t6, 1
+    sll           t1, t0, 1
+    addu          t1, t0, t1      // t1 = inptr0 * 3
+    addu          t3, t3, t6      // t3 = thiscolsum * 3
+    addu          t5, t1, t2      // t5 = nextcolsum
+    addu          t1, t3, t7      // t1 = thiscolsum * 3 + lastcolsum
+    shra_r.w      t1, t1, 4       // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
+    addu          t0, t3, t5
+    addiu         t0, 7
+    srl           t0, t0, 4       // t0 = (thiscolsum * 3 + nextcolsum + 7) >> 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         s3, 2
+    move          t7, t6          // lastcolsum = thiscolsum
+    bne           t8, s0, 3b
+     move         t6, t5          // thiscolsum = nextcolsum
+4:
+    sll           t0, t6, 2       // t0 = thiscolsum * 4
+    subu          t1, t0, t6      // t1 = thiscolsum * 3
+    addu          t1, t1, t7      // t1 = thiscolsum * 3 + lastcolsum
+    addiu         s4, 4           // step to the next output row pointer
+    shra_r.w      t1, t1, 4       // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
+    addiu         t0, 7
+    srl           t0, t0, 4       // t0 = (thiscolsum * 4 + 7) >> 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         t9, -1
+    addiu         s3, 2
+    bnez          t9, 1b          // second pass over the same input row
+     lw           s1, 4(a2)       // s1 = inptr1 (row pointer stored just after the current one)
+    srl           t0, s4, 2       // t0 = output rows written so far
+    subu          t0, a0, t0
+    bgtz          t0, 0b          // loop while rows written < max_v_samp_factor
+     addiu        a2, 4           // next input row pointer
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j             ra
+     nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/* Per input sample, writes (3 * cur + left + 1) >> 2 and (3 * cur + right + 2) >> 2; row ends are copied as is.
+ * a0 = cinfo->max_v_samp_factor (number of rows processed)
+ * a1 = downsampled_width (input columns per row)
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz          a0, 3f          // no rows requested
+     sll          t0, a0, 2       // t0 = bytes of row pointers to walk
+    lw            s1, 0(a3)       // s1 = cursor into the output row pointers
+    li            s3, 0x10001     // s3 = 1 in each halfword (bias for the first output of a pair)
+    addu          s0, s1, t0      // s0 = end of the output row pointers
+0:
+    addiu         t8, a1, -2      // t8 = downsampled_width - 2
+    srl           t9, t8, 2       // t9 = number of 4-column groups
+    lw            t7, 0(a2)       // t7 = inptr
+    lw            s2, 0(s1)       // s2 = outptr
+    lbu           t0, 0(t7)       // t0 = inptr[0]
+    lbu           t1, 1(t7)       // t1 = inptr[1]
+    sll           t2, t0, 1
+    addu          t2, t2, t0      // t2 = invalue*3
+    addu          t2, t2, t1
+    shra_r.w      t2, t2, 2       // t2 = (invalue*3 + inptr[1] + 2) >> 2
+    sb            t0, 0(s2)       // first output is the unfiltered first sample
+    sb            t2, 1(s2)
+    beqz          t9, 11f
+     addiu        s2, 2
+1:
+    ulw           t0, 0(t7)       // t0 = |P3|P2|P1|P0|
+    ulw           t1, 1(t7)       // t1 = |P4|P3|P2|P1|
+    ulh           t2, 4(t7)       // t2 = |x|x|P5|P4| (upper half is not used)
+    preceu.ph.qbl t3, t0          // t3 = |0|P3|0|P2|
+    preceu.ph.qbr t0, t0          // t0 = |0|P1|0|P0|
+    preceu.ph.qbr t2, t2          // t2 = |0|P5|0|P4|
+    preceu.ph.qbl t4, t1          // t4 = |0|P4|0|P3|
+    preceu.ph.qbr t1, t1          // t1 = |0|P2|0|P1|
+    shll.ph       t5, t4, 1
+    shll.ph       t6, t1, 1
+    addu.ph       t5, t5, t4      // t5 = |P4*3|P3*3|
+    addu.ph       t6, t6, t1      // t6 = |P2*3|P1*3|
+    addu.ph       t4, t3, s3      // t4 = |P3+1|P2+1|
+    addu.ph       t0, t0, s3      // t0 = |P1+1|P0+1|
+    addu.ph       t4, t4, t5      // t4 = |P4*3+P3+1|P3*3+P2+1|
+    addu.ph       t0, t0, t6      // t0 = |P2*3+P1+1|P1*3+P0+1|
+    shrl.ph       t4, t4, 2       // t4 = first outputs for P4 and P3
+    shrl.ph       t0, t0, 2       // t0 = first outputs for P2 and P1
+    addu.ph       t2, t2, t5      // t2 = |P4*3+P5|P3*3+P4|
+    addu.ph       t3, t3, t6      // t3 = |P2*3+P3|P1*3+P2|
+    shra_r.ph     t2, t2, 2       // t2 = second outputs for P4 and P3 (rounding adds 2)
+    shra_r.ph     t3, t3, 2       // t3 = second outputs for P2 and P1 (rounding adds 2)
+    shll.ph       t2, t2, 8       // move the second outputs into the upper byte of each halfword
+    shll.ph       t3, t3, 8
+    or            t2, t4, t2      // t2 = four output bytes for P3 and P4
+    or            t3, t3, t0      // t3 = four output bytes for P1 and P2
+    addiu         t9, -1
+    usw           t3, 0(s2)
+    usw           t2, 4(s2)
+    addiu         s2, 8
+    bgtz          t9, 1b
+     addiu        t7, 4
+11:
+    andi          t8, 3           // t8 = columns left after the 4-column groups
+    beqz          t8, 22f
+     addiu        t7, 1           // t7 -> next unprocessed column (executed on both paths)
+
+2:
+    lbu           t0, 0(t7)       // t0 = invalue
+    addiu         t7, 1
+    sll           t1, t0, 1
+    addu          t2, t0, t1      // t2 = invalue * 3
+    lbu           t3, -2(t7)      // t3 = left neighbour
+    lbu           t4, 0(t7)       // t4 = right neighbour
+    addiu         t3, 1
+    addiu         t4, 2
+    addu          t3, t3, t2
+    addu          t4, t4, t2
+    srl           t3, 2           // t3 = (invalue * 3 + left + 1) >> 2
+    srl           t4, 2           // t4 = (invalue * 3 + right + 2) >> 2
+    sb            t3, 0(s2)
+    sb            t4, 1(s2)
+    addiu         t8, -1
+    bgtz          t8, 2b
+     addiu        s2, 2
+
+22:
+    lbu           t0, 0(t7)       // t0 = last input sample
+    lbu           t2, -1(t7)      // t2 = its left neighbour
+    sll           t1, t0, 1
+    addu          t1, t1, t0      // t1 = invalue * 3
+    addu          t1, t1, t2
+    addiu         t1, 1
+    srl           t1, t1, 2       // t1 = (invalue * 3 + left + 1) >> 2
+    sb            t1, 0(s2)
+    sb            t0, 1(s2)       // last output is the unfiltered last sample
+    addiu         s1, 4           // next output row pointer
+    bne           s1, s0, 0b
+     addiu        a2, 4           // next input row pointer
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j             ra
+     nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/* Averages horizontal sample pairs of each row (bias alternates 0, 1), then pads the row to output_cols.
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor (number of rows processed)
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)      // s1 = output_data
+    lw          s0, 40(sp)      // s0 = input_data
+0:                              // per-row setup: s2, s4 and t8 are consumed by the loops below
+    srl         s2, a0, 2
+    andi        t9, a0, 2       // t9 != 0 when (image_width / 2) is odd
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       // t0 = width_in_blocks*DCT
+    srl         t7, t0, 1
+    subu        s2, t7, s2      // s2 = number of padding halfwords for this row
+    andi        t6, a0, 1       // t6 = temp_index
+    addiu       t6, -1          // t6 = 0 for odd widths, -1 for even widths
+    lw          t4, 0(s1)       // t4 = outptr
+    lw          t5, 0(s0)       // t5 = inptr0
+    li          s3, 0           // s3 = bias
+    srl         t7, a0, 1       // t7 = image_width1
+    srl         s4, t7, 2       // s4 = number of 4-output groups
+    beqz        s4, 11f         // fewer than 4 outputs: the do-while loop below must not run
+     andi       t8, t7, 3       // t8 = outputs left for the residual loop
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0          // sum of each input pair
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1       // even outputs: bias 0
+    shra_r.ph   t1, t1, 1       // odd outputs: bias 1 (rounding shift)
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+11:
+    beqz        t8, 3f
+     addu       s4, t4, t8      // s4 = end address of the residual loop
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3      // t0 = (pair sum + bias) >> 1
+    xori        s3, s3, 1       // toggle bias
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)      // t1 = last real sample of the row
+    sll         t1, 1
+    addqh.w     t2, t1, s3      // t2 = pixval1
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3      // t3 = pixval2
+    blez        s2, 5f
+     append     t3, t2,  8      // t3 = pixval2:pixval1 packed for a halfword store
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     addiu      s1, 4           // next output row pointer (executed on both paths)
+    sb          t2, 0(t4)       // one extra padding byte when (image_width / 2) is odd
+6:
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4           // next input row pointer
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+     nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/* Averages each 2x2 block of two input rows (bias alternates 1, 2), then pads the row to output_cols.
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor (number of output rows)
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz        a2, 8f
+     lw         s1, 52(sp)      // s1 = output_data
+    lw          s0, 48(sp)      // s0 = input_data
+0:                              // per-row setup: s2, s4 and t8 are consumed by the loops below
+    andi        t6, a0, 1       // t6 = temp_index
+    addiu       t6, -1          // t6 = 0 for odd widths, -1 for even widths
+    srl         t7, a0, 1       // t7 = image_width1
+    srl         s4, t7, 2       // s4 = number of 4-output groups
+    andi        t8, t7, 3       // t8 = outputs left for the residual loop
+    andi        t9, a0, 2       // t9 != 0 when (image_width / 2) is odd
+    srl         s2, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       // t0 = width_in_blocks*DCT
+    srl         t7, t0, 1
+    subu        s2, t7, s2      // s2 = number of padding halfwords for this row
+    lw          t4, 0(s1)       // t4 = outptr
+    lw          t5, 0(s0)       // t5 = inptr0
+    lw          s7, 4(s0)       // s7 = inptr1
+    beqz        s4, 22f         // fewer than 4 outputs: the do-while loop below must not run
+     li         s6, 1           // s6 = bias
+2:
+    ulw         t0, 0(t5)       // t0 = |P3|P2|P1|P0|
+    ulw         t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
+    ulw         t2, 4(t5)
+    ulw         t3, 4(s7)
+    precrq.ph.w t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
+    ins         t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
+    raddu.w.qb  t1, t7          // t1 = P2 + P3 + Q2 + Q3
+    raddu.w.qb  t0, t0          // t0 = P0 + P1 + Q0 + Q1
+    shra_r.w    t1, t1, 2       // odd output: bias 2 (rounding shift)
+    addiu       t0, 1           // even output: bias 1
+    srl         t0, 2
+    precrq.ph.w t7, t2, t3
+    ins         t2, t3, 16, 16
+    raddu.w.qb  t7, t7
+    raddu.w.qb  t2, t2
+    shra_r.w    t7, t7, 2
+    addiu       t2, 1
+    srl         t2, 2
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t7, 3(t4)
+    addiu       t4, 4
+    addiu       t5, 8
+    addiu       s4, s4, -1
+    bgtz        s4, 2b
+     addiu      s7, 8
+22:
+    beqz        t8, 4f
+     addu       t8, t4, t8      // t8 = end address of the residual loop
+3:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 0(s7)
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t0, t0          // t0 = sum of the 2x2 block
+    addu        t0, t0, s6
+    srl         t0, 2
+    xori        s6, s6, 3       // toggle bias between 1 and 2
+    sb          t0, 0(t4)
+    addiu       t5, 2
+    addiu       t4, 1
+    bne         t8, t4, 3b
+     addiu      s7, 2
+4:
+    lbux        t1, t6(t5)      // last real sample of row 0
+    sll         t1, 1
+    lbux        t0, t6(s7)      // last real sample of row 1
+    sll         t0, 1
+    addu        t1, t1, t0
+    addu        t3, t1, s6
+    srl         t0, t3, 2       // t0 = pixval1
+    xori        s6, s6, 3
+    addu        t2, t1, s6
+    srl         t1, t2, 2       // t1 = pixval2
+    blez        s2, 6f
+     append     t1, t0, 8       // t1 = pixval2:pixval1 packed for a halfword store
+5:
+    ush         t1, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 5b
+     addiu      t4, 2
+6:
+    beqz        t9, 7f
+     addiu      s1, 4           // next output row pointer (executed on both paths)
+    sb          t0, 0(t4)       // one extra padding byte when (image_width / 2) is odd
+7:
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 8           // skip two input row pointers
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/* Each output is (membersum * t6 + neighsum * t7 + 32768) >> 16 over a 2x2 block and its neighbours.
+ * a0     = input_data
+ * a1     = output_data
+ * a2     = compptr->v_samp_factor (number of output rows; two input rows are consumed per output row)
+ * a3     = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s7, 52(sp)      // compptr->width_in_blocks
+    lw          s0, 56(sp)      // cinfo->image_width
+    lw          s6, 48(sp)      // cinfo->smoothing_factor
+    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
+    sll         v0, s7, 1
+    subu        v0, v0, s0      // v0 = output_cols * 2 - image_width = columns to pad
+    blez        v0, 2f
+     move       v1, zero        // v1 = row counter for the padding pass
+    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
+0:
+    addiu       t1, a0, -4
+    sll         t2, v1, 2
+    lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
+    move        t3, v0
+    addu        t1, t1, s0      // t1 -> first column past image_width
+    lbu         t2, -1(t1)      // t2 = last real sample of the row
+1:
+    addiu       t3, t3, -1
+    sb          t2, 0(t1)       // replicate it into the padding columns
+    bgtz        t3, 1b
+     addiu      t1, t1, 1
+    addiu       v1, v1, 1
+    bne         v1, t0, 0b
+     nop
+2:
+    li          v0, 80
+    mul         v0, s6, v0
+    li          v1, 16384
+    move        t4, zero        // t4 = outrow
+    move        t5, zero        // t5 = inrow
+    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
+    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+    sll         v0, t4, 2
+    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
+    sll         v1, t5, 2
+    addiu       t9, v1, 4
+    addiu       s0, v1, -4
+    addiu       s1, v1, 8
+    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
+    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
+    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
+    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0          // t2 = membersum (2x2 block of inptr0/inptr1)
+    raddu.w.qb  s3, t0          // s3 = above[0] + above[1] + below[0] + below[1]
+    lbu         v0, 0(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, 0(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6    // ac1 = membersum * t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, 0(s0)
+    lbu         t0, 0(s1)
+    sll         s3, s3, 1       // edge neighbours count twice
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0      // corner neighbours count once
+    addu        s3, t0, s3      // s3 = neighsum
+    madd        $ac1, s3, t7    // ac1 += neighsum * t7
+    extr_r.w    v0, $ac1, 16    // v0 = (ac1 + 32768) >> 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    addiu       s1, s1, 2
+    sb          v0, -1(t8)
+    addiu       s4, s7, -2
+    and         s4, s4, 3       // s4 = (output_cols - 2) & 3 columns for the single-column loop
+    addu        s5, s4, t8      // end address
+4:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    sb          t2, -1(t8)
+    bne         s5, t8, 4b
+     addiu      s1, s1, 2
+    addiu       s5, s7, -2
+    subu        s5, s5, s4      // remaining interior columns
+    addu        s5, s5, t8      // end address
+5:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    lh          v1, 2(t9)
+    addu        t0, t0, v0
+    lh          v0, 2(s2)
+    addu        s3, t0, s3
+    lh          t0, 2(s0)
+    lh          t1, 2(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 4(s2)
+    lbu         t0, 1(t9)
+    lbu         t1, 4(t9)
+    sb          t2, 0(t8)       // first output of this group of four
+    raddu.w.qb  t3, v0
+    lbu         v0, 1(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 4(s0)
+    addu        t0, t0, v0
+    lbu         v0, 1(s0)
+    addu        s3, t0, s3
+    lbu         t0, 1(s1)
+    lbu         t3, 4(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 4(t9)
+    addu        t0, t0, v0
+    lh          v0, 4(s2)
+    addu        s3, t0, s3
+    lh          t0, 4(s0)
+    lh          t1, 4(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 6(s2)
+    lbu         t0, 3(t9)
+    lbu         t1, 6(t9)
+    sb          t2, 1(t8)       // second output
+    raddu.w.qb  t3, v0
+    lbu         v0, 3(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 6(s0)
+    addu        t0, t0, v0
+    lbu         v0, 3(s0)
+    addu        s3, t0, s3
+    lbu         t0, 3(s1)
+    lbu         t3, 6(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 6(t9)
+    addu        t0, t0, v0
+    lh          v0, 6(s2)
+    addu        s3, t0, s3
+    lh          t0, 6(s0)
+    lh          t1, 6(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t3, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 8(s2)
+    lbu         t0, 5(t9)
+    lbu         t1, 8(t9)
+    sb          t3, 2(t8)       // third output
+    raddu.w.qb  t2, v0
+    lbu         v0, 5(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t2, t6
+    addu        v0, v0, v1
+    lbu         t2, 8(s0)
+    addu        t0, t0, v0
+    lbu         v0, 5(s0)
+    addu        s3, t0, s3
+    lbu         t0, 5(s1)
+    lbu         t3, 8(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    addiu       t8, t8, 4
+    addu        t0, t0, v0
+    addiu       s2, s2, 8
+    addu        s3, t0, s3
+    addiu       t9, t9, 8
+    madd        $ac1, s3, t7
+    extr_r.w    t1, $ac1, 16
+    addiu       s0, s0, 8
+    addiu       s1, s1, 8
+    bne         s5, t8, 5b
+     sb         t1, -1(t8)      // fourth output (t8 was already advanced by 4)
+/* Special case for last column */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 1(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 1(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 1(s0)
+    addu        t0, t0, v0
+    lbu         t3, 1(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t0, $ac1, 16
+    addiu       t5, t5, 2       // inrow += 2 (the only increment; two input rows per output row)
+    sb          t0, 0(t8)
+    addiu       t4, t4, 1       // outrow++
+    bne         t4, a2, 3b
+     nop
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/* Replicates each input sample h_expand times into one output row, then copies that row v_expand - 1 times.
+ * a0     = upsample->h_expand[compptr->component_index]
+ * a1     = upsample->v_expand[compptr->component_index]
+ * a2     = input_data
+ * a3     = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    lw          s0, 0(a3)       // s0 = cursor into output_data, always at row pointer [outrow]
+    lw          s1, 32(sp)      // s1 = cinfo->output_width
+    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
+    li          t6, 0           // t6 = inrow * 4 (byte offset into input_data)
+    beqz        s2, 10f
+     li         s3, 0           // s3 = outrow
+0:
+    addu        t0, a2, t6      // t0 = address of input_data[inrow]
+    lw          t3, 0(t0)       // t3 = inptr
+    lw          t8, 0(s0)       // t8 = outptr
+    beqz        s1, 4f
+     addu       t5, t8, s1      // t5 = outend
+1:
+    lb          t2, 0(t3)       // t2 = invalue = *inptr++
+    addiu       t3, 1
+    beqz        a0, 3f
+     move       t0, a0          // t0 = h_expand
+2:
+    sb          t2, 0(t8)
+    addiu       t0, -1
+    bgtz        t0, 2b
+     addiu      t8, 1
+3:
+    bgt         t5, t8, 1b
+     nop
+4:
+    addiu       t9, a1, -1      // t9 = v_expand - 1
+    blez        t9, 9f
+     nop
+5:
+    lw          t3, 0(s0)       // t3 = source row (most recently filled)
+    lw          t4, 4(s0)       // t4 = destination row (the next one)
+    subu        t0, s1, 0xF
+    blez        t0, 7f          // rows shorter than 16 bytes use the byte loop only
+     addu       t5, t3, s1      // t5 = end address
+    andi        t7, s1, 0xF     // t7 = residual
+    subu        t8, t5, t7      // t8 = end of the 16-byte chunks
+6:
+    ulw         t0, 0(t3)
+    ulw         t1, 4(t3)
+    ulw         t2, 8(t3)
+    usw         t0, 0(t4)
+    ulw         t0, 12(t3)
+    usw         t1, 4(t4)
+    usw         t2, 8(t4)
+    usw         t0, 12(t4)
+    addiu       t3, 16
+    bne         t3, t8, 6b
+     addiu      t4, 16
+    beqz        t7, 8f
+     nop
+7:
+    lbu         t0, 0(t3)
+    sb          t0, 0(t4)
+    addiu       t3, 1
+    bne         t3, t5, 7b
+     addiu      t4, 1
+8:
+    addiu       t9, -1
+    bgtz        t9, 5b
+     addiu      s0, 4           // the row just written is the source of the next copy
+9:
+    addiu       s0, 4           // step past the last row of this group
+    addu        s3, s3, a1      // outrow += v_expand
+    bne         s3, s2, 0b
+     addiu      t6, 4           // inrow++ (one row pointer)
+10:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/* Writes every input sample twice in a row (2:1 horizontal replication), one output row per input row.
+ * a0 = cinfo->max_v_samp_factor (number of rows processed)
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)       // t7 = output_data
+    andi        t8, a1, 0xf     // t8 = residual
+    sll         t0, a0, 2
+    blez        a0, 4f
+     addu       t9, t7, t0      // t9 = output_data end address
+0:
+    lw          t5, 0(t7)       // t5 = outptr
+    lw          t6, 0(a2)       // t6 = inptr
+    addu        t3, t5, a1      // t3 = outptr + output_width (end address)
+    subu        t3, t8          // t3 = end address - residual
+    beq         t5, t3, 2f      // fewer than 16 output bytes: residual loop only
+     move       t4, t8
+1:
+    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
+    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
+    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
+    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
+    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
+    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
+    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t0, t2, 16      // t0 = |X|X|P7|P6|
+    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
+    ins         t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
+    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
+    ins         t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t5, 16          // 16 output bytes per pass
+    bne         t5, t3, 1b
+     addiu      t6, 8           // 8 input bytes per pass
+    beqz        t8, 3f
+     move       t4, t8          // t4 = residual output bytes still to write
+2:
+    lbu         t1, 0(t6)
+    sb          t1, 0(t5)
+    sb          t1, 1(t5)       // NOTE: two bytes per pass, so an odd residual stores one byte past output_width
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    addiu       t7, 4           // next output row pointer
+    bne         t9, t7, 0b
+     addiu      a2, 4           // next input row pointer
+4:
+    j           ra
+     nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/* Writes every input sample twice in a row, then copies the finished output row into the next one.
+ * a0 = cinfo->max_v_samp_factor (output rows; consumed two at a time)
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)       // t7 = cursor into the output row pointers
+    blez        a0, 7f
+     andi       t9, a1, 0xf     // t9 = residual
+0:
+    lw          t6, 0(a2)       // t6 = inptr
+    lw          t5, 0(t7)       // t5 = outptr
+    addu        t8, t5, a1      // t8 = outptr end address
+    subu        t8, t9          // t8 = end address - residual
+    beq         t5, t8, 2f      // fewer than 16 output bytes: residual loop only
+     move       t4, t9
+1:
+    ulw         t0, 0(t6)       // t0 = |P3|P2|P1|P0|
+    srl         t1, t0, 16      // t1 = |X|X|P3|P2|
+    ins         t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
+    ins         t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
+    ins         t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
+    ins         t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
+    ulw         t2, 4(t6)       // t2 = |P7|P6|P5|P4|
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t3, t2, 16      // t3 = |X|X|P7|P6|
+    ins         t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
+    ins         t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
+    ins         t3, t3, 16, 16  // t3 = |P7|P6|P7|P6|
+    ins         t3, t3, 8, 16   // t3 = |P7|P7|P6|P6|
+    usw         t2, 8(t5)
+    usw         t3, 12(t5)
+    addiu       t5, 16          // 16 output bytes per pass
+    bne         t5, t8, 1b
+     addiu      t6, 8           // 8 input bytes per pass
+    beqz        t9, 3f
+     move       t4, t9          // t4 = residual output bytes still to write
+2:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    sb          t0, 1(t5)       // NOTE: two bytes per pass, so an odd residual stores one byte past output_width
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    lw          t6, 0(t7)       // t6 = outptr[0]
+    lw          t5, 4(t7)       // t5 = outptr[1]
+    addu        t4, t6, a1      // t4 = new end address
+    beq         a1, t9, 5f      // fewer than 16 bytes to copy: byte loop only
+     subu       t8, t4, t9      // t8 = end of the 16-byte chunks
+4:
+    ulw         t0, 0(t6)
+    ulw         t1, 4(t6)
+    ulw         t2, 8(t6)
+    usw         t0, 0(t5)
+    ulw         t0, 12(t6)
+    usw         t1, 4(t5)
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t6, 16
+    bne         t6, t8, 4b
+     addiu      t5, 16
+    beqz        t9, 6f
+     nop
+5:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    addiu       t6, 1
+    bne         t6, t4, 5b
+     addiu      t5, 1
+6:
+    addiu       t7, 8           // skip the two output rows just written
+    addiu       a0, -2
+    bgtz        a0, 0b
+     addiu      a2, 4           // next input row pointer
+7:
+    j           ra
+     nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -256
+    move        v0, sp
+    addiu       v1, zero, 8     // v1 = DCTSIZE = 8
+1:
+    lh          s4, 32(a0)      // s4 = inptr[16]
+    lh          s5, 64(a0)      // s5 = inptr[32]
+    lh          s6, 96(a0)      // s6 = inptr[48]
+    lh          t1, 112(a0)     // t1 = inptr[56]
+    lh          t7, 16(a0)      // t7 = inptr[8]
+    lh          t5, 80(a0)      // t5 = inptr[40]
+    lh          t3, 48(a0)      // t3 = inptr[24]
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, t5
+    or          s4, s4, t7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 2f
+     addiu      v1, v1, -1
+    lh          s5, 0(a1)       // quantptr[DCTSIZE*0]
+    lh          s6, 0(a0)       // inptr[DCTSIZE*0]
+    mul         s5, s5, s6      // DEQUANTIZE(inptr[0], quantptr[0])
+    sll         s5, s5, 2
+    sw          s5, 0(v0)
+    sw          s5, 32(v0)
+    sw          s5, 64(v0)
+    sw          s5, 96(v0)
+    sw          s5, 128(v0)
+    sw          s5, 160(v0)
+    sw          s5, 192(v0)
+    b           3f
+     sw         s5, 224(v0)
+2:
+    lh          t0, 112(a1)
+    lh          t2, 48(a1)
+    lh          t4, 80(a1)
+    lh          t6, 16(a1)
+    mul         t0, t0, t1      // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
+    mul         t1, t2, t3      // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
+    mul         t2, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
+    mul         t3, t6, t7      // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
+    lh          t4, 32(a1)
+    lh          t5, 32(a0)
+    lh          t6, 96(a1)
+    lh          t7, 96(a0)
+    addu        s0, t0, t1       // z3 = tmp0 + tmp2
+    addu        s1, t1, t2       // z2 = tmp1 + tmp2
+    addu        s2, t2, t3       // z4 = tmp1 + tmp3
+    addu        s3, s0, s2       // z3 + z4
+    addiu       t9, zero, 9633   // FIX_1_175875602
+    mul         s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+    addu        t8, t0, t3       // z1 = tmp0 + tmp3
+    addiu       t9, zero, 2446   // FIX_0_298631336
+    mul         t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+    addiu       t9, zero, 16819  // FIX_2_053119869
+    mul         t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+    addiu       t9, zero, 25172  // FIX_3_072711026
+    mul         t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+    addiu       t9, zero, 12299  // FIX_1_501321110
+    mul         t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+    addiu       t9, zero, 16069  // FIX_1_961570560
+    mul         s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
+    addiu       t9, zero, 3196   // FIX_0_390180644
+    mul         s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
+    addiu       t9, zero, 7373   // FIX_0_899976223
+    mul         t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
+    addiu       t9, zero, 20995  // FIX_2_562915447
+    mul         s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
+    subu        s0, s3, s0       // z3 += z5
+    addu        t0, t0, s0       // tmp0 += z3
+    addu        t1, t1, s0       // tmp2 += z3
+    subu        s2, s3, s2       // z4 += z5
+    addu        t2, t2, s2       // tmp1 += z4
+    addu        t3, t3, s2       // tmp3 += z4
+    subu        t0, t0, t8       // tmp0 += z1
+    subu        t1, t1, s1       // tmp2 += z2
+    subu        t2, t2, s1       // tmp1 += z2
+    subu        t3, t3, t8       // tmp3 += z1
+    mul         s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
+    addiu       t9, zero, 6270   // FIX_0_765366865
+    mul         s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
+    lh          t4, 0(a1)
+    lh          t5, 0(a0)
+    lh          t6, 64(a1)
+    lh          t7, 64(a0)
+    mul         s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
+    mul         t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
+    mul         t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
+    addiu       t9, zero, 4433   // FIX_0_541196100
+    addu        s3, s0, s1       // z2 + z3
+    mul         s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+    addiu       t9, zero, 15137  // FIX_1_847759065
+    mul         t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
+    addu        t4, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 13      // tmp0 = (z2 + z3) << CONST_BITS
+    sll         t5, t5, 13      // tmp1 = (z2 - z3) << CONST_BITS
+    addu        t7, s3, s2      // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+    subu        t6, s3, t8      // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
+    addu        s0, t4, t7
+    subu        s1, t4, t7
+    addu        s2, t5, t6
+    subu        s3, t5, t6
+    addu        t4, s0, t3
+    subu        s0, s0, t3
+    addu        t3, s2, t1
+    subu        s2, s2, t1
+    addu        t1, s3, t2
+    subu        s3, s3, t2
+    addu        t2, s1, t0
+    subu        s1, s1, t0
+    shra_r.w    t4, t4, 11
+    shra_r.w    t3, t3, 11
+    shra_r.w    t1, t1, 11
+    shra_r.w    t2, t2, 11
+    shra_r.w    s1, s1, 11
+    shra_r.w    s3, s3, 11
+    shra_r.w    s2, s2, 11
+    shra_r.w    s0, s0, 11
+    sw          t4, 0(v0)
+    sw          t3, 32(v0)
+    sw          t1, 64(v0)
+    sw          t2, 96(v0)
+    sw          s1, 128(v0)
+    sw          s3, 160(v0)
+    sw          s2, 192(v0)
+    sw          s0, 224(v0)
+3:
+    addiu       a1, a1, 2
+    addiu       a0, a0, 2
+    bgtz        v1, 1b
+     addiu      v0, v0, 4
+    move        v0, sp
+    addiu       v1, zero, 8
+4:
+    lw          t0, 8(v0)       // z2 = (JLONG)wsptr[2]
+    lw          t1, 24(v0)      // z3 = (JLONG)wsptr[6]
+    lw          t2, 0(v0)       // (JLONG)wsptr[0]
+    lw          t3, 16(v0)      // (JLONG)wsptr[4]
+    lw          s4, 4(v0)       // (JLONG)wsptr[1]
+    lw          s5, 12(v0)      // (JLONG)wsptr[3]
+    lw          s6, 20(v0)      // (JLONG)wsptr[5]
+    lw          s7, 28(v0)      // (JLONG)wsptr[7]
+    or          s4, s4, t0
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, s7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 5f
+     addiu      v1, v1, -1
+    shra_r.w    s5, t2, 5
+    andi        s5, s5, 0x3ff
+    lbux        s5, s5(a3)
+    lw          s1, 0(a2)
+    replv.qb    s5, s5
+    usw         s5, 0(s1)
+    usw         s5, 4(s1)
+    b           6f
+     nop
+5:
+    addu        t4, t0, t1       // z2 + z3
+    addiu       t8, zero, 4433   // FIX_0_541196100
+    mul         t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
+    addiu       t8, zero, 15137  // FIX_1_847759065
+    mul         t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
+    addiu       t8, zero, 6270   // FIX_0_765366865
+    mul         t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
+    addu        t4, t2, t3       // (JLONG)wsptr[0] + (JLONG)wsptr[4]
+    subu        t2, t2, t3       // (JLONG)wsptr[0] - (JLONG)wsptr[4]
+    sll         t4, t4, 13       // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
+    sll         t2, t2, 13       // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
+    subu        t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
+    subu        t3, t2, t1       // tmp12 = tmp1 - tmp2
+    addu        t2, t2, t1       // tmp11 = tmp1 + tmp2
+    addu        t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
+    subu        t1, t4, t5       // tmp13 = tmp0 - tmp3
+    addu        t0, t4, t5       // tmp10 = tmp0 + tmp3
+    lw          t4, 28(v0)       // tmp0 = (JLONG)wsptr[7]
+    lw          t6, 12(v0)       // tmp2 = (JLONG)wsptr[3]
+    lw          t5, 20(v0)       // tmp1 = (JLONG)wsptr[5]
+    lw          t7, 4(v0)        // tmp3 = (JLONG)wsptr[1]
+    addu        s0, t4, t6       // z3 = tmp0 + tmp2
+    addiu       t8, zero, 9633   // FIX_1_175875602
+    addu        s1, t5, t7       // z4 = tmp1 + tmp3
+    addu        s2, s0, s1       // z3 + z4
+    mul         s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
+    addu        s3, t4, t7       // z1 = tmp0 + tmp3
+    addu        t9, t5, t6       // z2 = tmp1 + tmp2
+    addiu       t8, zero, 16069  // FIX_1_961570560
+    mul         s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
+    addiu       t8, zero, 3196   // FIX_0_390180644
+    mul         s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
+    addiu       t8, zero, 2446   // FIX_0_298631336
+    mul         t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
+    addiu       t8, zero, 7373   // FIX_0_899976223
+    mul         s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
+    addiu       t8, zero, 16819  // FIX_2_053119869
+    mul         t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
+    addiu       t8, zero, 20995  // FIX_2_562915447
+    mul         t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
+    addiu       t8, zero, 25172  // FIX_3_072711026
+    mul         t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
+    addiu       t8, zero, 12299  // FIX_1_501321110
+    mul         t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
+    subu        s0, s2, s0       // z3 += z5
+    subu        s1, s2, s1       // z4 += z5
+    addu        t4, t4, s0
+    subu        t4, t4, s3      // tmp0
+    addu        t5, t5, s1
+    subu        t5, t5, t9      // tmp1
+    addu        t6, t6, s0
+    subu        t6, t6, t9      // tmp2
+    addu        t7, t7, s1
+    subu        t7, t7, s3      // tmp3
+    addu        s0, t0, t7
+    subu        t0, t0, t7
+    addu        t7, t2, t6
+    subu        t2, t2, t6
+    addu        t6, t3, t5
+    subu        t3, t3, t5
+    addu        t5, t1, t4
+    subu        t1, t1, t4
+    shra_r.w    s0, s0, 18
+    shra_r.w    t7, t7, 18
+    shra_r.w    t6, t6, 18
+    shra_r.w    t5, t5, 18
+    shra_r.w    t1, t1, 18
+    shra_r.w    t3, t3, 18
+    shra_r.w    t2, t2, 18
+    shra_r.w    t0, t0, 18
+    andi        s0, s0, 0x3ff
+    andi        t7, t7, 0x3ff
+    andi        t6, t6, 0x3ff
+    andi        t5, t5, 0x3ff
+    andi        t1, t1, 0x3ff
+    andi        t3, t3, 0x3ff
+    andi        t2, t2, 0x3ff
+    andi        t0, t0, 0x3ff
+    lw          s1, 0(a2)
+    lbux        s0, s0(a3)
+    lbux        t7, t7(a3)
+    lbux        t6, t6(a3)
+    lbux        t5, t5(a3)
+    lbux        t1, t1(a3)
+    lbux        t3, t3(a3)
+    lbux        t2, t2(a3)
+    lbux        t0, t0(a3)
+    sb          s0, 0(s1)
+    sb          t7, 1(s1)
+    sb          t6, 2(s1)
+    sb          t5, 3(s1)
+    sb          t1, 4(s1)
+    sb          t3, 5(s1)
+    sb          t2, 6(s1)
+    sb          t0, 7(s1)
+6:
+    addiu       v0, v0, 32
+    bgtz        v1, 4b
+     addiu      a2, a2, 4
+    addiu       sp, sp, 256
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/* Fast-IDCT column pass; each iteration handles two columns packed per word.
+ * a0 = inptr      (advanced 4 bytes per iteration; loop ends at inptr + 16)
+ * a1 = quantptr   (advanced in step with inptr)
+ * a2 = wsptr      (advanced 4 bytes per iteration)
+ * a3 = mips_idct_ifast_coefs  (copied to AT and read at offsets 0..12)
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu         t9, a0, 16      // end address
+    or            AT, a3, zero    // AT = a3 (coefficient table base)
+
+0:
+    lw            s0, 0(a1)       // quantptr[DCTSIZE*0]
+    lw            t0, 0(a0)       // inptr[DCTSIZE*0]
+    lw            t1, 16(a0)      // inptr[DCTSIZE*1]
+    muleq_s.w.phl v0, t0, s0      // tmp0 ...
+    lw            t2, 32(a0)      // inptr[DCTSIZE*2]
+    lw            t3, 48(a0)      // inptr[DCTSIZE*3]
+    lw            t4, 64(a0)      // inptr[DCTSIZE*4]
+    lw            t5, 80(a0)      // inptr[DCTSIZE*5]
+    muleq_s.w.phr t0, t0, s0      // ... tmp0 ...
+    lw            t6, 96(a0)      // inptr[DCTSIZE*6]
+    lw            t7, 112(a0)     // inptr[DCTSIZE*7]
+    or            s4, t1, t2      // rows 1|2 of both columns
+    or            s5, t3, t4      // rows 3|4 of both columns
+    bnez          s4, 1f          // any nonzero AC term -> full IDCT
+     ins          t0, v0, 16, 16  // ... tmp0
+    bnez          s5, 1f
+     or           s6, t5, t6      // rows 5|6 of both columns
+    or            s6, s6, t7      // ... |7
+    bnez          s6, 1f
+     sw           t0, 0(a2)       // wsptr[DCTSIZE*0]
+    sw            t0, 16(a2)      // wsptr[DCTSIZE*1]
+    sw            t0, 32(a2)      // wsptr[DCTSIZE*2]
+    sw            t0, 48(a2)      // wsptr[DCTSIZE*3]
+    sw            t0, 64(a2)      // wsptr[DCTSIZE*4]
+    sw            t0, 80(a2)      // wsptr[DCTSIZE*5]
+    sw            t0, 96(a2)      // wsptr[DCTSIZE*6]
+    sw            t0, 112(a2)     // wsptr[DCTSIZE*7]
+    addiu         a0, a0, 4       // next pair of columns
+    b             2f
+     addiu        a1, a1, 4       // (delay slot) next pair of quant values
+
+1:
+    lw            s1, 32(a1)      // quantptr[DCTSIZE*2]
+    lw            s2, 64(a1)      // quantptr[DCTSIZE*4]
+    muleq_s.w.phl v0, t2, s1      // tmp1 ...
+    muleq_s.w.phr t2, t2, s1      // ... tmp1 ...
+    lw            s0, 16(a1)      // quantptr[DCTSIZE*1]
+    lw            s1, 48(a1)      // quantptr[DCTSIZE*3]
+    lw            s3, 96(a1)      // quantptr[DCTSIZE*6]
+    muleq_s.w.phl v1, t4, s2      // tmp2 ...
+    muleq_s.w.phr t4, t4, s2      // ... tmp2 ...
+    lw            s2, 80(a1)      // quantptr[DCTSIZE*5]
+    lw            t8, 4(AT)       // FIX(1.414213562)
+    ins           t2, v0, 16, 16  // ... tmp1
+    muleq_s.w.phl v0, t6, s3      // tmp3 ...
+    muleq_s.w.phr t6, t6, s3      // ... tmp3 ...
+    ins           t4, v1, 16, 16  // ... tmp2
+    addq.ph       s4, t0, t4      // tmp10
+    subq.ph       s5, t0, t4      // tmp11
+    ins           t6, v0, 16, 16  // ... tmp3
+    subq.ph       s6, t2, t6      // tmp12 ...
+    addq.ph       s7, t2, t6      // tmp13
+    mulq_s.ph     s6, s6, t8      // ... tmp12 ...
+    addq.ph       t0, s4, s7      // tmp0
+    subq.ph       t6, s4, s7      // tmp3
+    muleq_s.w.phl v0, t1, s0      // tmp4 ...
+    muleq_s.w.phr t1, t1, s0      // ... tmp4 ...
+    shll_s.ph     s6, s6, 1       // x2
+    lw            s3, 112(a1)     // quantptr[DCTSIZE*7]
+    subq.ph       s6, s6, s7      // ... tmp12
+    muleq_s.w.phl v1, t7, s3      // tmp7 ...
+    muleq_s.w.phr t7, t7, s3      // ... tmp7 ...
+    ins           t1, v0, 16, 16  // ... tmp4
+    addq.ph       t2, s5, s6      // tmp1
+    subq.ph       t4, s5, s6      // tmp2
+    muleq_s.w.phl v0, t5, s2      // tmp6 ...
+    muleq_s.w.phr t5, t5, s2      // ... tmp6 ...
+    ins           t7, v1, 16, 16  // ... tmp7
+    addq.ph       s5, t1, t7      // z11
+    subq.ph       s6, t1, t7      // z12
+    muleq_s.w.phl v1, t3, s1      // tmp5 ...
+    muleq_s.w.phr t3, t3, s1      // ... tmp5 ...
+    ins           t5, v0, 16, 16  // ... tmp6
+    ins           t3, v1, 16, 16  // ... tmp5
+    addq.ph       s7, t5, t3      // z13
+    subq.ph       v0, t5, t3      // z10
+    addq.ph       t7, s5, s7      // tmp7
+    subq.ph       s5, s5, s7      // tmp11 ...
+    addq.ph       v1, v0, s6      // z5 ...
+    mulq_s.ph     s5, s5, t8      // ... tmp11
+    lw            t8, 8(AT)       // FIX(1.847759065)
+    lw            s4, 0(AT)       // FIX(1.082392200)
+    addq.ph       s0, t0, t7      // tmp0 + tmp7
+    subq.ph       s1, t0, t7      // tmp0 - tmp7
+    mulq_s.ph     v1, v1, t8      // ... z5
+    shll_s.ph     s5, s5, 1       // x2
+    lw            t8, 12(AT)      // FIX(-2.613125930)
+    sw            s0, 0(a2)       // wsptr[DCTSIZE*0]
+    shll_s.ph     v0, v0, 1       // x4
+    mulq_s.ph     v0, v0, t8      // tmp12 ...
+    mulq_s.ph     s4, s6, s4      // tmp10 ...
+    shll_s.ph     v1, v1, 1       // x2
+    addiu         a0, a0, 4       // next pair of columns
+    addiu         a1, a1, 4       // next pair of quant values
+    sw            s1, 112(a2)     // wsptr[DCTSIZE*7]
+    shll_s.ph     s6, v0, 1       // x4
+    shll_s.ph     s4, s4, 1       // x2
+    addq.ph       s6, s6, v1      // ... tmp12
+    subq.ph       t5, s6, t7      // tmp6
+    subq.ph       s4, s4, v1      // ... tmp10
+    subq.ph       t3, s5, t5      // tmp5
+    addq.ph       s2, t2, t5      // tmp1 + tmp6
+    addq.ph       t1, s4, t3      // tmp4
+    subq.ph       s3, t2, t5      // tmp1 - tmp6
+    sw            s2, 16(a2)      // wsptr[DCTSIZE*1]
+    sw            s3, 96(a2)      // wsptr[DCTSIZE*6]
+    addq.ph       v0, t4, t3      // tmp2 + tmp5
+    subq.ph       v1, t4, t3      // tmp2 - tmp5
+    sw            v0, 32(a2)      // wsptr[DCTSIZE*2]
+    sw            v1, 80(a2)      // wsptr[DCTSIZE*5]
+    addq.ph       v0, t6, t1      // tmp3 + tmp4
+    subq.ph       v1, t6, t1      // tmp3 - tmp4
+    sw            v0, 64(a2)      // wsptr[DCTSIZE*4]
+    sw            v1, 48(a2)      // wsptr[DCTSIZE*3]
+
+2:
+    bne           a0, t9, 0b      // loop until inptr reaches the end address
+     addiu        a2, a2, 4       // (delay slot) next pair of wsptr columns
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/* Fast-IDCT row pass; each iteration handles two workspace rows.
+ * a0 = wsptr       (advanced 32 bytes per iteration; loop ends at wsptr + 128)
+ * a1 = output_buf  (two row pointers consumed per iteration)
+ * a2 = output_col  (added to each loaded row pointer)
+ * a3 = mips_idct_ifast_coefs  (reloaded into AT each iteration; a3 is reused)
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128     // end address
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080  // s8 = 0x80808080, added to every output byte
+
+0:
+    lw            AT, 36(sp)      // AT = coefs; assumes prologue stored a3 at 36(sp)
+    lw            t0, 0(a0)       // wsptr[DCTSIZE*0+0/1]  b a
+    lw            s0, 16(a0)      // wsptr[DCTSIZE*1+0/1]  B A
+    lw            t2, 4(a0)       // wsptr[DCTSIZE*0+2/3]  d c
+    lw            s2, 20(a0)      // wsptr[DCTSIZE*1+2/3]  D C
+    lw            t4, 8(a0)       // wsptr[DCTSIZE*0+4/5]  f e
+    lw            s4, 24(a0)      // wsptr[DCTSIZE*1+4/5]  F E
+    lw            t6, 12(a0)      // wsptr[DCTSIZE*0+6/7]  h g
+    lw            s6, 28(a0)      // wsptr[DCTSIZE*1+6/7]  H G
+    precrq.ph.w   t1, s0, t0      // B b
+    ins           t0, s0, 16, 16  // A a
+    bnez          t1, 1f          // any nonzero AC term -> full IDCT
+     or           s0, t2, s2      // (d c) | (D C)
+    bnez          s0, 1f
+     or           s0, t4, s4      // (f e) | (F E)
+    bnez          s0, 1f
+     or           s0, t6, s6      // (h g) | (H G)
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2       // A a
+    lw            a3, 0(a1)       // output_buf[0] (row pointer for lower-case row)
+    lw            AT, 4(a1)       // output_buf[1] (row pointer for upper-case row)
+    precrq.ph.w   t0, s0, s0      // A A
+    ins           s0, s0, 16, 16  // a a
+    addu          a3, a3, a2      // + output_col
+    addu          AT, AT, a2      // + output_col
+    precrq.qb.ph  t0, t0, t0      // A A A A
+    precrq.qb.ph  s0, s0, s0      // a a a a
+    addu.qb       s0, s0, s8      // add 0x80 to each byte
+    addu.qb       t0, t0, s8      // add 0x80 to each byte
+    sw            s0, 0(a3)       // outptr[0/1/2/3] of first row
+    sw            s0, 4(a3)       // outptr[4/5/6/7] of first row
+    sw            t0, 0(AT)       // outptr[0/1/2/3] of second row
+    sw            t0, 4(AT)       // outptr[4/5/6/7] of second row
+    addiu         a0, a0, 32      // next two workspace rows
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8       // (delay slot) next two output row pointers
+    b             2f
+     nop
+
+1:
+    precrq.ph.w   t3, s2, t2      // D d
+    ins           t2, s2, 16, 16  // C c
+    precrq.ph.w   t5, s4, t4      // F f
+    ins           t4, s4, 16, 16  // E e
+    precrq.ph.w   t7, s6, t6      // H h
+    ins           t6, s6, 16, 16  // G g
+    lw            t8, 4(AT)       // FIX(1.414213562)
+    addq.ph       s4, t0, t4      // tmp10
+    subq.ph       s5, t0, t4      // tmp11
+    subq.ph       s6, t2, t6      // tmp12 ...
+    addq.ph       s7, t2, t6      // tmp13
+    mulq_s.ph     s6, s6, t8      // ... tmp12 ...
+    addq.ph       t0, s4, s7      // tmp0
+    subq.ph       t6, s4, s7      // tmp3
+    shll_s.ph     s6, s6, 1       // x2
+    subq.ph       s6, s6, s7      // ... tmp12
+    addq.ph       t2, s5, s6      // tmp1
+    subq.ph       t4, s5, s6      // tmp2
+    addq.ph       s5, t1, t7      // z11
+    subq.ph       s6, t1, t7      // z12
+    addq.ph       s7, t5, t3      // z13
+    subq.ph       v0, t5, t3      // z10
+    addq.ph       t7, s5, s7      // tmp7
+    subq.ph       s5, s5, s7      // tmp11 ...
+    addq.ph       v1, v0, s6      // z5 ...
+    mulq_s.ph     s5, s5, t8      // ... tmp11
+    lw            t8, 8(AT)       // FIX(1.847759065)
+    lw            s4, 0(AT)       // FIX(1.082392200)
+    addq.ph       s0, t0, t7      // tmp0 + tmp7
+    subq.ph       s7, t0, t7      // tmp0 - tmp7
+    mulq_s.ph     v1, v1, t8      // ... z5
+    lw            a3, 0(a1)       // output_buf[0] (row pointer for lower-case row)
+    lw            t8, 12(AT)      // FIX(-2.613125930)
+    shll_s.ph     s5, s5, 1       // x2
+    addu          a3, a3, a2      // + output_col
+    shll_s.ph     v0, v0, 1       // x4
+    mulq_s.ph     v0, v0, t8      // tmp12 ...
+    mulq_s.ph     s4, s6, s4      // tmp10 ...
+    shll_s.ph     v1, v1, 1       // x2
+    addiu         a0, a0, 32      // next two workspace rows
+    addiu         a1, a1, 8       // next two output row pointers
+    shll_s.ph     s6, v0, 1       // x4
+    shll_s.ph     s4, s4, 1       // x2
+    addq.ph       s6, s6, v1      // ... tmp12
+    shll_s.ph     s0, s0, 2       // (tmp0 + tmp7) << 2, saturating
+    subq.ph       t5, s6, t7      // tmp6
+    subq.ph       s4, s4, v1      // ... tmp10
+    subq.ph       t3, s5, t5      // tmp5
+    shll_s.ph     s7, s7, 2       // (tmp0 - tmp7) << 2, saturating
+    addq.ph       t1, s4, t3      // tmp4
+    addq.ph       s1, t2, t5      // tmp1 + tmp6
+    subq.ph       s6, t2, t5      // tmp1 - tmp6
+    addq.ph       s2, t4, t3      // tmp2 + tmp5
+    subq.ph       s5, t4, t3      // tmp2 - tmp5
+    addq.ph       s4, t6, t1      // tmp3 + tmp4
+    subq.ph       s3, t6, t1      // tmp3 - tmp4
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    precrq.ph.w   t0, s1, s0      // B A
+    ins           s0, s1, 16, 16  // b a
+    precrq.ph.w   t2, s3, s2      // D C
+    ins           s2, s3, 16, 16  // d c
+    precrq.ph.w   t4, s5, s4      // F E
+    ins           s4, s5, 16, 16  // f e
+    precrq.ph.w   t6, s7, s6      // H G
+    ins           s6, s7, 16, 16  // h g
+    precrq.qb.ph  t0, t2, t0      // D C B A
+    precrq.qb.ph  s0, s2, s0      // d c b a
+    precrq.qb.ph  t4, t6, t4      // H G F E
+    precrq.qb.ph  s4, s6, s4      // h g f e
+    addu.qb       s0, s0, s8      // add 0x80 to each byte
+    addu.qb       s4, s4, s8      // add 0x80 to each byte
+    sw            s0, 0(a3)       // outptr[0/1/2/3]       d c b a
+    sw            s4, 4(a3)       // outptr[4/5/6/7]       h g f e
+    lw            a3, -4(a1)      // second row pointer (a1 was already advanced by 8)
+    addu.qb       t0, t0, s8      // add 0x80 to each byte
+    addu          a3, a3, a2      // + output_col
+    addu.qb       t4, t4, s8      // add 0x80 to each byte
+    sw            t0, 0(a3)       // outptr[0/1/2/3]       D C B A
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)       // outptr[4/5/6/7]       H G F E
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/* In-place forward DCT: loop 1 walks 8 rows, loop 2 walks 8 columns.
+ * a0 = data
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui         t0, 6437
+    ori         t0, 2260        // t0 =   6437 |   2260 (packed halfwords)
+    lui         t1, 9633
+    ori         t1, 11363       // t1 =   9633 |  11363
+    lui         t2, 0xd39e
+    ori         t2, 0xe6dc      // t2 = -11362 |  -6436
+    lui         t3, 0xf72d
+    ori         t3, 9633        // t3 =  -2259 |   9633
+    lui         t4, 2261
+    ori         t4, 9633        // t4 =   2261 |   9633
+    lui         t5, 0xd39e
+    ori         t5, 6437        // t5 = -11362 |   6437
+    lui         t6, 9633
+    ori         t6, 0xd39d      // t6 =   9633 | -11363
+    lui         t7, 0xe6dc
+    ori         t7, 2260        // t7 =  -6436 |   2260
+    lui         t8, 4433
+    ori         t8, 10703       // t8 =   4433 |  10703
+    lui         t9, 0xd630
+    ori         t9, 4433        // t9 = -10704 |   4433
+    li          s8, 8           // row counter
+    move        a1, a0          // a1 = current row pointer
+1:
+    lw          s0, 0(a1)       // tmp0 = 1|0
+    lw          s1, 4(a1)       // tmp1 = 3|2
+    lw          s2, 8(a1)       // tmp2 = 5|4
+    lw          s3, 12(a1)      // tmp3 = 7|6
+    packrl.ph   s1, s1, s1      // tmp1 = 2|3
+    packrl.ph   s3, s3, s3      // tmp3 = 6|7
+    subq.ph     s7, s1, s2      // tmp7 = 2-5|3-4 = t5|t4
+    subq.ph     s5, s0, s3      // tmp5 = 1-6|0-7 = t6|t7
+    mult        $0, $0          // ac0  = 0
+    dpa.w.ph    $ac0, s7, t0    // ac0 += t5*  6437 + t4*  2260
+    dpa.w.ph    $ac0, s5, t1    // ac0 += t6*  9633 + t7* 11363
+    mult        $ac1, $0, $0    // ac1  = 0
+    dpa.w.ph    $ac1, s7, t2    // ac1 += t5*-11362 + t4* -6436
+    dpa.w.ph    $ac1, s5, t3    // ac1 += t6* -2259 + t7*  9633
+    mult        $ac2, $0, $0    // ac2  = 0
+    dpa.w.ph    $ac2, s7, t4    // ac2 += t5*  2261 + t4*  9633
+    dpa.w.ph    $ac2, s5, t5    // ac2 += t6*-11362 + t7*  6437
+    mult        $ac3, $0, $0    // ac3  = 0
+    dpa.w.ph    $ac3, s7, t6    // ac3 += t5*  9633 + t4*-11363
+    dpa.w.ph    $ac3, s5, t7    // ac3 += t6* -6436 + t7*  2260
+    addq.ph     s6, s1, s2      // tmp6 = 2+5|3+4 = t2|t3
+    addq.ph     s4, s0, s3      // tmp4 = 1+6|0+7 = t1|t0
+    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
+    extr_r.w    s2, $ac2, 11    // tmp2 = (ac2 + 1024) >> 11
+    extr_r.w    s3, $ac3, 11    // tmp3 = (ac3 + 1024) >> 11
+    addq.ph     s5, s4, s6      // tmp5 = t1+t2|t0+t3 = t11|t10
+    subq.ph     s7, s4, s6      // tmp7 = t1-t2|t0-t3 = t12|t13
+    sh          s0, 2(a1)       // row element 1
+    sh          s1, 6(a1)       // row element 3
+    sh          s2, 10(a1)      // row element 5
+    sh          s3, 14(a1)      // row element 7
+    mult        $0, $0          // ac0  = 0
+    dpa.w.ph    $ac0, s7, t8    // ac0 += t12*  4433 + t13* 10703
+    mult        $ac1, $0, $0    // ac1  = 0
+    dpa.w.ph    $ac1, s7, t9    // ac1 += t12*-10704 + t13*  4433
+    sra         s4, s5, 16      // tmp4 = t11
+    addiu       a1, a1, 16      // advance to next row (stores below use negative offsets)
+    addiu       s8, s8, -1      // one row done
+    extr_r.w    s0, $ac0, 11    // tmp0 = (ac0 + 1024) >> 11
+    extr_r.w    s1, $ac1, 11    // tmp1 = (ac1 + 1024) >> 11
+    addu        s2, s5, s4      // tmp2 = t10 + t11
+    subu        s3, s5, s4      // tmp3 = t10 - t11
+    sll         s2, s2, 2       // tmp2 = (t10 + t11) << 2
+    sll         s3, s3, 2       // tmp3 = (t10 - t11) << 2
+    sh          s2, -16(a1)     // row element 0
+    sh          s3, -8(a1)      // row element 4
+    sh          s0, -12(a1)     // row element 2
+    bgtz        s8, 1b
+     sh         s1, -4(a1)      // (delay slot) row element 6
+    li          t0, 2260        // c0
+    li          t1, 11363       // c1
+    li          t2, 9633        // c2
+    li          t3, 6436        // c3
+    li          t4, 6437        // c4
+    li          t5, 2261        // c5
+    li          t6, 11362       // c6
+    li          t7, 2259        // c7
+    li          t8, 4433        // c8
+    li          t9, 10703       // c9
+    li          a1, 10704       // c10
+    li          s8, 8           // column counter
+
+2:
+    lh          a2, 0(a0)       // 0
+    lh          a3, 16(a0)      // 8
+    lh          v0, 32(a0)      // 16
+    lh          v1, 48(a0)      // 24
+    lh          s4, 64(a0)      // 32
+    lh          s5, 80(a0)      // 40
+    lh          s6, 96(a0)      // 48
+    lh          s7, 112(a0)     // 56
+    addu        s2, v0, s5      // tmp2 = 16 + 40
+    subu        s5, v0, s5      // tmp5 = 16 - 40
+    addu        s3, v1, s4      // tmp3 = 24 + 32
+    subu        s4, v1, s4      // tmp4 = 24 - 32
+    addu        s0, a2, s7      // tmp0 =  0 + 56
+    subu        s7, a2, s7      // tmp7 =  0 - 56
+    addu        s1, a3, s6      // tmp1 =  8 + 48
+    subu        s6, a3, s6      // tmp6 =  8 - 48
+    addu        a2, s0, s3      // tmp10 = tmp0 + tmp3
+    subu        v1, s0, s3      // tmp13 = tmp0 - tmp3
+    addu        a3, s1, s2      // tmp11 = tmp1 + tmp2
+    subu        v0, s1, s2      // tmp12 = tmp1 - tmp2
+    mult        s7, t1          // ac0  = tmp7 * c1
+    madd        s4, t0          // ac0 += tmp4 * c0
+    madd        s5, t4          // ac0 += tmp5 * c4
+    madd        s6, t2          // ac0 += tmp6 * c2
+    mult        $ac1, s7, t2    // ac1  = tmp7 * c2
+    msub        $ac1, s4, t3    // ac1 -= tmp4 * c3
+    msub        $ac1, s5, t6    // ac1 -= tmp5 * c6
+    msub        $ac1, s6, t7    // ac1 -= tmp6 * c7
+    mult        $ac2, s7, t4    // ac2  = tmp7 * c4
+    madd        $ac2, s4, t2    // ac2 += tmp4 * c2
+    madd        $ac2, s5, t5    // ac2 += tmp5 * c5
+    msub        $ac2, s6, t6    // ac2 -= tmp6 * c6
+    mult        $ac3, s7, t0    // ac3  = tmp7 * c0
+    msub        $ac3, s4, t1    // ac3 -= tmp4 * c1
+    madd        $ac3, s5, t2    // ac3 += tmp5 * c2
+    msub        $ac3, s6, t3    // ac3 -= tmp6 * c3
+    extr_r.w    s0, $ac0, 15    // tmp0 = (ac0 + 16384) >> 15
+    extr_r.w    s1, $ac1, 15    // tmp1 = (ac1 + 16384) >> 15
+    extr_r.w    s2, $ac2, 15    // tmp2 = (ac2 + 16384) >> 15
+    extr_r.w    s3, $ac3, 15    // tmp3 = (ac3 + 16384) >> 15
+    addiu       s8, s8, -1      // one column done
+    addu        s4, a2, a3      // tmp4 = tmp10 + tmp11
+    subu        s5, a2, a3      // tmp5 = tmp10 - tmp11
+    sh          s0, 16(a0)      // column element 8
+    sh          s1, 48(a0)      // column element 24
+    sh          s2, 80(a0)      // column element 40
+    sh          s3, 112(a0)     // column element 56
+    mult        v0, t8          // ac0  = tmp12 * c8
+    madd        v1, t9          // ac0 += tmp13 * c9
+    mult        $ac1, v1, t8    // ac1  = tmp13 * c8
+    msub        $ac1, v0, a1    // ac1 -= tmp12 * c10
+    addiu       a0, a0, 2       // advance to next column (stores below are offset by -2)
+    extr_r.w    s6, $ac0, 15    // tmp6 = (ac0 + 16384) >> 15
+    extr_r.w    s7, $ac1, 15    // tmp7 = (ac1 + 16384) >> 15
+    shra_r.w    s4, s4, 2       // tmp4 = (tmp4 + 2) >> 2
+    shra_r.w    s5, s5, 2       // tmp5 = (tmp5 + 2) >> 2
+    sh          s4, -2(a0)      // column element 0
+    sh          s5, 62(a0)      // column element 32
+    sh          s6, 30(a0)      // column element 16
+    bgtz        s8, 2b
+     sh         s7, 94(a0)      // (delay slot) column element 48
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr          ra
+     nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/* In-place fast forward DCT: loop 0 walks 8 rows, loop 1 walks 8 columns.
+ * a0 = data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    li          a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
+    li          a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
+    li          a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
+    li          s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
+
+    move        v0, a0          // v0 = current row pointer
+    addiu       v1, v0, 128     // end address
+
+0:
+    lw          t0, 0(v0)       // tmp0 = 1|0
+    lw          t1, 4(v0)       // tmp1 = 3|2
+    lw          t2, 8(v0)       // tmp2 = 5|4
+    lw          t3, 12(v0)      // tmp3 = 7|6
+    packrl.ph   t1, t1, t1      // tmp1 = 2|3
+    packrl.ph   t3, t3, t3      // tmp3 = 6|7
+    subq.ph     t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
+    subq.ph     t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
+    addq.ph     t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
+    addq.ph     t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
+    addq.ph     t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
+    subq.ph     t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
+    sra         t4, t8, 16      // tmp4 = t11
+    mult        $0, $0          // ac0  = 0
+    dpa.w.ph    $ac0, t9, s1    // ac0 += t12*181 + t13*181
+    mult        $ac1, $0, $0    // ac1  = 0
+    dpa.w.ph    $ac1, t7, a3    // ac1 += t4*98 + t5*98
+    dpsx.w.ph   $ac1, t5, a3    // ac1 -= t6*98 + t7*98
+    mult        $ac2, $0, $0    // ac2  = 0
+    dpa.w.ph    $ac2, t7, a2    // ac2 += t4*139 + t5*139
+    mult        $ac3, $0, $0    // ac3  = 0
+    dpa.w.ph    $ac3, t5, a1    // ac3 += t6*334 + t7*334
+    precrq.ph.w t0, t5, t7      // t0 = t6|t5 (high halves of tmp5, tmp7)
+    addq.ph     t2, t8, t4      // tmp2 = t10 + t11
+    subq.ph     t3, t8, t4      // tmp3 = t10 - t11
+    extr.w      t4, $ac0, 8     // t4 = z1 = MULTIPLY(t12 + t13, 181)
+    mult        $0, $0          // ac0  = 0
+    dpa.w.ph    $ac0, t0, s1    // ac0 += t5*181 + t6*181
+    extr.w      t0, $ac1, 8     // t0 = z5
+    extr.w      t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
+    extr.w      t7, $ac3, 8     // t7 = MULTIPLY(tmp12, 334)
+    extr.w      t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
+    add         t6, t1, t0      // t6 = z2
+    add         t7, t7, t0      // t7 = z4
+    subq.ph     t0, t5, t8      // t0 = z13 = tmp7 - z3
+    addq.ph     t8, t5, t8      // t8 = z11 = tmp7 + z3
+    addq.ph     t1, t0, t6      // t1 = z13 + z2
+    subq.ph     t6, t0, t6      // t6 = z13 - z2
+    addq.ph     t0, t8, t7      // t0 = z11 + z4
+    subq.ph     t7, t8, t7      // t7 = z11 - z4
+    addq.ph     t5, t4, t9      // t5 = t13 + z1 (low halfword is what gets stored)
+    subq.ph     t4, t9, t4      // t4 = t13 - z1 (low halfword is what gets stored)
+    sh          t2, 0(v0)       // dataptr[0]
+    sh          t5, 4(v0)       // dataptr[2]
+    sh          t3, 8(v0)       // dataptr[4]
+    sh          t4, 12(v0)      // dataptr[6]
+    sh          t1, 10(v0)      // dataptr[5]
+    sh          t6, 6(v0)       // dataptr[3]
+    sh          t0, 2(v0)       // dataptr[1]
+    sh          t7, 14(v0)      // dataptr[7]
+    addiu       v0, 16          // next row
+    bne         v1, v0, 0b
+     nop
+    move        v0, a0          // v0 = current column pointer
+    addiu       v1, v0, 16      // end address for the column loop
+
+1:
+    lh          t0, 0(v0)       // 0
+    lh          t1, 16(v0)      // 8
+    lh          t2, 32(v0)      // 16
+    lh          t3, 48(v0)      // 24
+    lh          t4, 64(v0)      // 32
+    lh          t5, 80(v0)      // 40
+    lh          t6, 96(v0)      // 48
+    lh          t7, 112(v0)     // 56
+    add         t8, t0, t7      // t8 = tmp0
+    sub         t7, t0, t7      // t7 = tmp7
+    add         t0, t1, t6      // t0 = tmp1
+    sub         t1, t1, t6      // t1 = tmp6
+    add         t6, t2, t5      // t6 = tmp2
+    sub         t5, t2, t5      // t5 = tmp5
+    add         t2, t3, t4      // t2 = tmp3
+    sub         t3, t3, t4      // t3 = tmp4
+    add         t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
+    sub         t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
+    sub         s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
+    ins         t8, s0, 16, 16  // t8 = tmp12|tmp13
+    add         t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
+    mult        $0, $0          // ac0  = 0
+    dpa.w.ph    $ac0, t8, s1    // ac0 += t12*181 + t13*181
+    add         s0, t4, t2      // s0 = tmp10+tmp11
+    sub         t4, t4, t2      // t4 = tmp10-tmp11
+    sh          s0, 0(v0)       // dataptr[DCTSIZE*0]
+    sh          t4, 64(v0)      // dataptr[DCTSIZE*4]
+    extr.w      t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
+    addq.ph     t4, t8, t2      // t4 = tmp13 + z1
+    subq.ph     t8, t8, t2      // t8 = tmp13 - z1
+    sh          t4, 32(v0)      // dataptr[DCTSIZE*2]
+    sh          t8, 96(v0)      // dataptr[DCTSIZE*6]
+    add         t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
+    add         t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
+    add         t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
+    andi        t4, a1, 0xffff  // t4 = 334 (low half of packed FIX_1_306562965)
+    mul         s0, t1, t4
+    sra         s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
+    ins         t1, t3, 16, 16  // t1 = tmp10|tmp12
+    mult        $0, $0          // ac0  = 0
+    mulsa.w.ph  $ac0, t1, a3    // ac0 += t10*98 - t12*98
+    extr.w      t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
+    add         t2, t7, t8      // t2 = tmp7 + z5
+    sub         t7, t7, t8      // t7 = tmp7 - z5
+    andi        t4, a2, 0xffff  // t4 = 139 (low half of packed FIX_0_541196100)
+    mul         t8, t3, t4
+    sra         t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
+    andi        t4, s1, 0xffff  // t4 = 181 (low half of packed FIX_0_707106781)
+    mul         t6, t0, t4
+    sra         t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
+    add         t0, t6, t8      // t0 = z3 + z2
+    sub         t1, t6, t8      // t1 = z3 - z2
+    add         t3, t6, s0      // t3 = z3 + z4
+    sub         t4, t6, s0      // t4 = z3 - z4
+    sub         t5, t2, t1      // t5 = dataptr[5]
+    sub         t6, t7, t0      // t6 = dataptr[3]
+    add         t3, t2, t3      // t3 = dataptr[1]
+    add         t4, t7, t4      // t4 = dataptr[7]
+    sh          t5, 80(v0)      // dataptr[DCTSIZE*5]
+    sh          t6, 48(v0)      // dataptr[DCTSIZE*3]
+    sh          t3, 16(v0)      // dataptr[DCTSIZE*1]
+    sh          t4, 112(v0)     // dataptr[DCTSIZE*7]
+    addiu       v0, 2           // next column
+    bne         v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/* Quantize 64 16-bit workspace samples into coef_block, two samples per pass.
+ * a0 = coef_block (out: one halfword stored per sample)
+ * a1 = divisors   (in: halfword tables read at byte offsets +0, +128 and +384)
+ * a2 = workspace  (in: signed halfword samples)
+ * out = sign(x) * ((((|x| + d[+128]) & 0xffff) * (d[+0] & 0xffff)) >> (d[+384] + 16)) */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu       v0, a2, 124     // v0 = address of the last pair; loop exits with it still pending
+    lh          t0, 0(a2)       // t0 = x0 (first sample of the pair)
+    lh          t1, 0(a1)       // t1 = multiplier for x0
+    lh          t2, 128(a1)     // t2 = addend for x0
+    sra         t3, t0, 15      // t3 = -1 if x0 < 0, else 0
+    sll         t3, t3, 1
+    addiu       t3, t3, 1       // t3 = sign(x0): -1 or +1
+    mul         t0, t0, t3      // t0 = |x0|
+    lh          t4, 384(a1)     // t4 = shift count (before the +16) for x0
+    lh          t5, 130(a1)     // t5 = addend for x1
+    lh          t6, 2(a2)       // t6 = x1 (second sample of the pair)
+    lh          t7, 2(a1)       // t7 = multiplier for x1
+    lh          t8, 386(a1)     // t8 = shift count (before the +16) for x1
+
+1:
+    andi        t1, 0xffff      // use the multiplier as an unsigned 16-bit value
+    add         t9, t0, t2      // t9 = |x0| + addend
+    andi        t9, 0xffff      // keep the low 16 bits, unsigned
+    mul         v1, t9, t1      // v1 = (|x0| + addend) * multiplier
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1       // s0 = sign(x1): -1 or +1
+    addiu       t9, t4, 16      // t9 = shift amount for x0
+    srav        v1, v1, t9      // v1 >>= shift amount
+    mul         v1, v1, t3      // reapply the sign of x0
+    mul         t6, t6, s0      // t6 = |x1|
+    andi        t7, 0xffff      // use the multiplier as an unsigned 16-bit value
+    addiu       a2, a2, 4       // advance workspace to the next pair
+    addiu       a1, a1, 4       // advance divisors to the next pair
+    add         s1, t6, t5      // s1 = |x1| + addend
+    andi        s1, 0xffff
+    sh          v1, 0(a0)       // store quantized x0
+
+    mul         s2, s1, t7      // s2 = (|x1| + addend) * multiplier
+    addiu       s1, t8, 16      // s1 = shift amount for x1
+    srav        s2, s2, s1
+    mul         s2, s2, s0      // reapply the sign of x1
+    lh          t0, 0(a2)       // preload the next pair while the current one finishes
+    lh          t1, 0(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1       // t3 = sign(next x0)
+    mul         t0, t0, t3      // t0 = |next x0|
+    lh          t2, 128(a1)
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t8, 386(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    sh          s2, 2(a0)       // store quantized x1
+    lh          t0, 0(a2)       // NOTE(review): reloads and recomputes t0/t3 exactly as above;
+    sra         t3, t0, 15      //   looks redundant unless coef_block can overlap workspace
+    sll         t3, t3, 1       //   -- confirm before removing
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    bne         a2, v0, 1b      // repeat until a2 reaches the last pair
+     addiu      a0, a0, 4       // (delay slot) advance coef_block by one pair
+
+    andi        t1, 0xffff      // epilogue: quantize the last preloaded pair, no further loads
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1       // s0 = sign(x1)
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3      // reapply the sign of x0
+    mul         t6, t6, s0      // t6 = |x1|
+    andi        t7, 0xffff
+    sh          v1, 0(a0)       // store quantized x0
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    addiu       a2, a2, 4       // a2 and a1 are not read again in this function
+    addiu       a1, a1, 4
+    srav        s2, s2, s1
+    mul         s2, s2, s0      // reapply the sign of x1
+    sh          s2, 2(a0)       // store quantized x1
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j           ra
+     nop
+
+END(jsimd_quantize_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/* Quantize 64 float samples: coef = (int)(workspace[i] * divisors[i] + 16384.5) - 16384.
+ * a0 = coef_block (out: 64 halfwords)
+ * a1 = divisors   (in: single-precision factors multiplied with the samples)
+ * a2 = workspace  (in: single-precision samples)
+ */
+    .set at
+
+    li          t1, 0x46800100  // integer representation 16384.5
+    mtc1        t1, f0          // f0 = 16384.5 (bias that makes the truncation below round)
+    li          t0, 63          // loop counter: 8 passes of 8 samples (63, 55, ..., 7)
+0:
+    lwc1        f2, 0(a2)       // samples 0..3 of this pass in f2/f4/f6/f8,
+    lwc1        f10, 0(a1)      // matching factors in f10/f12/f14/f16
+    lwc1        f4, 4(a2)
+    lwc1        f12, 4(a1)
+    lwc1        f6, 8(a2)
+    lwc1        f14, 8(a1)
+    lwc1        f8, 12(a2)
+    lwc1        f16, 12(a1)
+    madd.s      f2, f0, f2, f10 // f2 = f2 * f10 + 16384.5
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    lwc1        f10, 16(a1)     // start loading factors 4..7
+    lwc1        f12, 20(a1)
+    trunc.w.s   f2, f2          // convert to 32-bit integer, truncating toward zero
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    lwc1        f14, 24(a1)
+    lwc1        f16, 28(a1)
+    mfc1        t1, f2          // move integer results 0..3 to t1..t4
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    lwc1        f2, 16(a2)      // samples 4..7 of this pass
+    lwc1        f4, 20(a2)
+    lwc1        f6, 24(a2)
+    lwc1        f8, 28(a2)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    addiu       t1, t1, -16384  // remove the integer part of the bias
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    sh          t1, 0(a0)       // store coefficients 0..3 as halfwords
+    sh          t2, 2(a0)
+    sh          t3, 4(a0)
+    sh          t4, 6(a0)
+    mfc1        t1, f2          // move integer results 4..7 to t1..t4
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    addiu       t0, t0, -8      // eight samples consumed
+    addiu       a2, a2, 32      // advance workspace by 8 floats
+    addiu       a1, a1, 32      // advance divisors by 8 floats
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    sh          t1, 8(a0)       // store coefficients 4..7
+    sh          t2, 10(a0)
+    sh          t3, 12(a0)
+    sh          t4, 14(a0)
+    bgez        t0, 0b          // continue while the counter is non-negative
+     addiu      a0, a0, 16      // (delay slot) advance coef_block by 8 halfwords
+
+    j           ra
+     nop
+
+END(jsimd_quantize_float_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/* Reduced-size inverse DCT: reads rows 0,1,3,5,7 of columns 0,1,3,5,7 and writes 2x2 bytes.
+ * a0 = compptr->dct_table (16-bit multipliers, indexed like coef_block)
+ * a1 = coef_block (16-bit coefficients, 16 bytes per row)
+ * a2 = output_buf (array of row pointers; entries 0 and 1 are used)
+ * a3 = output_col (byte offset added to each row pointer)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu       sp, sp, -40     // 10-word scratch area: 2 rows x 5 column slots
+    move        v0, sp          // v0 = scratch base
+    addiu       s2, zero, 29692     // s2..s5: pass-2 weights for column slots 1..4
+    addiu       s3, zero, -10426
+    addiu       s4, zero, 6967
+    addiu       s5, zero, -5906
+    lh          t0, 0(a1)       // t0 = inptr[DCTSIZE*0]
+    lh          t5, 0(a0)       // t5 = quantptr[DCTSIZE*0]
+    lh          t1, 48(a1)      // t1 = inptr[DCTSIZE*3]
+    lh          t6, 48(a0)      // t6 = quantptr[DCTSIZE*3]
+    mul         t4, t5, t0      // t4 = dequantized row 0 (DC) of column 0
+    lh          t0, 16(a1)      // t0 = inptr[DCTSIZE*1]
+    lh          t5, 16(a0)      // t5 = quantptr[DCTSIZE*1]
+    mul         t6, t6, t1      // t6 = dequantized row 3
+    mul         t5, t5, t0      // t5 = dequantized row 1
+    lh          t2, 80(a1)      // t2 = inptr[DCTSIZE*5]
+    lh          t7, 80(a0)      // t7 = quantptr[DCTSIZE*5]
+    lh          t3, 112(a1)     // t3 = inptr[DCTSIZE*7]
+    lh          t8, 112(a0)     // t8 = quantptr[DCTSIZE*7]
+    mul         t7, t7, t2      // t7 = dequantized row 5
+    mult        zero, zero      // ac0 = 0
+    mul         t8, t8, t3      // row 7. NOTE(review): assumes mul leaves ac0 intact -- confirm vs. MIPS32 spec
+    li          s0, 0x73FCD746  // s0 = (29692 << 16) | (-10426 & 0xffff)
+    li          s1, 0x1B37E8EE  // s1 = (6967 << 16) | (-5906 & 0xffff)
+    ins         t6, t5, 16, 16  // t6 = t5|t6
+    sll         t4, t4, 15      // even part of column 0 = DC << 15
+    dpa.w.ph    $ac0, t6, s0    // ac0 += row1*29692 + row3*(-10426)
+    lh          t1, 2(a1)       // begin loading column 1 (row 0)
+    lh          t6, 2(a0)
+    ins         t8, t7, 16, 16  // t8 = t7|t8
+    dpa.w.ph    $ac0, t8, s1    // ac0 += row5*6967 + row7*(-5906)
+    mflo        t0, $ac0        // t0 = odd part of column 0
+    mul         t5, t6, t1      // t5 = dequantized row 0 of column 1
+    lh          t1, 18(a1)      // column 1, rows 1 and 3
+    lh          t6, 18(a0)
+    lh          t2, 50(a1)
+    lh          t7, 50(a0)
+    mul         t6, t6, t1      // t6 = dequantized row 1
+    subu        t8, t4, t0      // column 0: even - odd
+    mul         t7, t7, t2      // t7 = dequantized row 3
+    addu        t0, t4, t0      // column 0: even + odd
+    shra_r.w    t0, t0, 13      // rounding shift right by 13
+    lh          t1, 82(a1)      // column 1, rows 5 and 7
+    lh          t2, 82(a0)
+    lh          t3, 114(a1)
+    lh          t4, 114(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2      // t1 = dequantized row 5
+    mul         t3, t3, t4      // t3 = dequantized row 7
+    sw          t0, 0(v0)       // scratch row 0, slot 0 (column 0)
+    sw          t8, 20(v0)      // scratch row 1, slot 0
+    sll         t4, t5, 15      // even part of column 1
+    ins         t7, t6, 16, 16  // t7 = row1|row3
+    mult        zero, zero      // ac0 = 0
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16  // t3 = row5|row7
+    lh          t1, 6(a1)       // begin loading column 3 (row 0)
+    lh          t6, 6(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0        // t0 = odd part of column 1
+    mul         t5, t6, t1      // t5 = dequantized row 0 of column 3
+    lh          t1, 22(a1)      // column 3, rows 1 and 3
+    lh          t6, 22(a0)
+    lh          t2, 54(a1)
+    lh          t7, 54(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0      // column 1: even - odd
+    mul         t7, t7, t2
+    addu        t0, t4, t0      // column 1: even + odd
+    shra_r.w    t0, t0, 13
+    lh          t1, 86(a1)      // column 3, rows 5 and 7
+    lh          t2, 86(a0)
+    lh          t3, 118(a1)
+    lh          t4, 118(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 4(v0)       // scratch row 0, slot 1 (column 1)
+    sw          t8, 24(v0)      // scratch row 1, slot 1
+    sll         t4, t5, 15      // even part of column 3
+    ins         t7, t6, 16, 16
+    mult        zero, zero      // ac0 = 0
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 10(a1)      // begin loading column 5 (row 0)
+    lh          t6, 10(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0        // t0 = odd part of column 3
+    mul         t5, t6, t1      // t5 = dequantized row 0 of column 5
+    lh          t1, 26(a1)      // column 5, rows 1 and 3
+    lh          t6, 26(a0)
+    lh          t2, 58(a1)
+    lh          t7, 58(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0      // column 3: even - odd
+    mul         t7, t7, t2
+    addu        t0, t4, t0      // column 3: even + odd
+    shra_r.w    t0, t0, 13
+    lh          t1, 90(a1)      // column 5, rows 5 and 7
+    lh          t2, 90(a0)
+    lh          t3, 122(a1)
+    lh          t4, 122(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 8(v0)       // scratch row 0, slot 2 (column 3)
+    sw          t8, 28(v0)      // scratch row 1, slot 2
+    sll         t4, t5, 15      // even part of column 5
+    ins         t7, t6, 16, 16
+    mult        zero, zero      // ac0 = 0
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 14(a1)      // begin loading column 7 (row 0)
+    lh          t6, 14(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0        // t0 = odd part of column 5
+    mul         t5, t6, t1      // t5 = dequantized row 0 of column 7
+    lh          t1, 30(a1)      // column 7, rows 1 and 3
+    lh          t6, 30(a0)
+    lh          t2, 62(a1)
+    lh          t7, 62(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0      // column 5: even - odd
+    mul         t7, t7, t2
+    addu        t0, t4, t0      // column 5: even + odd
+    shra_r.w    t0, t0, 13
+    lh          t1, 94(a1)      // column 7, rows 5 and 7
+    lh          t2, 94(a0)
+    lh          t3, 126(a1)
+    lh          t4, 126(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 12(v0)      // scratch row 0, slot 3 (column 5)
+    sw          t8, 32(v0)      // scratch row 1, slot 3
+    sll         t4, t5, 15      // even part of column 7
+    ins         t7, t6, 16, 16
+    mult        zero, zero      // ac0 = 0
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0        // t0 = odd part of column 7
+    lw          t9, 0(a2)       // t9 = output_buf[0]
+    lw          t3, 0(v0)       // pass 2 begins: scratch row 0, slots 0..2
+    lw          t7, 4(v0)
+    lw          t1, 8(v0)
+    addu        t9, t9, a3      // t9 = output_buf[0] + output_col
+    sll         t3, t3, 15      // row 0 even part = slot 0 << 15
+    subu        t8, t4, t0      // column 7: even - odd
+    addu        t0, t4, t0      // column 7: even + odd
+    shra_r.w    t0, t0, 13
+    shra_r.w    t8, t8, 13
+    sw          t0, 16(v0)      // scratch row 0, slot 4 (column 7)
+    sw          t8, 36(v0)      // scratch row 1, slot 4
+    lw          t5, 12(v0)      // remaining row-0 slots 3 and 4
+    lw          t6, 16(v0)
+    mult        t7, s2          // ac0  = slot1 * 29692
+    madd        t1, s3          // ac0 += slot2 * -10426
+    madd        t5, s4          // ac0 += slot3 * 6967
+    madd        t6, s5          // ac0 += slot4 * -5906
+    lw          t5, 24(v0)      // scratch row 1, slots 1 and 2
+    lw          t7, 28(v0)
+    mflo        t0, $ac0        // t0 = odd part of output row 0
+    lw          t8, 32(v0)      // scratch row 1, slots 3 and 4
+    lw          t2, 36(v0)
+    mult        $ac1, t5, s2    // ac1 = same weighted sum for scratch row 1
+    madd        $ac1, t7, s3
+    madd        $ac1, t8, s4
+    madd        $ac1, t2, s5
+    addu        t1, t3, t0      // row 0, pixel 0 = even + odd
+    subu        t6, t3, t0      // row 0, pixel 1 = even - odd
+    shra_r.w    t1, t1, 20      // rounding shift right by 20
+    shra_r.w    t6, t6, 20
+    mflo        t4, $ac1        // t4 = odd part of output row 1
+    shll_s.w    t1, t1, 24      // saturating << 24 followed by >> 24 below
+    shll_s.w    t6, t6, 24      //   clamps the value to [-128, 127]
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    lw          t0, 20(v0)      // scratch row 1, slot 0
+    sb          t1, 0(t9)       // store output row 0
+    sb          t6, 1(t9)
+    sll         t0, t0, 15      // row 1 even part = slot 0 << 15
+    lw          t9, 4(a2)       // t9 = output_buf[1]
+    addu        t1, t0, t4      // row 1, pixel 0 = even + odd
+    subu        t6, t0, t4      // row 1, pixel 1 = even - odd
+    addu        t9, t9, a3      // t9 = output_buf[1] + output_col
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    shll_s.w    t1, t1, 24      // clamp to [-128, 127] as above
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    sb          t1, 0(t9)       // store output row 1
+    sb          t6, 1(t9)
+    addiu       sp, sp, 40      // release the scratch area
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j           ra
+     nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/* Reduced-size inverse DCT writing a 4x4 byte block; input row 4 and column 4 are never read.
+ * a0     = compptr->dct_table
+ * a1     = coef_block
+ * a2     = output_buf (array of row pointers; entries 0..3 are used)
+ * a3     = output_col (byte offset added to each row pointer)
+ * 16(sp) = workspace[DCTSIZE*4];  // buffers data between passes
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          v1, 48(sp)      // v1 = workspace (caller's 16(sp) plus the 32 bytes just reserved)
+    move        t0, a1          // t0 = inptr
+    move        t1, v1          // t1 = wsptr
+    li          t9, 4           // first loop handles columns 0..3
+    li          s0, 0x2e75f93e  // s0 = (11893 << 16) | (-1730 & 0xffff):  temp1 weights, rows 5|7
+    li          s1, 0x21f9ba79  // s1 = (8697 << 16) | (-17799 & 0xffff):  temp1 weights, rows 1|3
+    li          s2, 0xecc2efb0  // s2 = (-4926 << 16) | (-4176 & 0xffff):  temp2 weights, rows 5|7
+    li          s3, 0x52031ccd  // s3 = (20995 << 16) | 7373:              temp2 weights, rows 1|3
+
+0:
+    lh          s6, 32(t0)      // inptr[DCTSIZE*2]
+    lh          t6, 32(a0)      // quantptr[DCTSIZE*2]
+    lh          s7, 96(t0)      // inptr[DCTSIZE*6]
+    lh          t7, 96(a0)      // quantptr[DCTSIZE*6]
+    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh          s4, 0(t0)       // inptr[DCTSIZE*0]
+    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh          s5, 0(a0)       // quantptr[0]
+    li          s6, 15137       // FIX_1_847759065
+    li          s7, 6270        // FIX_0_765366865
+    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
+    mul         t6, s6, t6      // MULTIPLY(z2, FIX_1_847759065)
+    lh          t5, 112(t0)     // inptr[DCTSIZE*7]
+    mul         t7, s7, t7      // MULTIPLY(z3, FIX_0_765366865)
+    lh          s4, 112(a0)     // quantptr[DCTSIZE*7]
+    lh          v0, 80(t0)      // inptr[DCTSIZE*5]
+    lh          s5, 80(a0)      // quantptr[DCTSIZE*5]
+    lh          s6, 48(a0)      // quantptr[DCTSIZE*3]
+    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
+    lh          s7, 16(a0)      // quantptr[DCTSIZE*1]
+    lh          t8, 16(t0)      // inptr[DCTSIZE*1]
+    subu        t6, t6, t7      // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
+    lh          t7, 48(t0)      // inptr[DCTSIZE*3]
+    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul         v0, s5, v0      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu        t3, t2, t6      // tmp10 = tmp0 + tmp2
+    subu        t4, t2, t6      // tmp12 = tmp0 - tmp2
+    mult        $ac0, zero, zero    // ac0 = 0
+    mult        $ac1, zero, zero    // ac1 = 0
+    ins         t5, v0, 16, 16  // t5 = row5|row7 (low 16 bits of each product)
+    ins         t7, t8, 16, 16  // t7 = row1|row3
+    addiu       t9, t9, -1
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1 (weighted sum of rows 1, 3, 5, 7)
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2 (weighted sum of rows 1, 3, 5, 7)
+    dpa.w.ph    $ac1, t7, s3
+    mflo        s4, $ac0        // s4 = temp1
+    mflo        s5, $ac1        // s5 = temp2
+    addiu       a0, a0, 2       // next quantptr column
+    addiu       t1, t1, 4       // next wsptr column (stores below compensate with -4)
+    addiu       t0, t0, 2       // next inptr column
+    addu        t6, t4, s4
+    subu        t5, t4, s4
+    addu        s6, t3, s5
+    subu        s7, t3, s5
+    shra_r.w    t6, t6, 12      // DESCALE(tmp12 + temp1, 12)
+    shra_r.w    t5, t5, 12      // DESCALE(tmp12 - temp1, 12)
+    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
+    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
+    sw          t6, 28(t1)      // workspace row 1 (rows are 32 bytes apart)
+    sw          t5, 60(t1)      // workspace row 2
+    sw          s6, -4(t1)      // workspace row 0
+    bgtz        t9, 0b
+     sw         s7, 92(t1)      // (delay slot) workspace row 3
+    // second loop: three passes over columns 5..7 (offsets +2 skip column 4)
+    li          t9, 3
+1:
+    lh          s6, 34(t0)      // inptr[DCTSIZE*2]
+    lh          t6, 34(a0)      // quantptr[DCTSIZE*2]
+    lh          s7, 98(t0)      // inptr[DCTSIZE*6]
+    lh          t7, 98(a0)      // quantptr[DCTSIZE*6]
+    mul         t6, s6, t6      // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
+    lh          s4, 2(t0)       // inptr[DCTSIZE*0]
+    mul         t7, s7, t7      // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
+    lh          s5, 2(a0)       // quantptr[DCTSIZE*0]
+    li          s6, 15137       // FIX_1_847759065
+    li          s7, 6270        // FIX_0_765366865
+    mul         t2, s4, s5      // tmp0 = (inptr[0] * quantptr[0])
+    mul         v0, s6, t6      // MULTIPLY(z2, FIX_1_847759065)
+    lh          t5, 114(t0)     // inptr[DCTSIZE*7]
+    mul         t7, s7, t7      // MULTIPLY(z3, FIX_0_765366865)
+    lh          s4, 114(a0)     // quantptr[DCTSIZE*7]
+    lh          s5, 82(a0)      // quantptr[DCTSIZE*5]
+    lh          t6, 82(t0)      // inptr[DCTSIZE*5]
+    sll         t2, t2, 14      // tmp0 <<= (CONST_BITS+1)
+    lh          s6, 50(a0)      // quantptr[DCTSIZE*3]
+    lh          t8, 18(t0)      // inptr[DCTSIZE*1]
+    subu        v0, v0, t7      // tmp2 = MULTIPLY(z2, FIX_1_847759065) - MULTIPLY(z3, FIX_0_765366865)
+    lh          t7, 50(t0)      // inptr[DCTSIZE*3]
+    lh          s7, 18(a0)      // quantptr[DCTSIZE*1]
+    mul         t5, s4, t5      // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
+    mul         t6, s5, t6      // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
+    mul         t7, s6, t7      // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
+    mul         t8, s7, t8      // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
+    addu        t3, t2, v0      // tmp10 = tmp0 + tmp2
+    subu        t4, t2, v0      // tmp12 = tmp0 - tmp2
+    mult        $ac0, zero, zero    // ac0 = 0
+    mult        $ac1, zero, zero    // ac1 = 0
+    ins         t5, t6, 16, 16  // t5 = row5|row7
+    ins         t7, t8, 16, 16  // t7 = row1|row3
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        t5, $ac0        // t5 = temp1
+    mflo        t6, $ac1        // t6 = temp2
+    addiu       t9, t9, -1
+    addiu       t0, t0, 2       // next inptr column
+    addiu       a0, a0, 2       // next quantptr column
+    addiu       t1, t1, 4       // next wsptr column (first pass lands on column 5)
+    addu        s5, t4, t5
+    subu        s4, t4, t5
+    addu        s6, t3, t6
+    subu        s7, t3, t6
+    shra_r.w    s5, s5, 12      // DESCALE(tmp12 + temp1, 12)
+    shra_r.w    s4, s4, 12      // DESCALE(tmp12 - temp1, 12)
+    shra_r.w    s6, s6, 12      // DESCALE(tmp10 + temp2, 12)
+    shra_r.w    s7, s7, 12      // DESCALE(tmp10 - temp2, 12)
+    sw          s5, 32(t1)      // workspace row 1
+    sw          s4, 64(t1)      // workspace row 2
+    sw          s6, 0(t1)       // workspace row 0
+    bgtz        t9, 1b
+     sw         s7, 96(t1)      // (delay slot) workspace row 3
+    move        t1, v1          // pass 2, output row 0: t1 = workspace base
+    li          s4, 15137
+    lw          s6, 8(t1)       // wsptr[2]
+    li          s5, 6270
+    lw          s7, 24(t1)      // wsptr[6]
+    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+    lw          t2, 0(t1)       // wsptr[0]
+    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865), subtracted below
+    lh          t5, 28(t1)      // wsptr[7] (lh reads only the halfword at the word's base address)
+    lh          t6, 20(t1)      // wsptr[5]
+    lh          t7, 12(t1)      // wsptr[3]
+    lh          t8, 4(t1)       // wsptr[1]
+    ins         t5, t6, 16, 16  // t5 = wsptr[5]|wsptr[7]
+    ins         t7, t8, 16, 16  // t7 = wsptr[1]|wsptr[3]
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+    mflo        s6, $ac0
+    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu        s4, s4, s5
+    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
+    mflo        s7, $ac1
+    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
+    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
+    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
+    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
+    sll         s4, t9, 2       // NOTE(review): s4 is overwritten by the next li before any read
+    lw          v0, 0(a2)       // output_buf[ctr]
+    shll_s.w    t5, t5, 24      // saturating << 24 followed by >> 24 below
+    shll_s.w    t6, t6, 24      //   clamps each value to [-128, 127]
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
+    addiu       t5, t5, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)       // outptr[0] = tmp10 + temp2
+    sb          t7, 1(v0)       // outptr[1] = tmp12 + temp1
+    sb          t8, 2(v0)       // outptr[2] = tmp12 - temp1
+    sb          t6, 3(v0)       // outptr[3] = tmp10 - temp2
+    // output row 1 (workspace row at byte offset 32)
+    li          s4, 15137
+    lw          s6, 40(t1)      // wsptr[2]
+    li          s5, 6270
+    lw          s7, 56(t1)      // wsptr[6]
+    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+    lw          t2, 32(t1)      // wsptr[0]
+    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865), subtracted below
+    lh          t5, 60(t1)      // wsptr[7]
+    lh          t6, 52(t1)      // wsptr[5]
+    lh          t7, 44(t1)      // wsptr[3]
+    lh          t8, 36(t1)      // wsptr[1]
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+    mflo        s6, $ac0
+    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu        s4, s4, s5
+    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
+    mflo        s7, $ac1
+    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
+    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
+    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
+    sll         s4, t9, 2       // NOTE(review): s4 is overwritten by the next li before any read
+    lw          v0, 4(a2)       // output_buf[ctr]
+    shll_s.w    t5, t5, 24      // clamp to [-128, 127] (with the sra below)
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
+    addiu       t5, t5, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    // output row 2 (workspace row at byte offset 64)
+    li          s4, 15137
+    lw          s6, 72(t1)      // wsptr[2]
+    li          s5, 6270
+    lw          s7, 88(t1)      // wsptr[6]
+    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+    lw          t2, 64(t1)      // wsptr[0]
+    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865), subtracted below
+    lh          t5, 92(t1)      // wsptr[7]
+    lh          t6, 84(t1)      // wsptr[5]
+    lh          t7, 76(t1)      // wsptr[3]
+    lh          t8, 68(t1)      // wsptr[1]
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+    mflo        s6, $ac0
+    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu        s4, s4, s5
+    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
+    mflo        s7, $ac1
+    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
+    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
+    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
+    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
+    sll         s4, t9, 2       // NOTE(review): s4 is overwritten by the next li before any read
+    lw          v0, 8(a2)       // output_buf[ctr]
+    shll_s.w    t5, t5, 24      // clamp to [-128, 127] (with the sra below)
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
+    addiu       t5, t5, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    li          s4, 15137       // output row 3 (workspace row at byte offset 96)
+    lw          s6, 104(t1)     // wsptr[2]
+    li          s5, 6270
+    lw          s7, 120(t1)     // wsptr[6]
+    mul         s4, s4, s6      // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
+    lw          t2, 96(t1)      // wsptr[0]
+    mul         s5, s5, s7      // MULTIPLY((JLONG)wsptr[6], FIX_0_765366865), subtracted below
+    lh          t5, 124(t1)     // wsptr[7]
+    lh          t6, 116(t1)     // wsptr[5]
+    lh          t7, 108(t1)     // wsptr[3]
+    lh          t8, 100(t1)     // wsptr[1]
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0    // ac0 = temp1
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2    // ac1 = temp2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
+    mflo        s6, $ac0
+    // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
+    subu        s4, s4, s5
+    addu        t3, t2, s4      // tmp10 = tmp0 + tmp2
+    mflo        s7, $ac1
+    subu        t4, t2, s4      // tmp12 = tmp0 - tmp2
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      // DESCALE(tmp10 + temp2, 19)
+    shra_r.w    t6, t6, 19      // DESCALE(tmp10 - temp2, 19)
+    shra_r.w    t7, t7, 19      // DESCALE(tmp12 + temp1, 19)
+    shra_r.w    t8, t8, 19      // DESCALE(tmp12 - temp1, 19)
+    sll         s4, t9, 2       // NOTE(review): result is not read again in this function
+    lw          v0, 12(a2)      // output_buf[ctr]
+    shll_s.w    t5, t5, 24      // clamp to [-128, 127] (with the sra below)
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      // outptr = output_buf[ctr] + output_col
+    addiu       t5, t5, 128     // recentre to [0, 255]
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/* 6x6 inverse DCT: pass 1 fills a 6x6 word work array on the stack, pass 2 writes 6x6 bytes.
+ * a0 = compptr->dct_table
+ * a1 = coef_block (only rows 0..5 of columns 0..5 are read)
+ * a2 = output_buf (array of row pointers; entries 0..5 are used)
+ * a3 = output_col (byte offset added to each row pointer)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -144    // work array: 6 rows x 6 words
+    move        v0, sp          // v0 = work array cursor (one column per pass)
+    addiu       v1, v0, 24      // v1 = end marker after six columns
+    addiu       t9, zero, 5793      // multiplier applied to row 4 (about 0.7071 * 2^13)
+    addiu       s0, zero, 10033     // multiplier applied to row 2 (about 1.2247 * 2^13)
+    addiu       s1, zero, 2998      // multiplier applied to z1 + z3 (about 0.3660 * 2^13)
+
+1:
+    lh          s2, 0(a0)       // q0 = quantptr[ 0]
+    lh          s3, 32(a0)      // q1 = quantptr[16]
+    lh          s4, 64(a0)      // q2 = quantptr[32]
+    lh          t2, 64(a1)      // tmp2 = inptr[32]
+    lh          t1, 32(a1)      // tmp1 = inptr[16]
+    lh          t0, 0(a1)       // tmp0 = inptr[ 0]
+    mul         t2, t2, s4      // tmp2 = tmp2 * q2
+    mul         t1, t1, s3      // tmp1 = tmp1 * q1
+    mul         t0, t0, s2      // tmp0 = tmp0 * q0
+    lh          t6, 16(a1)      // z1 = inptr[ 8]
+    lh          t8, 80(a1)      // z3 = inptr[40]
+    lh          t7, 48(a1)      // z2 = inptr[24]
+    lh          s2, 16(a0)      // q0 = quantptr[ 8]
+    lh          s4, 80(a0)      // q2 = quantptr[40]
+    lh          s3, 48(a0)      // q1 = quantptr[24]
+    mul         t2, t2, t9      // tmp2 = tmp2 * 5793
+    mul         t1, t1, s0      // tmp1 = tmp1 * 10033
+    sll         t0, t0, 13      // tmp0 = tmp0 << 13
+    mul         t6, t6, s2      // z1 = z1 * q0
+    mul         t8, t8, s4      // z3 = z3 * q2
+    mul         t7, t7, s3      // z2 = z2 * q1
+    addu        t3, t0, t2      // tmp10 = tmp0 + tmp2
+    sll         t2, t2, 1       // tmp2 = tmp2 << 1
+    subu        t4, t0, t2      // tmp11 = tmp0 - tmp2;
+    subu        t5, t3, t1      // tmp12 = tmp10 - tmp1
+    addu        t3, t3, t1      // tmp10 = tmp10 + tmp1
+    addu        t1, t6, t8      // tmp1 = z1 + z3
+    mul         t1, t1, s1      // tmp1 = tmp1 * 2998
+    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
+    subu        t2, t6, t8      // tmp2 = z1 - z3
+    subu        t2, t2, t7      // tmp2 = tmp2 - z2
+    sll         t2, t2, 2       // tmp2 = tmp2 << 2
+    addu        t0, t6, t7      // tmp0 = z1 + z2
+    sll         t0, t0, 13      // tmp0 = tmp0 << 13
+    subu        s2, t8, t7      // q0 = z3 - z2
+    sll         s2, s2, 13      // q0 = q0 << 13
+    addu        t0, t0, t1      // tmp0 = tmp0 + tmp1
+    addu        t1, s2, t1      // tmp1 = q0 + tmp1
+    addu        s2, t4, t2      // q0 = tmp11 + tmp2
+    subu        s3, t4, t2      // q1 = tmp11 - tmp2
+    addu        t6, t3, t0      // z1 = tmp10 + tmp0
+    subu        t7, t3, t0      // z2 = tmp10 - tmp0
+    addu        t4, t5, t1      // tmp11 = tmp12 + tmp1
+    subu        t5, t5, t1      // tmp12 = tmp12 - tmp1
+    shra_r.w    t6, t6, 11      // z1 = (z1 + 1024) >> 11
+    shra_r.w    t7, t7, 11      // z2 = (z2 + 1024) >> 11
+    shra_r.w    t4, t4, 11      // tmp11 = (tmp11 + 1024) >> 11
+    shra_r.w    t5, t5, 11      // tmp12 = (tmp12 + 1024) >> 11
+    sw          s2, 24(v0)      // work row 1 (rows are 24 bytes apart)
+    sw          s3, 96(v0)      // work row 4
+    sw          t6, 0(v0)       // work row 0
+    sw          t7, 120(v0)     // work row 5
+    sw          t4, 48(v0)      // work row 2
+    sw          t5, 72(v0)      // work row 3
+    addiu       v0, v0, 4       // next work column
+    addiu       a1, a1, 2       // next input column
+    bne         v0, v1, 1b
+     addiu      a0, a0, 2       // (delay slot) next quantptr column
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move        v0, sp          // v0 = current work row
+    addiu       v1, v0, 144     // v1 = end of the work array
+
+2:
+    lw          t0, 0(v0)       // wsptr[0]
+    lw          t2, 16(v0)      // wsptr[4]
+    lw          s5, 0(a2)       // s5 = current output row pointer
+    addiu       t0, t0, 16      // +16 becomes the rounding term for the final >> 18
+    sll         t0, t0, 13      // tmp0 = (wsptr[0] + 16) << 13
+    mul         t3, t2, t9      // tmp2 = wsptr[4] * 5793
+    lw          t6, 4(v0)       // z1 = wsptr[1]
+    lw          t8, 20(v0)      // z3 = wsptr[5]
+    lw          t7, 12(v0)      // z2 = wsptr[3]
+    addu        s5, s5, a3      // outptr = row pointer + output_col
+    addu        s6, t6, t8      // z1 + z3
+    mul         s6, s6, s1      // s6 = (z1 + z3) * 2998
+    addu        t1, t0, t3      // tmp10 = tmp0 + tmp2
+    subu        t4, t0, t3
+    subu        t4, t4, t3      // tmp11 = tmp0 - 2 * tmp2
+    lw          t3, 8(v0)       // wsptr[2]
+    mul         t0, t3, s0      // t0 = wsptr[2] * 10033
+    addu        s7, t6, t7
+    sll         s7, s7, 13
+    addu        s7, s6, s7      // s7 = ((z1 + z2) << 13) + s6
+    subu        t2, t8, t7
+    sll         t2, t2, 13
+    addu        t2, s6, t2      // t2 = ((z3 - z2) << 13) + s6
+    subu        s6, t6, t7
+    subu        s6, s6, t8
+    sll         s6, s6, 13      // s6 = (z1 - z2 - z3) << 13
+    addu        t3, t1, t0      // tmp10 + t0
+    subu        t5, t1, t0      // tmp12 = tmp10 - t0
+    addu        t6, t3, s7      // value for outptr[0]
+    subu        t3, t3, s7      // value for outptr[5]
+    addu        t7, t4, s6      // value for outptr[1]
+    subu        t4, t4, s6      // value for outptr[4]
+    addu        t8, t5, t2      // value for outptr[2]
+    subu        t5, t5, t2      // value for outptr[3]
+    shll_s.w    t6, t6, 6       // saturating << 6 followed by >> 24 below:
+    shll_s.w    t3, t3, 6       //   net >> 18 with the result clamped to [-128, 127]
+    shll_s.w    t7, t7, 6
+    shll_s.w    t4, t4, 6
+    shll_s.w    t8, t8, 6
+    shll_s.w    t5, t5, 6
+    sra         t6, t6, 24
+    addiu       t6, t6, 128     // recentre to [0, 255]
+    sra         t3, t3, 24
+    addiu       t3, t3, 128
+    sb          t6, 0(s5)
+    sra         t7, t7, 24
+    addiu       t7, t7, 128
+    sb          t3, 5(s5)
+    sra         t4, t4, 24
+    addiu       t4, t4, 128
+    sb          t7, 1(s5)
+    sra         t8, t8, 24
+    addiu       t8, t8, 128
+    sb          t4, 4(s5)
+    addiu       v0, v0, 24      // next work row
+    sra         t5, t5, 24
+    addiu       t5, t5, 128
+    sb          t8, 2(s5)
+    addiu       a2, a2,  4      // next output_buf entry
+    bne         v0, v1, 2b
+     sb         t5, 3(s5)       // (delay slot) last byte of the row
+
+    addiu       sp, sp, 144     // release the work array
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * Pass 1: processes 8 input columns, storing 12 values per column.
+ * a0 = compptr->dct_table, a1 = coef_block
+ * a2 = workspace
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 8           // column counter
+
+1:
+    // odd part
+    lh          t0, 48(a1)      // inptr[DCTSIZE*3]
+    lh          t1, 48(a0)      // quantptr[DCTSIZE*3]
+    lh          t2, 16(a1)      // inptr[DCTSIZE*1]
+    lh          t3, 16(a0)      // quantptr[DCTSIZE*1]
+    lh          t4, 80(a1)      // inptr[DCTSIZE*5]
+    lh          t5, 80(a0)      // quantptr[DCTSIZE*5]
+    lh          t6, 112(a1)     // inptr[DCTSIZE*7]
+    lh          t7, 112(a0)     // quantptr[DCTSIZE*7]
+    mul         t0, t0, t1      // z2
+    mul         t1, t2, t3      // z1
+    mul         t2, t4, t5      // z3
+    mul         t3, t6, t7      // z4
+    li          t4, 10703       // FIX(1.306562965)
+    li          t5, 4433        // FIX_0_541196100
+    li          t6, 7053        // FIX(0.860918669)
+    mul         t4, t0, t4      // tmp11
+    mul         t5, t0, t5      // -tmp14
+    addu        t7, t1, t2      // tmp10
+    addu        t8, t7, t3      // tmp10 + z4
+    mul         t6, t6, t8      // tmp15
+    li          t8, 2139        // FIX(0.261052384)
+    mul         t8, t7, t8      // MULTIPLY(tmp10, FIX(0.261052384))
+    li          t7, 2295        // FIX(0.280143716)
+    mul         t7, t1, t7      // MULTIPLY(z1, FIX(0.280143716))
+    addu        t9, t2, t3      // z3 + z4
+    li          s0, 8565        // FIX(1.045510580)
+    mul         t9, t9, s0      // -tmp13
+    li          s0, 12112       // FIX(1.478575242)
+    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242))
+    li          s1, 12998       // FIX(1.586706681)
+    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
+    li          s2, 5540        // FIX(0.676326758)
+    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
+    li          s3, 16244       // FIX(1.982889723)
+    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
+    subu        t1, t1, t3      // z1-=z4
+    subu        t0, t0, t2      // z2-=z3
+    addu        t2, t0, t1      // z1+z2
+    li          t3, 4433        // FIX_0_541196100
+    mul         t2, t2, t3      // z3
+    li          t3, 6270        // FIX_0_765366865
+    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
+    li          t3, 15137       // FIX_1_847759065
+    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
+    addu        t8, t6, t8      // tmp12
+    addu        t3, t8, t4      // tmp12 + tmp11
+    addu        t3, t3, t7      // tmp10
+    subu        t8, t8, t9      // tmp12 + tmp13
+    addu        s0, t5, s0      // -tmp14 + MULTIPLY(z3, FIX(1.478575242))
+    subu        t8, t8, s0      // tmp12
+    subu        t9, t6, t9      // tmp15 + tmp13
+    subu        s1, s1, t4      // MULTIPLY(z4, FIX(1.586706681)) - tmp11
+    addu        t9, t9, s1      // tmp13
+    subu        t6, t6, t5      // tmp15 + tmp14
+    subu        t6, t6, s2
+    subu        t6, t6, s3      // tmp15
+    // even part start
+    lh          t4, 64(a1)
+    lh          t5, 64(a0)
+    lh          t7, 32(a1)
+    lh          s0, 32(a0)
+    lh          s1, 0(a1)
+    lh          s2, 0(a0)
+    lh          s3, 96(a1)
+    lh          v0, 96(a0)
+    mul         t4, t4, t5      // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
+    mul         t5, t7, s0      // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
+    mul         t7, s1, s2      // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
+    mul         s0, s3, v0      // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
+    // odd part end
+    addu        t1, t2, t1      // tmp11
+    subu        t0, t2, t0      // tmp14
+    // update counter and pointers
+    addiu       a3, a3, -1
+    addiu       a0, a0, 2
+    addiu       a1, a1, 2
+    // even part rest
+    li          s1, 10033       // FIX(1.224744871)
+    li          s2, 11190       // FIX(1.366025404)
+    mul         t4, t4, s1      // z4 = MULTIPLY(z4, FIX(1.224744871))
+    mul         s1, t5, s2      // z4 = MULTIPLY(z1, FIX(1.366025404))
+    sll         t5, t5, 13      // z1
+    sll         t7, t7, 13
+    addiu       t7, t7, 1024    // z3 (1 << 10 rounds the final >> 11)
+    sll         s0, s0, 13      // z2
+    addu        s2, t7, t4      // tmp10
+    subu        t4, t7, t4      // tmp11
+    subu        s3, t5, s0      // tmp12
+    addu        t2, t7, s3      // tmp21
+    subu        s3, t7, s3      // tmp24
+    addu        t7, s1, s0      // tmp12
+    addu        v0, s2, t7      // tmp20
+    subu        s2, s2, t7      // tmp25
+    subu        s1, s1, t5      // z4 - z1
+    subu        s1, s1, s0      // tmp12
+    addu        s0, t4, s1      // tmp22
+    subu        t4, t4, s1      // tmp23
+    // final output stage
+    addu        t5, v0, t3      // tmp20 + tmp10
+    subu        v0, v0, t3      // tmp20 - tmp10
+    addu        t3, t2, t1      // tmp21 + tmp11
+    subu        t2, t2, t1      // tmp21 - tmp11
+    addu        t1, s0, t8      // tmp22 + tmp12
+    subu        s0, s0, t8      // tmp22 - tmp12
+    addu        t8, t4, t9      // tmp23 + tmp13
+    subu        t4, t4, t9      // tmp23 - tmp13
+    addu        t9, s3, t0      // tmp24 + tmp14
+    subu        s3, s3, t0      // tmp24 - tmp14
+    addu        t0, s2, t6      // tmp25 + tmp15
+    subu        s2, s2, t6      // tmp25 - tmp15
+    sra         t5, t5, 11      // descale all 12 results by 11 bits
+    sra         t3, t3, 11
+    sra         t1, t1, 11
+    sra         t8, t8, 11
+    sra         t9, t9, 11
+    sra         t0, t0, 11
+    sra         s2, s2, 11
+    sra         s3, s3, 11
+    sra         t4, t4, 11
+    sra         s0, s0, 11
+    sra         t2, t2, 11
+    sra         v0, v0, 11
+    sw          t5, 0(a2)       // workspace row 0 (rows are 32 bytes apart)
+    sw          t3, 32(a2)      // workspace row 1
+    sw          t1, 64(a2)      // workspace row 2
+    sw          t8, 96(a2)      // workspace row 3
+    sw          t9, 128(a2)     // workspace row 4
+    sw          t0, 160(a2)     // workspace row 5
+    sw          s2, 192(a2)     // workspace row 6
+    sw          s3, 224(a2)     // workspace row 7
+    sw          t4, 256(a2)     // workspace row 8
+    sw          s0, 288(a2)     // workspace row 9
+    sw          t2, 320(a2)     // workspace row 10
+    sw          v0, 352(a2)     // workspace row 11
+    bgtz        a3, 1b
+     addiu      a2, a2, 4       // (delay slot) next workspace column
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace (12 rows of 8 words, one row consumed per iteration)
+ * a1 = output (12 row pointers are read; 12 bytes are stored per row)
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 12          // row counter
+
+1:
+    // Odd part
+    lw          t0, 12(a0)      // z2
+    lw          t1, 4(a0)       // z1
+    lw          t2, 20(a0)      // z3
+    lw          t3, 28(a0)      // z4
+    li          t4, 10703       // FIX(1.306562965)
+    li          t5, 4433        // FIX_0_541196100
+    mul         t4, t0, t4      // tmp11
+    mul         t5, t0, t5      // -tmp14
+    addu        t6, t1, t2      // tmp10
+    li          t7, 2139        // FIX(0.261052384)
+    mul         t7, t6, t7      // MULTIPLY(tmp10, FIX(0.261052384))
+    addu        t6, t6, t3      // tmp10 + z4
+    li          t8, 7053        // FIX(0.860918669)
+    mul         t6, t6, t8      // tmp15
+    li          t8, 2295        // FIX(0.280143716)
+    mul         t8, t1, t8      // MULTIPLY(z1, FIX(0.280143716))
+    addu        t9, t2, t3      // z3 + z4
+    li          s0, 8565        // FIX(1.045510580)
+    mul         t9, t9, s0      // -tmp13
+    li          s0, 12112       // FIX(1.478575242)
+    mul         s0, t2, s0      // MULTIPLY(z3, FIX(1.478575242))
+    li          s1, 12998       // FIX(1.586706681)
+    mul         s1, t3, s1      // MULTIPLY(z4, FIX(1.586706681))
+    li          s2, 5540        // FIX(0.676326758)
+    mul         s2, t1, s2      // MULTIPLY(z1, FIX(0.676326758))
+    li          s3, 16244       // FIX(1.982889723)
+    mul         s3, t3, s3      // MULTIPLY(z4, FIX(1.982889723))
+    subu        t1, t1, t3      // z1 -= z4
+    subu        t0, t0, t2      // z2 -= z3
+    addu        t2, t1, t0      // z1 + z2
+    li          t3, 4433        // FIX_0_541196100
+    mul         t2, t2, t3      // z3
+    li          t3, 6270        // FIX_0_765366865
+    mul         t1, t1, t3      // MULTIPLY(z1, FIX_0_765366865)
+    li          t3, 15137       // FIX_1_847759065
+    mul         t0, t0, t3      // MULTIPLY(z2, FIX_1_847759065)
+    addu        t3, t6, t7      // tmp12
+    addu        t7, t3, t4      // tmp12 + tmp11
+    addu        t7, t7, t8      // tmp10
+    subu        t3, t3, t9      // tmp12 + tmp13
+    subu        t3, t3, t5      // ... + tmp14
+    subu        t3, t3, s0      // tmp12
+    subu        t9, t6, t9      // tmp15 + tmp13
+    subu        t9, t9, t4      // ... - tmp11
+    addu        t9, t9, s1      // tmp13
+    subu        t6, t6, t5      // tmp15 + tmp14
+    subu        t6, t6, s2
+    subu        t6, t6, s3      // tmp15
+    addu        t1, t2, t1      // tmp11
+    subu        t0, t2, t0      // tmp14
+    // even part
+    lw          t2, 16(a0)      // z4
+    lw          t4, 8(a0)       // z1
+    lw          t5, 0(a0)       // z3
+    lw          t8, 24(a0)      // z2
+    li          s0, 10033       // FIX(1.224744871)
+    li          s1, 11190       // FIX(1.366025404)
+    mul         t2, t2, s0      // z4
+    mul         s0, t4, s1      // z4
+    addiu       t5, t5, 0x10    // rounding term, scaled together with z3
+    sll         t5, t5, 13      // z3
+    sll         t4, t4, 13      // z1
+    sll         t8, t8, 13      // z2
+    subu        s1, t4, t8      // tmp12
+    addu        s2, t5, t2      // tmp10
+    subu        t2, t5, t2      // tmp11
+    addu        s3, t5, s1      // tmp21
+    subu        s1, t5, s1      // tmp24
+    addu        t5, s0, t8      // tmp12
+    addu        v0, s2, t5      // tmp20
+    subu        t5, s2, t5      // tmp25
+    subu        t4, s0, t4      // z4 - z1
+    subu        t4, t4, t8      // tmp12
+    addu        t8, t2, t4      // tmp22
+    subu        t2, t2, t4      // tmp23
+    // increment counter and pointers
+    addiu       a3, a3, -1
+    addiu       a0, a0, 32      // next workspace row (8 words)
+    // Final stage
+    addu        t4, v0, t7      // tmp20 + tmp10
+    subu        v0, v0, t7      // tmp20 - tmp10
+    addu        t7, s3, t1      // tmp21 + tmp11
+    subu        s3, s3, t1      // tmp21 - tmp11
+    addu        t1, t8, t3      // tmp22 + tmp12
+    subu        t8, t8, t3      // tmp22 - tmp12
+    addu        t3, t2, t9      // tmp23 + tmp13
+    subu        t2, t2, t9      // tmp23 - tmp13
+    addu        t9, s1, t0      // tmp24 + tmp14
+    subu        s1, s1, t0      // tmp24 - tmp14
+    addu        t0, t5, t6      // tmp25 + tmp15
+    subu        t5, t5, t6      // tmp25 - tmp15
+    sll         t4, t4, 4       // plain << 4, followed by << 2 below
+    sll         t7, t7, 4
+    sll         t1, t1, 4
+    sll         t3, t3, 4
+    sll         t9, t9, 4
+    sll         t0, t0, 4
+    sll         t5, t5, 4
+    sll         s1, s1, 4
+    sll         t2, t2, 4
+    sll         t8, t8, 4
+    sll         s3, s3, 4
+    sll         v0, v0, 4
+    shll_s.w    t4, t4, 2       // NOTE(review): _s form presumably saturates
+    shll_s.w    t7, t7, 2
+    shll_s.w    t1, t1, 2
+    shll_s.w    t3, t3, 2
+    shll_s.w    t9, t9, 2
+    shll_s.w    t0, t0, 2
+    shll_s.w    t5, t5, 2
+    shll_s.w    s1, s1, 2
+    shll_s.w    t2, t2, 2
+    shll_s.w    t8, t8, 2
+    shll_s.w    s3, s3, 2
+    shll_s.w    v0, v0, 2
+    srl         t4, t4, 24      // keep only the top byte of each result
+    srl         t7, t7, 24
+    srl         t1, t1, 24
+    srl         t3, t3, 24
+    srl         t9, t9, 24
+    srl         t0, t0, 24
+    srl         t5, t5, 24
+    srl         s1, s1, 24
+    srl         t2, t2, 24
+    srl         t8, t8, 24
+    srl         s3, s3, 24
+    srl         v0, v0, 24
+    lw          t6, 0(a1)       // current output row pointer
+    addiu       t4, t4, 0x80    // + 128; only the low byte is stored below
+    addiu       t7, t7, 0x80
+    addiu       t1, t1, 0x80
+    addiu       t3, t3, 0x80
+    addiu       t9, t9, 0x80
+    addiu       t0, t0, 0x80
+    addiu       t5, t5, 0x80
+    addiu       s1, s1, 0x80
+    addiu       t2, t2, 0x80
+    addiu       t8, t8, 0x80
+    addiu       s3, s3, 0x80
+    addiu       v0, v0, 0x80
+    sb          t4, 0(t6)
+    sb          t7, 1(t6)
+    sb          t1, 2(t6)
+    sb          t3, 3(t6)
+    sb          t9, 4(t6)
+    sb          t0, 5(t6)
+    sb          t5, 6(t6)
+    sb          s1, 7(t6)
+    sb          t2, 8(t6)
+    sb          t8, 9(t6)
+    sb          s3, 10(t6)
+    sb          v0, 11(t6)
+    bgtz        a3, 1b
+     addiu      a1, a1, 4       // (delay slot) next output row pointer
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr          ra
+     nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data (8 row pointers are read; 8 bytes are used from each row)
+ * a1 = start_col (byte offset added to every row pointer)
+ * a2 = workspace (receives 8 rows of 16 bytes, stored as 32-bit words)
+ */
+    lw            t0, 0(a0)         // sample_data[0]
+    li            t7, 0xff80ff80    // 0xff80 in each halfword
+    addu          t0, t0, a1        // + start_col
+    ulw           t1, 0(t0)         // row bytes 0-3 (unaligned load)
+    ulw           t2, 4(t0)         // row bytes 4-7 (unaligned load)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    lw            t0, 4(a0)         // sample_data[1]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7        // add the 0xff80 bias per halfword
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 0(a2)         // workspace row 0
+    usw           t4, 4(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 8(a2)
+    usw           t6, 12(a2)
+
+    lw            t0, 8(a0)         // sample_data[2]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 16(a2)        // workspace row 1
+    usw           t4, 20(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 24(a2)
+    usw           t6, 28(a2)
+
+    lw            t0, 12(a0)        // sample_data[3]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 32(a2)        // workspace row 2
+    usw           t4, 36(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 40(a2)
+    usw           t6, 44(a2)
+
+    lw            t0, 16(a0)        // sample_data[4]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 48(a2)        // workspace row 3
+    usw           t4, 52(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 56(a2)
+    usw           t6, 60(a2)
+
+    lw            t0, 20(a0)        // sample_data[5]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 64(a2)        // workspace row 4
+    usw           t4, 68(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 72(a2)
+    usw           t6, 76(a2)
+
+    lw            t0, 24(a0)        // sample_data[6]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 80(a2)        // workspace row 5
+    usw           t4, 84(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 88(a2)
+    usw           t6, 92(a2)
+
+    lw            t0, 28(a0)        // sample_data[7]
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 96(a2)        // workspace row 6
+    usw           t4, 100(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 104(a2)
+    usw           t6, 108(a2)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 112(a2)       // workspace row 7
+    usw           t4, 116(a2)
+    usw           t5, 120(a2)
+    usw           t6, 124(a2)
+
+    j             ra
+     nop
+
+END(jsimd_convsamp_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data (8 row pointers are read; 8 bytes are used from each row)
+ * a1 = start_col (byte offset added to every row pointer)
+ * a2 = workspace (receives 64 words: (byte - 128) run through cvt.s.w)
+ */
+    .set at
+
+    lw          t0, 0(a0)       // sample_data[0]
+    addu        t0, t0, a1      // + start_col
+    lbu         t1, 0(t0)       // row 0
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 4(a0)       // sample_data[1], fetched ahead of the stores
+    swc1        f2, 0(a2)
+    swc1        f4, 4(a2)
+    swc1        f6, 8(a2)
+    addu        t0, t0, a1
+    swc1        f8, 12(a2)
+    swc1        f10, 16(a2)
+    swc1        f12, 20(a2)
+    swc1        f14, 24(a2)
+    swc1        f16, 28(a2)
+    // row 1
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 8(a0)       // sample_data[2]
+    swc1        f2, 32(a2)
+    swc1        f4, 36(a2)
+    swc1        f6, 40(a2)
+    addu        t0, t0, a1
+    swc1        f8, 44(a2)
+    swc1        f10, 48(a2)
+    swc1        f12, 52(a2)
+    swc1        f14, 56(a2)
+    swc1        f16, 60(a2)
+    // row 2
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 12(a0)      // sample_data[3]
+    swc1        f2, 64(a2)
+    swc1        f4, 68(a2)
+    swc1        f6, 72(a2)
+    addu        t0, t0, a1
+    swc1        f8, 76(a2)
+    swc1        f10, 80(a2)
+    swc1        f12, 84(a2)
+    swc1        f14, 88(a2)
+    swc1        f16, 92(a2)
+    // row 3
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 16(a0)      // sample_data[4]
+    swc1        f2, 96(a2)
+    swc1        f4, 100(a2)
+    swc1        f6, 104(a2)
+    addu        t0, t0, a1
+    swc1        f8, 108(a2)
+    swc1        f10, 112(a2)
+    swc1        f12, 116(a2)
+    swc1        f14, 120(a2)
+    swc1        f16, 124(a2)
+    // row 4
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 20(a0)      // sample_data[5]
+    swc1        f2, 128(a2)
+    swc1        f4, 132(a2)
+    swc1        f6, 136(a2)
+    addu        t0, t0, a1
+    swc1        f8, 140(a2)
+    swc1        f10, 144(a2)
+    swc1        f12, 148(a2)
+    swc1        f14, 152(a2)
+    swc1        f16, 156(a2)
+    // row 5
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 24(a0)      // sample_data[6]
+    swc1        f2, 160(a2)
+    swc1        f4, 164(a2)
+    swc1        f6, 168(a2)
+    addu        t0, t0, a1
+    swc1        f8, 172(a2)
+    swc1        f10, 176(a2)
+    swc1        f12, 180(a2)
+    swc1        f14, 184(a2)
+    swc1        f16, 188(a2)
+    // row 6
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 28(a0)      // sample_data[7]
+    swc1        f2, 192(a2)
+    swc1        f4, 196(a2)
+    swc1        f6, 200(a2)
+    addu        t0, t0, a1
+    swc1        f8, 204(a2)
+    swc1        f10, 208(a2)
+    swc1        f12, 212(a2)
+    swc1        f14, 216(a2)
+    swc1        f16, 220(a2)
+    // row 7
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    swc1        f2, 224(a2)
+    swc1        f4, 228(a2)
+    swc1        f6, 232(a2)
+    swc1        f8, 236(a2)
+    swc1        f10, 240(a2)
+    swc1        f12, 244(a2)
+    swc1        f14, 248(a2)
+    swc1        f16, 252(a2)
+
+    j           ra
+     nop
+
+END(jsimd_convsamp_float_dspr2)
+
+/*****************************************************************************/
diff --git a/simd/mips/jsimd_dspr2_asm.h b/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000..12cfda4
--- /dev/null
+++ b/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero  $0     /* symbolic names for general-purpose registers $0-$31 */
+#define AT    $1
+#define v0    $2
+#define v1    $3
+#define a0    $4
+#define a1    $5
+#define a2    $6
+#define a3    $7
+#define t0    $8
+#define t1    $9
+#define t2    $10
+#define t3    $11
+#define t4    $12
+#define t5    $13
+#define t6    $14
+#define t7    $15
+#define s0    $16
+#define s1    $17
+#define s2    $18
+#define s3    $19
+#define s4    $20
+#define s5    $21
+#define s6    $22
+#define s7    $23
+#define t8    $24
+#define t9    $25
+#define k0    $26
+#define k1    $27
+#define gp    $28
+#define sp    $29
+#define fp    $30    /* fp and s8 are two names for the same register */
+#define s8    $30
+#define ra    $31
+
+#define f0    $f0    /* symbolic names for floating-point registers $f0-$f31 */
+#define f1    $f1
+#define f2    $f2
+#define f3    $f3
+#define f4    $f4
+#define f5    $f5
+#define f6    $f6
+#define f7    $f7
+#define f8    $f8
+#define f9    $f9
+#define f10   $f10
+#define f11   $f11
+#define f12   $f12
+#define f13   $f13
+#define f14   $f14
+#define f15   $f15
+#define f16   $f16
+#define f17   $f17
+#define f18   $f18
+#define f19   $f19
+#define f20   $f20
+#define f21   $f21
+#define f22   $f22
+#define f23   $f23
+#define f24   $f24
+#define f25   $f25
+#define f26   $f26
+#define f27   $f27
+#define f28   $f28
+#define f29   $f29
+#define f30   $f30
+#define f31   $f31
+
+#ifdef __ELF__  /* emit .hidden for the symbol only when __ELF__ is defined */
+#define HIDDEN_SYMBOL(symbol)  .hidden symbol;
+#else  /* otherwise HIDDEN_SYMBOL expands to nothing */
+#define HIDDEN_SYMBOL(symbol)
+#endif  /* __ELF__ */
+
+/*
+ * LEAF_MIPS32R2 - declare global leaf routine for MIPS32r2; pair with END()
+ */
+#define LEAF_MIPS32R2(symbol) \
+    .globl      symbol; \
+    HIDDEN_SYMBOL(symbol) \
+    .align      2; \
+    .type       symbol, @function; \
+    .ent        symbol, 0; \
+symbol: \
+    .frame      sp, 0, ra; \
+    .set        push; \
+    .set        arch = mips32r2; \
+    .set        noreorder; \
+    .set        noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2 (LEAF_MIPS32R2 + dspr2)
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+    .set        dspr2;
+
+/*
+ * END - mark end of function and pop the .set state pushed by LEAF_MIPS32R2
+ */
+#define END(function) \
+    .set        pop; \
+    .end        function; \
+    .size       function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * registers to/from stack. Storing the registers needs regs_num*4 bytes,
+ * but since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * reserved for input arguments of the functions, already stored in a0-a3),
+ * stack size is further optimized by utilizing this space, so stack offset
+ * only has to be greater than or equal to regs_num*4 - 16.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are subtracted from stack pointer
+ * (sp) before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK  stack_offset = 0, r1, \
+                           r2  = 0, r3  = 0, r4  = 0, \
+                           r5  = 0, r6  = 0, r7  = 0, \
+                           r8  = 0, r9  = 0, r10 = 0, \
+                           r11 = 0, r12 = 0, r13 = 0, \
+                           r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, -\stack_offset
+.endif
+    sw          \r1, 0(sp)
+.if \r2 != 0
+    sw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    sw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    sw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw          \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with SAVE_REGS_ON_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK  stack_offset = 0, r1, \
+                                r2  = 0, r3  = 0, r4  = 0, \
+                                r5  = 0, r6  = 0, r7  = 0, \
+                                r8  = 0, r9  = 0, r10 = 0, \
+                                r11 = 0, r12 = 0, r13 = 0, \
+                                r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+    lw          \r1, 0(sp)
+.if \r2 != 0
+    lw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    lw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    lw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw          \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, \stack_offset
+.endif
+.endm
diff --git a/simd/jcolsamp.inc b/simd/nasm/jcolsamp.inc
similarity index 76%
rename from simd/jcolsamp.inc
rename to simd/nasm/jcolsamp.inc
index 3be446e..3fa6214 100644
--- a/simd/jcolsamp.inc
+++ b/simd/nasm/jcolsamp.inc
@@ -2,6 +2,7 @@
 ; jcolsamp.inc - private declarations for color conversion & up/downsampling
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -18,21 +19,29 @@
 %define  mmB  mm1
 %define xmmA xmm0
 %define xmmB xmm1
+%define ymmA ymm0
+%define ymmB ymm1
 %elif RGB_GREEN == 0
 %define  mmA  mm2
 %define  mmB  mm3
 %define xmmA xmm2
 %define xmmB xmm3
+%define ymmA ymm2
+%define ymmB ymm3
 %elif RGB_BLUE == 0
 %define  mmA  mm4
 %define  mmB  mm5
 %define xmmA xmm4
 %define xmmB xmm5
+%define ymmA ymm4
+%define ymmB ymm5
 %else
 %define  mmA  mm6
 %define  mmB  mm7
 %define xmmA xmm6
 %define xmmB xmm7
+%define ymmA ymm6
+%define ymmB ymm7
 %endif
 
 %if RGB_RED == 1
@@ -40,21 +49,29 @@
 %define  mmD  mm1
 %define xmmC xmm0
 %define xmmD xmm1
+%define ymmC ymm0
+%define ymmD ymm1
 %elif RGB_GREEN == 1
 %define  mmC  mm2
 %define  mmD  mm3
 %define xmmC xmm2
 %define xmmD xmm3
+%define ymmC ymm2
+%define ymmD ymm3
 %elif RGB_BLUE == 1
 %define  mmC  mm4
 %define  mmD  mm5
 %define xmmC xmm4
 %define xmmD xmm5
+%define ymmC ymm4
+%define ymmD ymm5
 %else
 %define  mmC  mm6
 %define  mmD  mm7
 %define xmmC xmm6
 %define xmmD xmm7
+%define ymmC ymm6
+%define ymmD ymm7
 %endif
 
 %if RGB_RED == 2
@@ -62,21 +79,29 @@
 %define  mmF  mm1
 %define xmmE xmm0
 %define xmmF xmm1
+%define ymmE ymm0
+%define ymmF ymm1
 %elif RGB_GREEN == 2
 %define  mmE  mm2
 %define  mmF  mm3
 %define xmmE xmm2
 %define xmmF xmm3
+%define ymmE ymm2
+%define ymmF ymm3
 %elif RGB_BLUE == 2
 %define  mmE  mm4
 %define  mmF  mm5
 %define xmmE xmm4
 %define xmmF xmm5
+%define ymmE ymm4
+%define ymmF ymm5
 %else
 %define  mmE  mm6
 %define  mmF  mm7
 %define xmmE xmm6
 %define xmmF xmm7
+%define ymmE ymm6
+%define ymmF ymm7
 %endif
 
 %if RGB_RED == 3
@@ -84,21 +109,29 @@
 %define  mmH  mm1
 %define xmmG xmm0
 %define xmmH xmm1
+%define ymmG ymm0
+%define ymmH ymm1
 %elif RGB_GREEN == 3
 %define  mmG  mm2
 %define  mmH  mm3
 %define xmmG xmm2
 %define xmmH xmm3
+%define ymmG ymm2
+%define ymmH ymm3
 %elif RGB_BLUE == 3
 %define  mmG  mm4
 %define  mmH  mm5
 %define xmmG xmm4
 %define xmmH xmm5
+%define ymmG ymm4
+%define ymmH ymm5
 %else
 %define  mmG  mm6
 %define  mmH  mm7
 %define xmmG xmm6
 %define xmmH xmm7
+%define ymmG ymm6
+%define ymmH ymm7
 %endif
 
 ; --------------------------------------------------------------------------
diff --git a/simd/jdct.inc b/simd/nasm/jdct.inc
similarity index 64%
rename from simd/jdct.inc
rename to simd/nasm/jdct.inc
index b976107..79d5146 100644
--- a/simd/jdct.inc
+++ b/simd/nasm/jdct.inc
@@ -2,6 +2,7 @@
 ; jdct.inc - private declarations for forward & reverse DCT subsystems
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -17,11 +18,16 @@
 ;
 %define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
 
-%define ROW(n,b,s)              ((b)+(n)*(s))
-%define COL(n,b,s)              ((b)+(n)*(s)*DCTSIZE)
+%define ROW(n, b, s)  ((b) + (n) * (s))  ; b plus n steps of s
+%define COL(n, b, s)  ((b) + (n) * (s) * DCTSIZE)  ; step is s * DCTSIZE
 
-%define DWBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s)       ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)  ; n in dwords
+%define MMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)  ; n in mmwords
+%define XMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)  ; n in xmmwords
+%define YMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)  ; n in ymmwords
 
 ; --------------------------------------------------------------------------
diff --git a/simd/jpeg_nbits_table.inc b/simd/nasm/jpeg_nbits_table.inc
similarity index 92%
rename from simd/jpeg_nbits_table.inc
rename to simd/nasm/jpeg_nbits_table.inc
index cbc6990..2ce6c28 100644
--- a/simd/jpeg_nbits_table.inc
+++ b/simd/nasm/jpeg_nbits_table.inc
@@ -1,4097 +1,4097 @@
-jpeg_nbits_table db  \
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  \
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
+jpeg_nbits_table db \
+   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4, \
+   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
+   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
+   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000..b40901f
--- /dev/null
+++ b/simd/nasm/jsimdext.inc
@@ -0,0 +1,476 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; [TAB8]
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT   .text  align=32
+%define SEG_CONST  .rdata align=32
+%else
+%define SEG_TEXT   .text  align=32 public use32 class=CODE
+%define SEG_CONST  .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT    .text  align=32
+%define SEG_CONST   .rdata align=32
+%else
+%define SEG_TEXT    .text  align=32 public use64 class=CODE
+%define SEG_CONST   .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name)  name                ; foo() -> foo
+
+%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT   _text align=32 public use32 class=CODE
+%define SEG_CONST  _data align=32 public use32 class=DATA
+
+%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT   .text   progbits align=32
+%define SEG_CONST  .rodata progbits align=32
+%else
+%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
+%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
+%define EXTN(name)  name                   ; foo() -> foo
+
+%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC
+
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
+%define SEG_CONST  .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
+
+%else           ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+%endif          ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%define POINTER         qword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%else
+%define POINTER         dword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%endif
+
+%define INT             dword           ; signed integer type
+%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32            dword           ; IEEE754 single
+%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD          qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                         ; int128 (SSE register)
+%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD                         ; int256 (AVX register)
+%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
+%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE   1                 ; sizeof(BYTE)
+%define SIZEOF_WORD   2                 ; sizeof(WORD)
+%define SIZEOF_DWORD  4                 ; sizeof(DWORD)
+%define SIZEOF_QWORD  8                 ; sizeof(QWORD)
+%define SIZEOF_OWORD  16                ; sizeof(OWORD)
+%define SIZEOF_YWORD  32                ; sizeof(YWORD)
+
+%define BYTE_BIT      8                 ; CHAR_BIT in C
+%define WORD_BIT      16                ; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(QWORD)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(OWORD)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(YWORD)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)  _ %+ name           ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Hidden symbols
+;
+%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
+%define GLOBAL_DATA(name)      global EXTN(name):data hidden
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name)  global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name)      global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC  ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+    SECTION     SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym)  (got) + (sym) - const_base
+
+%imacro get_GOT 1
+    ; NOTE: this macro destroys the ecx register.
+    call        %%geteip
+    add         ecx, byte (%%ref - $)
+    jmp         short %%adjust
+%%geteip:
+    mov         ecx, POINTER [esp]
+    ret
+%%adjust:
+    push        ebp
+    xor         ebp, ebp                ; ebp = 0
+%ifidni %1, ebx  ; (%1 == ebx)
+    ; db 0x8D,0x9C + jmp near const_base =
+    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+    db          0x8D, 0x9C              ; 8D,9C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+    ; db 0x8D,0x8C + jmp near const_base =
+    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+    db          0x8D, 0x8C              ; 8D,8C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+    mov         %1, ecx
+%endif  ; (%1 == ebx)
+    pop         ebp
+%endmacro
+
+%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+    extern      GOT_SYMBOL
+    call        %%geteip
+    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+    jmp         short %%done
+%%geteip:
+    mov         %1, POINTER [esp]
+    ret
+%%done:
+%endmacro
+
+%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+    push        %1
+%endmacro
+%imacro poppic  1.nolist
+    pop         %1
+%endmacro
+%imacro movpic  2.nolist
+    mov         %1, %2
+%endmacro
+
+%else    ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym)  (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic  1.nolist
+%endmacro
+%imacro movpic  2.nolist
+%endmacro
+
+%endif   ;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+        db 0x90                                      ; nop
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+        db 0x8B, 0xED                                ; mov ebp,ebp
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+        db 0x90                                      ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+    align       %1, db 0                ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm6
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm7
+    mov         r10, rcx
+%if %1 > 1
+    mov         r11, rdx
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, r8
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, r9
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, [rax+48]
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, [rax+56]
+%endif
+    push        rsi
+    push        rdi
+%endmacro
+
+%imacro uncollect_args 1
+    pop         rdi
+    pop         rsi
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+    movaps      xmm7, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+    movaps      xmm6, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+    sub         rsp, %1 * SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+    add         rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+    push        r10
+    mov         r10, rdi
+%if %1 > 1
+    push        r11
+    mov         r11, rsi
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, rdx
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, rcx
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, r8
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+%if %1 > 1
+    pop         r11
+%endif
+    pop         r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/simd/nasm_lt.sh b/simd/nasm_lt.sh
deleted file mode 100755
index 817be16..0000000
--- a/simd/nasm_lt.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#! /bin/sh
-command=""
-infile=""
-o_opt=no
-pic=no
-while [ $# -gt 0 ]; do
-    case "$1" in
-        --silent)
-            exec > /dev/null
-            ;;
-        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
-            if [ "$pic" != "yes" ] ; then
-                command="$command -DPIC"
-                pic=yes
-            fi
-            ;;
-        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
-        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
-            # it's a file format specifier for nasm.
-            command="$command $1"
-            ;;
-        -f*)
-            # maybe a code-generation flag for gcc.
-            ;;
-        -[Ii]*)
-            incdir=`echo "$1" | sed 's/^-[Ii]//'`
-            if [ "x$incdir" = x -a "x$2" != x ] ; then
-                case "$2" in
-                    -*) ;;
-                    *) incdir="$2"; shift;;
-                esac
-            fi
-            if [ "x$incdir" != x ] ; then
-                # In the case of NASM, the trailing slash is necessary.
-                incdir=`echo "$incdir" | sed 's%/*$%/%'`
-                command="$command -I$incdir"
-            fi
-            ;;
-        -o*)
-            o_opt=yes
-            command="$command $1"
-            ;;
-        *.asm)
-            infile=$1
-            command="$command $1"
-            ;;
-        *)
-            command="$command $1"
-            ;;
-    esac
-    shift
-done
-if [ "$o_opt" != yes ] ; then
-    # By default, NASM creates an output file
-    # in the same directory as the input file.
-    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
-    command="$command $outfile"
-fi
-echo $command
-exec $command
diff --git a/simd/jccolext-altivec.c b/simd/powerpc/jccolext-altivec.c
similarity index 94%
rename from simd/jccolext-altivec.c
rename to simd/powerpc/jccolext-altivec.c
index 849825e..170f90f 100644
--- a/simd/jccolext-altivec.c
+++ b/simd/powerpc/jccolext-altivec.c
@@ -24,9 +24,9 @@
 /* This file is included by jccolor-altivec.c */
 
 
-void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
-                                    JSAMPIMAGE output_buf,
-                                    JDIMENSION output_row, int num_rows)
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr0, outptr1, outptr2;
   int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -35,13 +35,13 @@
 #endif
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
     rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
 #if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3 = {0};
+  __vector unsigned char rgb3 = { 0 };
 #endif
 #if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4 = {0};
+  __vector unsigned char rgb4 = { 0 };
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short yl, yh, crl, crh, cbl, cbh;
@@ -57,9 +57,11 @@
     pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/simd/jccolor-altivec.c b/simd/powerpc/jccolor-altivec.c
similarity index 74%
rename from simd/jccolor-altivec.c
rename to simd/powerpc/jccolor-altivec.c
index ec47332..0dc76bb 100644
--- a/simd/jccolor-altivec.c
+++ b/simd/powerpc/jccolor-altivec.c
@@ -40,10 +40,14 @@
 #define ONE_HALF (1 << (SCALEBITS - 1))
 
 
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
 
@@ -58,7 +62,8 @@
 #undef jsimd_rgb_ycc_convert_altivec
 
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
 #define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -66,10 +71,14 @@
 #undef jsimd_rgb_ycc_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
 #define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -80,7 +89,8 @@
 #undef jsimd_rgb_ycc_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
 #define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -88,7 +98,8 @@
 #undef jsimd_rgb_ycc_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
 #define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -96,7 +107,8 @@
 #undef jsimd_rgb_ycc_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
 #define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
 #include "jccolext-altivec.c"
 #undef RGB_PIXELSIZE
diff --git a/simd/jcgray-altivec.c b/simd/powerpc/jcgray-altivec.c
similarity index 72%
rename from simd/jcgray-altivec.c
rename to simd/powerpc/jcgray-altivec.c
index 684df5e..2c5fd54 100644
--- a/simd/jcgray-altivec.c
+++ b/simd/powerpc/jcgray-altivec.c
@@ -35,10 +35,14 @@
 #define ONE_HALF (1 << (SCALEBITS - 1))
 
 
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
 
@@ -53,7 +57,8 @@
 #undef jsimd_rgb_gray_convert_altivec
 
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
 #define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -61,10 +66,14 @@
 #undef jsimd_rgb_gray_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
 #define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -75,7 +84,8 @@
 #undef jsimd_rgb_gray_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
 #define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -83,7 +93,8 @@
 #undef jsimd_rgb_gray_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
 #define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -91,7 +102,8 @@
 #undef jsimd_rgb_gray_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
 #define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
 #include "jcgryext-altivec.c"
 #undef RGB_PIXELSIZE
diff --git a/simd/jcgryext-altivec.c b/simd/powerpc/jcgryext-altivec.c
similarity index 93%
rename from simd/jcgryext-altivec.c
rename to simd/powerpc/jcgryext-altivec.c
index 7f8232b..b280cbb 100644
--- a/simd/jcgryext-altivec.c
+++ b/simd/powerpc/jcgryext-altivec.c
@@ -24,10 +24,9 @@
 /* This file is included by jcgray-altivec.c */
 
 
-void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
-                                     JSAMPARRAY input_buf,
-                                     JSAMPIMAGE output_buf,
-                                     JDIMENSION output_row, int num_rows)
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr;
   int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -36,13 +35,13 @@
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 #endif
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
     rgbg0, rgbg1, rgbg2, rgbg3, y;
 #if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3 = {0};
+  __vector unsigned char rgb3 = { 0 };
 #endif
 #if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4 = {0};
+  __vector unsigned char rgb4 = { 0 };
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short yl, yh;
@@ -54,9 +53,11 @@
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      { 0, 1, 4, 5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/simd/jcsample-altivec.c b/simd/powerpc/jcsample-altivec.c
similarity index 84%
rename from simd/jcsample-altivec.c
rename to simd/powerpc/jcsample-altivec.c
index 11609d9..6e25b8d 100644
--- a/simd/jcsample-altivec.c
+++ b/simd/powerpc/jcsample-altivec.c
@@ -26,14 +26,15 @@
 #include "jcsample.h"
 
 
-void
-jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+                                   int max_v_samp_factor,
+                                   JDIMENSION v_samp_factor,
+                                   JDIMENSION width_in_blocks,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data)
 {
   int outrow, outcol;
-  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
   JSAMPROW inptr, outptr;
 
   __vector unsigned char this0, next0, out;
@@ -43,7 +44,7 @@
   __vector unsigned short pw_bias = { __4X2(0, 1) },
     pw_one = { __8X(1) };
   __vector unsigned char even_odd_index =
-    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
     pb_zero = { __16X(0) };
 
   expand_right_edge(input_data, max_v_samp_factor, image_width,
@@ -83,13 +84,13 @@
 
 
 void
-jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+                              JDIMENSION v_samp_factor,
+                              JDIMENSION width_in_blocks,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, outcol;
-  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
   JSAMPROW inptr0, inptr1, outptr;
 
   __vector unsigned char this0, next0, this1, next1, out;
@@ -100,7 +101,7 @@
   __vector unsigned short pw_bias = { __4X2(1, 2) },
     pw_two = { __8X(2) };
   __vector unsigned char even_odd_index =
-    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
     pb_zero = { __16X(0) };
 
   expand_right_edge(input_data, max_v_samp_factor, image_width,
diff --git a/simd/jcsample.h b/simd/powerpc/jcsample.h
similarity index 76%
copy from simd/jcsample.h
copy to simd/powerpc/jcsample.h
index 2a50544..2ac4816 100644
--- a/simd/jcsample.h
+++ b/simd/powerpc/jcsample.h
@@ -8,14 +8,14 @@
  */
 
 LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
-                   JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
   register int count;
   int row;
-  int numcols = (int) (output_cols - input_cols);
+  int numcols = (int)(output_cols - input_cols);
 
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
diff --git a/simd/jdcolext-altivec.c b/simd/powerpc/jdcolext-altivec.c
similarity index 95%
rename from simd/jdcolext-altivec.c
rename to simd/powerpc/jdcolext-altivec.c
index fb121ce..68d52bd 100644
--- a/simd/jdcolext-altivec.c
+++ b/simd/powerpc/jdcolext-altivec.c
@@ -23,9 +23,9 @@
 /* This file is included by jdcolor-altivec.c */
 
 
-void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
-                                    JDIMENSION input_row,
-                                    JSAMPARRAY output_buf, int num_rows)
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                   JDIMENSION input_row, JSAMPARRAY output_buf,
+                                   int num_rows)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
   int pitch = out_width * RGB_PIXELSIZE, num_cols;
@@ -61,9 +61,11 @@
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/simd/jdcolor-altivec.c b/simd/powerpc/jdcolor-altivec.c
similarity index 76%
rename from simd/jdcolor-altivec.c
rename to simd/powerpc/jdcolor-altivec.c
index 0dc4c42..620df4a 100644
--- a/simd/jdcolor-altivec.c
+++ b/simd/powerpc/jdcolor-altivec.c
@@ -36,9 +36,12 @@
 #define SCALEBITS 16
 #define ONE_HALF (1 << (SCALEBITS - 1))
 
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
 
@@ -52,7 +55,8 @@
 #undef jsimd_ycc_rgb_convert_altivec
 
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
 #define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -60,9 +64,12 @@
 #undef jsimd_ycc_rgb_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
 #define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -72,7 +79,8 @@
 #undef jsimd_ycc_rgb_convert_altivec
 
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
 #define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -80,7 +88,8 @@
 #undef jsimd_ycc_rgb_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
 #define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
@@ -88,7 +97,8 @@
 #undef jsimd_ycc_rgb_convert_altivec
 
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
 #define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
 #include "jdcolext-altivec.c"
 #undef RGB_PIXELSIZE
diff --git a/simd/jdmerge-altivec.c b/simd/powerpc/jdmerge-altivec.c
similarity index 80%
rename from simd/jdmerge-altivec.c
rename to simd/powerpc/jdmerge-altivec.c
index 6a35f20..a00bd89 100644
--- a/simd/jdmerge-altivec.c
+++ b/simd/powerpc/jdmerge-altivec.c
@@ -36,9 +36,12 @@
 #define SCALEBITS 16
 #define ONE_HALF (1 << (SCALEBITS - 1))
 
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
 #include "jdmrgext-altivec.c"
 #undef RGB_PIXELSIZE
 
@@ -54,7 +57,8 @@
 #undef jsimd_h2v2_merged_upsample_altivec
 
 #define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
 #define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
 #define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
 #include "jdmrgext-altivec.c"
@@ -64,9 +68,12 @@
 #undef jsimd_h2v2_merged_upsample_altivec
 
 #define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
 #define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
 #define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
 #include "jdmrgext-altivec.c"
@@ -78,7 +85,8 @@
 #undef jsimd_h2v2_merged_upsample_altivec
 
 #define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
 #define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
 #define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
 #include "jdmrgext-altivec.c"
@@ -88,7 +96,8 @@
 #undef jsimd_h2v2_merged_upsample_altivec
 
 #define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
 #define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
 #define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
 #include "jdmrgext-altivec.c"
@@ -98,7 +107,8 @@
 #undef jsimd_h2v2_merged_upsample_altivec
 
 #define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
 #define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
 #define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
 #include "jdmrgext-altivec.c"
diff --git a/simd/jdmrgext-altivec.c b/simd/powerpc/jdmrgext-altivec.c
similarity index 90%
rename from simd/jdmrgext-altivec.c
rename to simd/powerpc/jdmrgext-altivec.c
index 55205bb..40f02c3 100644
--- a/simd/jdmrgext-altivec.c
+++ b/simd/powerpc/jdmrgext-altivec.c
@@ -23,10 +23,10 @@
 /* This file is included by jdmerge-altivec.c */
 
 
-void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
-                                         JSAMPIMAGE input_buf,
-                                         JDIMENSION in_row_group_ctr,
-                                         JSAMPARRAY output_buf)
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
   int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
@@ -63,13 +63,19 @@
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
-    even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
-    odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+    even_index =
+      {  0, 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30 },
+    odd_index =
+      {  0, 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
-    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
-    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+    even_index =
+      { 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30,  0 },
+    odd_index =
+      { 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31,  0 };
 #endif
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -299,10 +305,10 @@
 }
 
 
-void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
-                                         JSAMPIMAGE input_buf,
-                                         JDIMENSION in_row_group_ctr,
-                                         JSAMPARRAY output_buf)
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
 {
   JSAMPROW inptr, outptr;
 
diff --git a/simd/jdsample-altivec.c b/simd/powerpc/jdsample-altivec.c
similarity index 83%
rename from simd/jdsample-altivec.c
rename to simd/powerpc/jdsample-altivec.c
index b40ce55..04df0cf 100644
--- a/simd/jdsample-altivec.c
+++ b/simd/powerpc/jdsample-altivec.c
@@ -25,31 +25,36 @@
 #include "jsimd_altivec.h"
 
 
-void
-jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr;
   int inrow, incol;
 
-  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
     out;
   __vector short this0e, this0o, this0l, this0h, last0l, last0h,
     next0l, next0h, outle, outhe, outlo, outho;
 
   /* Constants */
   __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
-    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
-    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
-    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
-    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+    last_index_col0 =
+      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
+    last_index =
+      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+    next_index =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
+    next_index_lastcol =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
 #if __BIG_ENDIAN__
-    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
 #else
-    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
 #endif
   __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
 
@@ -121,11 +126,10 @@
 }
 
 
-void
-jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
@@ -136,21 +140,27 @@
     lastcolsum_1h, lastcolsum1h,
     p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
     thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
-    nextcolsum_1l = {0}, nextcolsum_1h = {0},
-    nextcolsum1l = {0}, nextcolsum1h = {0},
+    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
     p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
     tmpl, tmph, outle, outhe, outlo, outho;
 
   /* Constants */
   __vector unsigned char pb_zero = { __16X(0) },
-    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
-    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
-    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
-    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+    last_index_col0 =
+      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
+    last_index =
+      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+    next_index =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
+    next_index_lastcol =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
 #if __BIG_ENDIAN__
-    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
 #else
-    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
 #endif
   __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
     pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
@@ -306,11 +316,10 @@
 
 /* These are rarely used (mainly just for decompressing YCCK images) */
 
-void
-jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
-                             JDIMENSION output_width,
-                             JSAMPARRAY input_data,
-                             JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr;
@@ -345,11 +354,10 @@
 }
 
 
-void
-jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
-                             JDIMENSION output_width,
-                             JSAMPARRAY input_data,
-                             JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr0, outptr1;
diff --git a/simd/jfdctfst-altivec.c b/simd/powerpc/jfdctfst-altivec.c
similarity index 74%
rename from simd/jfdctfst-altivec.c
rename to simd/powerpc/jfdctfst-altivec.c
index 04157f7..23ece72 100644
--- a/simd/jfdctfst-altivec.c
+++ b/simd/powerpc/jfdctfst-altivec.c
@@ -42,54 +42,52 @@
 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
 
 
-#define DO_FDCT()  \
-{  \
-  /* Even part */  \
+#define DO_FDCT() { \
+  /* Even part */ \
   \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
   \
-  out0  = vec_add(tmp10, tmp11);  \
-  out4  = vec_sub(tmp10, tmp11);  \
+  out0  = vec_add(tmp10, tmp11); \
+  out4  = vec_sub(tmp10, tmp11); \
   \
-  z1 = vec_add(tmp12, tmp13);  \
-  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
-  z1 = vec_madds(z1, pw_0707, pw_zero);  \
+  z1 = vec_add(tmp12, tmp13); \
+  z1 = vec_sl(z1, pre_multiply_scale_bits); \
+  z1 = vec_madds(z1, pw_0707, pw_zero); \
   \
-  out2 = vec_add(tmp13, z1);  \
-  out6 = vec_sub(tmp13, z1);  \
+  out2 = vec_add(tmp13, z1); \
+  out6 = vec_sub(tmp13, z1); \
   \
-  /* Odd part */  \
+  /* Odd part */ \
   \
-  tmp10 = vec_add(tmp4, tmp5);  \
-  tmp11 = vec_add(tmp5, tmp6);  \
-  tmp12 = vec_add(tmp6, tmp7);  \
+  tmp10 = vec_add(tmp4, tmp5); \
+  tmp11 = vec_add(tmp5, tmp6); \
+  tmp12 = vec_add(tmp6, tmp7); \
   \
-  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
-  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
-  z5 = vec_sub(tmp10, tmp12);  \
-  z5 = vec_madds(z5, pw_0382, pw_zero);  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  z5 = vec_sub(tmp10, tmp12); \
+  z5 = vec_madds(z5, pw_0382, pw_zero); \
   \
-  z2 = vec_madds(tmp10, pw_0541, z5);  \
-  z4 = vec_madds(tmp12, pw_1306, z5);  \
+  z2 = vec_madds(tmp10, pw_0541, z5); \
+  z4 = vec_madds(tmp12, pw_1306, z5); \
   \
-  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
-  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
   \
-  z11 = vec_add(tmp7, z3);  \
-  z13 = vec_sub(tmp7, z3);  \
+  z11 = vec_add(tmp7, z3); \
+  z13 = vec_sub(tmp7, z3); \
   \
-  out5 = vec_add(z13, z2);  \
-  out3 = vec_sub(z13, z2);  \
-  out1 = vec_add(z11, z4);  \
-  out7 = vec_sub(z11, z4);  \
+  out5 = vec_add(z13, z2); \
+  out3 = vec_sub(z13, z2); \
+  out1 = vec_add(z11, z4); \
+  out7 = vec_sub(z11, z4); \
 }
 
 
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
 {
   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
     col0, col1, col2, col3, col4, col5, col6, col7,
diff --git a/simd/powerpc/jfdctint-altivec.c b/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000..0cb1a50
--- /dev/null
+++ b/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  /* Even part, outputs 2 and 6 (tmp12/tmp13 set by invoker).  (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * data2 = z1 + tmp13 * 0.765366865; \
+   * data6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+  /* The vec_msums() addend above is the rounding term for these shifts. */ \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  /* Odd part: out1, out3, out5, out7 from tmp4..tmp7 */ \
+  \
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  \
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  \
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  /* Even part; out0/out4 are shifted left by pass1_bits to scale them up */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_sl(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_sl(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(1);  /* remaining outputs, using the *_p1 constants */ \
+}
+
+#define DO_FDCT_PASS2() { \
+  /* Even part; out0/out4 are rounded (pw_descale_p2x) and shifted right */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_add(out0, pw_descale_p2x); \
+  out0  = vec_sra(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_add(out4, pw_descale_p2x); \
+  out4  = vec_sra(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(2);  /* remaining outputs, using the *_p2 constants */ \
+}
+
+/* Slow integer forward DCT, done in place on the 8 x 8-short block at data */
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants: vec_msums() multiplier pairs, rounding terms, shift counts */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows (output is left scaled up by 2^PASS1_BITS) */
+
+  row0 = vec_ld(0, data);  /* offsets are in bytes: 16 per vector */
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns (also removes the 2^PASS1_BITS scaling) */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/simd/jidctfst-altivec.c b/simd/powerpc/jidctfst-altivec.c
similarity index 76%
rename from simd/jidctfst-altivec.c
rename to simd/powerpc/jidctfst-altivec.c
index ec30c39..d130780 100644
--- a/simd/jidctfst-altivec.c
+++ b/simd/powerpc/jidctfst-altivec.c
@@ -44,75 +44,73 @@
 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
 
 
-#define DO_IDCT(in)  \
-{  \
-  /* Even part */  \
+#define DO_IDCT(in) { \
+  /* Even part */ \
   \
-  tmp10 = vec_add(in##0, in##4);  \
-  tmp11 = vec_sub(in##0, in##4);  \
-  tmp13 = vec_add(in##2, in##6);  \
+  tmp10 = vec_add(in##0, in##4); \
+  tmp11 = vec_sub(in##0, in##4); \
+  tmp13 = vec_add(in##2, in##6); \
   \
-  tmp12 = vec_sub(in##2, in##6);  \
-  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
-  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  \
-  tmp12 = vec_sub(tmp12, tmp13);  \
+  tmp12 = vec_sub(in##2, in##6); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+  tmp12 = vec_sub(tmp12, tmp13); \
   \
-  tmp0 = vec_add(tmp10, tmp13);  \
-  tmp3 = vec_sub(tmp10, tmp13);  \
-  tmp1 = vec_add(tmp11, tmp12);  \
-  tmp2 = vec_sub(tmp11, tmp12);  \
+  tmp0 = vec_add(tmp10, tmp13); \
+  tmp3 = vec_sub(tmp10, tmp13); \
+  tmp1 = vec_add(tmp11, tmp12); \
+  tmp2 = vec_sub(tmp11, tmp12); \
   \
-  /* Odd part */  \
+  /* Odd part */ \
   \
-  z13 = vec_add(in##5, in##3);  \
-  z10 = vec_sub(in##5, in##3);  \
-  z10s = vec_sl(z10, pre_multiply_scale_bits);  \
-  z11 = vec_add(in##1, in##7);  \
-  z12s = vec_sub(in##1, in##7);  \
-  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
+  z13 = vec_add(in##5, in##3); \
+  z10 = vec_sub(in##5, in##3); \
+  z10s = vec_sl(z10, pre_multiply_scale_bits); \
+  z11 = vec_add(in##1, in##7); \
+  z12s = vec_sub(in##1, in##7); \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
   \
-  tmp11 = vec_sub(z11, z13);  \
-  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
-  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  \
+  tmp11 = vec_sub(z11, z13); \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
   \
-  tmp7 = vec_add(z11, z13);  \
+  tmp7 = vec_add(z11, z13); \
   \
-  /* To avoid overflow...  \
-   *  \
-   * (Original)  \
-   * tmp12 = -2.613125930 * z10 + z5;  \
-   *  \
-   * (This implementation)  \
-   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
-   *       = -1.613125930 * z10 - z10 + z5;  \
-   */  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
   \
-  z5 = vec_add(z10s, z12s);  \
-  z5 = vec_madds(z5, pw_F1847, pw_zero);  \
+  z5 = vec_add(z10s, z12s); \
+  z5 = vec_madds(z5, pw_F1847, pw_zero); \
   \
-  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  \
-  tmp10 = vec_sub(tmp10, z5);  \
-  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
-  tmp12 = vec_sub(tmp12, z10);  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+  tmp10 = vec_sub(tmp10, z5); \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+  tmp12 = vec_sub(tmp12, z10); \
   \
-  tmp6 = vec_sub(tmp12, tmp7);  \
-  tmp5 = vec_sub(tmp11, tmp6);  \
-  tmp4 = vec_add(tmp10, tmp5);  \
+  tmp6 = vec_sub(tmp12, tmp7); \
+  tmp5 = vec_sub(tmp11, tmp6); \
+  tmp4 = vec_add(tmp10, tmp5); \
   \
-  out0 = vec_add(tmp0, tmp7);  \
-  out1 = vec_add(tmp1, tmp6);  \
-  out2 = vec_add(tmp2, tmp5);  \
-  out3 = vec_sub(tmp3, tmp4);  \
-  out4 = vec_add(tmp3, tmp4);  \
-  out5 = vec_sub(tmp2, tmp5);  \
-  out6 = vec_sub(tmp1, tmp6);  \
-  out7 = vec_sub(tmp0, tmp7);  \
+  out0 = vec_add(tmp0, tmp7); \
+  out1 = vec_add(tmp1, tmp6); \
+  out2 = vec_add(tmp2, tmp5); \
+  out3 = vec_sub(tmp3, tmp4); \
+  out4 = vec_add(tmp3, tmp4); \
+  out5 = vec_sub(tmp2, tmp5); \
+  out6 = vec_sub(tmp1, tmp6); \
+  out7 = vec_sub(tmp0, tmp7); \
 }
 
 
-void
-jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   short *dct_table = (short *)dct_table_;
   int *outptr;
diff --git a/simd/powerpc/jidctint-altivec.c b/simd/powerpc/jidctint-altivec.c
new file mode 100644
index 0000000..a81923b
--- /dev/null
+++ b/simd/powerpc/jidctint-altivec.c
@@ -0,0 +1,357 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
+
+#define DO_IDCT(in, PASS) { \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  in##26l = vec_mergeh(in##2, in##6); \
+  in##26h = vec_mergel(in##2, in##6); \
+  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
+  \
+  tmp0 = vec_add(in##0, in##4); \
+  tmp1 = vec_sub(in##0, in##4); \
+  \
+  tmp0l = vec_unpackh(tmp0); \
+  tmp0h = vec_unpackl(tmp0); \
+  tmp0l = vec_sl(tmp0l, const_bits); \
+  tmp0h = vec_sl(tmp0h, const_bits); \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+  \
+  tmp10l = vec_add(tmp0l, tmp3l); \
+  tmp10h = vec_add(tmp0h, tmp3h); \
+  tmp13l = vec_sub(tmp0l, tmp3l); \
+  tmp13h = vec_sub(tmp0h, tmp3h); \
+  \
+  tmp1l = vec_unpackh(tmp1); \
+  tmp1h = vec_unpackl(tmp1); \
+  tmp1l = vec_sl(tmp1l, const_bits); \
+  tmp1h = vec_sl(tmp1h, const_bits); \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+  \
+  tmp11l = vec_add(tmp1l, tmp2l); \
+  tmp11h = vec_add(tmp1h, tmp2h); \
+  tmp12l = vec_sub(tmp1l, tmp2l); \
+  tmp12h = vec_sub(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(in##3, in##7); \
+  z4 = vec_add(in##1, in##5); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  in##71l = vec_mergeh(in##7, in##1); \
+  in##71h = vec_mergel(in##7, in##1); \
+  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+  \
+  in##53l = vec_mergeh(in##5, in##3); \
+  in##53h = vec_mergel(in##5, in##3); \
+  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = vec_add(tmp10l, tmp3l); \
+  out0h = vec_add(tmp10h, tmp3h); \
+  out7l = vec_sub(tmp10l, tmp3l); \
+  out7h = vec_sub(tmp10h, tmp3h); \
+  \
+  out0l = vec_sra(out0l, descale_p##PASS); \
+  out0h = vec_sra(out0h, descale_p##PASS); \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  \
+  out0 = vec_pack(out0l, out0h); \
+  out7 = vec_pack(out7l, out7h); \
+  \
+  out1l = vec_add(tmp11l, tmp2l); \
+  out1h = vec_add(tmp11h, tmp2h); \
+  out6l = vec_sub(tmp11l, tmp2l); \
+  out6h = vec_sub(tmp11h, tmp2h); \
+  \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out1 = vec_pack(out1l, out1h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  out2l = vec_add(tmp12l, tmp1l); \
+  out2h = vec_add(tmp12h, tmp1h); \
+  out5l = vec_sub(tmp12l, tmp1l); \
+  out5h = vec_sub(tmp12h, tmp1h); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out5 = vec_pack(out5l, out5h); \
+  \
+  out3l = vec_add(tmp13l, tmp0l); \
+  out3h = vec_add(tmp13h, tmp0h); \
+  out4l = vec_sub(tmp13l, tmp0l); \
+  out4h = vec_sub(tmp13h, tmp0h); \
+  \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  out4l = vec_sra(out4l, descale_p##PASS); \
+  out4h = vec_sra(out4h, descale_p##PASS); \
+  \
+  out3 = vec_pack(out3l, out3h); \
+  out4 = vec_pack(out4l, out4h); \
+}
+
+
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  short *dct_table = (short *)dct_table_;
+  int *outptr;
+
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+    tmp0, tmp1, tmp2, tmp3, z3, z4,
+    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+    row71l, row71h, row26l, row26h, row53l, row53h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+    z3l, z3h, z4l, z4h,
+    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+    out5l, out5h, out6l, out6h, out7l, out7h;
+  __vector signed char outb;
+
+  /* Constants */
+  __vector short pw_zero = { __8X(0) },
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_zero = { __4X(0) },
+    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) },
+    const_bits = { __4X(CONST_BITS) };
+  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+  /* Pass 1: process columns */
+
+  col0 = vec_ld(0, coef_block);
+  col1 = vec_ld(16, coef_block);
+  col2 = vec_ld(32, coef_block);
+  col3 = vec_ld(48, coef_block);
+  col4 = vec_ld(64, coef_block);
+  col5 = vec_ld(80, coef_block);
+  col6 = vec_ld(96, coef_block);
+  col7 = vec_ld(112, coef_block);
+
+  tmp1 = vec_or(col1, col2);
+  tmp2 = vec_or(col3, col4);
+  tmp1 = vec_or(tmp1, tmp2);
+  tmp3 = vec_or(col5, col6);
+  tmp3 = vec_or(tmp3, col7);
+  tmp1 = vec_or(tmp1, tmp3);
+
+  quant0 = vec_ld(0, dct_table);
+  col0 = vec_mladd(col0, quant0, pw_zero);
+
+  if (vec_all_eq(tmp1, pw_zero)) {
+    /* AC terms all zero */
+
+    col0 = vec_sl(col0, pass1_bits);
+
+    row0 = vec_splat(col0, 0);
+    row1 = vec_splat(col0, 1);
+    row2 = vec_splat(col0, 2);
+    row3 = vec_splat(col0, 3);
+    row4 = vec_splat(col0, 4);
+    row5 = vec_splat(col0, 5);
+    row6 = vec_splat(col0, 6);
+    row7 = vec_splat(col0, 7);
+
+  } else {
+
+    quant1 = vec_ld(16, dct_table);
+    quant2 = vec_ld(32, dct_table);
+    quant3 = vec_ld(48, dct_table);
+    quant4 = vec_ld(64, dct_table);
+    quant5 = vec_ld(80, dct_table);
+    quant6 = vec_ld(96, dct_table);
+    quant7 = vec_ld(112, dct_table);
+
+    col1 = vec_mladd(col1, quant1, pw_zero);
+    col2 = vec_mladd(col2, quant2, pw_zero);
+    col3 = vec_mladd(col3, quant3, pw_zero);
+    col4 = vec_mladd(col4, quant4, pw_zero);
+    col5 = vec_mladd(col5, quant5, pw_zero);
+    col6 = vec_mladd(col6, quant6, pw_zero);
+    col7 = vec_mladd(col7, quant7, pw_zero);
+
+    DO_IDCT(col, 1);
+
+    TRANSPOSE(out, row);
+  }
+
+  /* Pass 2: process rows */
+
+  DO_IDCT(row, 2);
+
+  TRANSPOSE(out, col);
+
+  outb = vec_packs(col0, col0);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[0] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col1, col1);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[1] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col2, col2);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[2] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col3, col3);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[3] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col4, col4);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[4] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col5, col5);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[5] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col6, col6);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[6] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+
+  outb = vec_packs(col7, col7);
+  outb = vec_add(outb, pb_centerjsamp);
+  outptr = (int *)(output_buf[7] + output_col);
+  vec_ste((__vector int)outb, 0, outptr);
+  vec_ste((__vector int)outb, 4, outptr);
+}
diff --git a/simd/jquanti-altivec.c b/simd/powerpc/jquanti-altivec.c
similarity index 88%
rename from simd/jquanti-altivec.c
rename to simd/powerpc/jquanti-altivec.c
index 25cc296..148b252 100644
--- a/simd/jquanti-altivec.c
+++ b/simd/powerpc/jquanti-altivec.c
@@ -31,26 +31,25 @@
  */
 #if __BIG_ENDIAN__
 
-#define LOAD_ROW(row) {  \
-  elemptr = sample_data[row] + start_col;  \
-  in##row = vec_ld(0, elemptr);  \
-  if ((size_t)elemptr & 15)  \
-    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_ld(0, elemptr); \
+  if ((size_t)elemptr & 15) \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
 }
 
 #else
 
-#define LOAD_ROW(row) {  \
-  elemptr = sample_data[row] + start_col;  \
-  in##row = vec_vsx_ld(0, elemptr);  \
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_vsx_ld(0, elemptr); \
 }
 
 #endif
 
 
-void
-jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
-                        DCTELEM *workspace)
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace)
 {
   JSAMPROW elemptr;
 
@@ -104,19 +103,18 @@
 /* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
    We basically need an unsigned equivalent of vec_madds(). */
 
-#define MULTIPLY(vs0, vs1, out) {  \
-  tmpe = vec_mule((__vector unsigned short)vs0,  \
-                  (__vector unsigned short)vs1);  \
-  tmpo = vec_mulo((__vector unsigned short)vs0,  \
-                  (__vector unsigned short)vs1);  \
-  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
-                                 (__vector unsigned short)tmpo,  \
-                                 shift_pack_index);  \
+#define MULTIPLY(vs0, vs1, out) { \
+  tmpe = vec_mule((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  tmpo = vec_mulo((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
 }
 
-void
-jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
-                        DCTELEM *workspace)
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace)
 {
   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
     row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
@@ -129,10 +127,10 @@
   __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
 #if __BIG_ENDIAN__
   __vector unsigned char shift_pack_index =
-    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
 #else
   __vector unsigned char shift_pack_index =
-    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
 #endif
 
   row0 = vec_ld(0, workspace);
diff --git a/simd/powerpc/jsimd.c b/simd/powerpc/jsimd.c
new file mode 100644
index 0000000..53f1a7d
--- /dev/null
+++ b/simd/powerpc/jsimd.c
@@ -0,0 +1,841 @@
+/*
+ * jsimd.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2015, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
+
+/* Return 1 if 'buffer' starts with "cpu" and has 'feature' as a whole word */
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p, *start;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  buffer += 3;
+  while (isspace((unsigned char)*buffer))
+    buffer++;
+  start = buffer;
+
+  /* Scan every occurrence.  'start' stays fixed so the left-boundary test
+     still applies after 'buffer' has advanced past a rejected hit. */
+  while ((p = strstr(buffer, feature))) {
+    buffer = p + 1;
+    if (p > start && !isspace((unsigned char)*(p - 1)))
+      continue;
+    p += strlen(feature);
+    if (*p != 0 && !isspace((unsigned char)*p))
+      continue;
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "altivec"))
+        simd_support |= JSIMD_ALTIVEC;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  char *env = NULL;
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+  uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+  int altivec;
+  size_t len = sizeof(altivec);
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+  simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__amigaos4__)
+  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+  if (altivec == VECTORTYPE_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+    simd_support |= JSIMD_ALTIVEC;
+#endif
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_ycc_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_ycc_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_gray_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_gray_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_gray_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_gray_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_ycc_extrgb_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_ycc_extbgr_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_ycc_rgb_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v2_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v1_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* 1 only if both size checks pass and JSIMD_ALTIVEC is set */
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();  /* defined outside this view; presumably fills simd_support */
+
+  /* The SIMD code is optimised for these values only; otherwise report 0 */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* AltiVec bit present in the mask */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_fdct_float() here has an empty body */
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* unconditional call into jsimd_fdct_islow_altivec(); no checks here */
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)  /* unconditional call into jsimd_fdct_ifast_altivec(); no checks here */
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)  /* empty body; jsimd_can_fdct_float() always returns 0 */
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)  /* 1 only if every size check passes and JSIMD_ALTIVEC is set */
+jsimd_can_quantize(void)
+{
+  init_simd();  /* defined outside this view; presumably fills simd_support */
+
+  /* The SIMD code is optimised for these values only; otherwise report 0 */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* AltiVec bit present in the mask */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_quantize_float() here has an empty body */
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* unconditional call into jsimd_quantize_altivec(); no checks here */
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)  /* empty body; jsimd_can_quantize_float() always returns 0 */
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_idct_2x2() here has an empty body */
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_idct_4x4() here has an empty body */
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* empty body; jsimd_can_idct_2x2() always returns 0 */
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)  /* empty body; jsimd_can_idct_4x4() always returns 0 */
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)  /* 1 only if both size checks pass and JSIMD_ALTIVEC is set */
+jsimd_can_idct_islow(void)
+{
+  init_simd();  /* defined outside this view; presumably fills simd_support */
+
+  /* The SIMD code is optimised for these values only; otherwise report 0 */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* AltiVec bit present in the mask */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* 1 only if both size checks pass and JSIMD_ALTIVEC is set */
+jsimd_can_idct_ifast(void)
+{
+  init_simd();  /* defined outside this view; presumably fills simd_support */
+
+  /* The SIMD code is optimised for these values only; otherwise report 0 */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* AltiVec bit present in the mask */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_idct_float() here has an empty body */
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)  /* thin wrapper around jsimd_idct_islow_altivec(); no checks here */
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);  /* cinfo is not used */
+}
+
+GLOBAL(void)  /* thin wrapper around jsimd_idct_ifast_altivec(); no checks here */
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);  /* cinfo is not used */
+}
+
+GLOBAL(void)  /* empty body; jsimd_can_idct_float() always returns 0 */
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)  /* always returns 0; jsimd_huff_encode_one_block() only returns NULL */
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)  /* stub: ignores every argument and returns NULL */
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;  /* jsimd_can_huff_encode_one_block() always returns 0 */
+}
diff --git a/simd/jsimd_altivec.h b/simd/powerpc/jsimd_altivec.h
similarity index 87%
rename from simd/jsimd_altivec.h
rename to simd/powerpc/jsimd_altivec.h
index 62dbc5c..31af215 100644
--- a/simd/jsimd_altivec.h
+++ b/simd/powerpc/jsimd_altivec.h
@@ -21,12 +21,12 @@
  */
 
 #define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
 #include <altivec.h>
 
 
@@ -37,12 +37,11 @@
 #define __8X(a) __4X(a), __4X(a)
 #define __16X(a) __8X(a), __8X(a)
 
-#define TRANSPOSE(row, col)  \
-{  \
-  __vector short row04l, row04h, row15l, row15h,  \
-                 row26l, row26h, row37l, row37h;  \
-  __vector short col01e, col01o, col23e, col23o,  \
-                 col45e, col45o, col67e, col67o;  \
+#define TRANSPOSE(row, col) { \
+  __vector short row04l, row04h, row15l, row15h, \
+                 row26l, row26h, row37l, row37h; \
+  __vector short col01e, col01o, col23e, col23o, \
+                 col45e, col45o, col67e, col67o; \
   \
                                        /* transpose coefficients (phase 1) */ \
   row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
@@ -65,18 +64,18 @@
   col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
   \
                                        /* transpose coefficients (phase 3) */ \
-  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
-  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
-  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
-  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
-  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
-  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
-  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
-  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
 }
 
 #ifndef min
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))  /* NOTE: the selected argument is evaluated twice */
 #endif
 
 
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000..5fa3848
--- /dev/null
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,560 @@
+;
+; jccolext-avx2.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM], addressed downward from rbp
+%define WK_NUM  8  ; number of ymmword scratch slots: wk(0) .. wk(WK_NUM-1)
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax                   ; keep the unaligned rsp for the "pop rsp" at .return
+    mov         rbp, rsp                     ; rbp = aligned frame base
+    lea         rsp, [wk(0)]                 ; reserve wk(0)..wk(WK_NUM-1) below rbp
+    collect_args 5                           ; macro defined elsewhere; per the list above, args end up in r10-r14
+    push        rbx
+
+    mov         ecx, r10d                    ; rcx = img_width (32-bit mov zeroes the upper half)
+    test        rcx, rcx
+    jz          near .return                 ; zero-width image: nothing to convert
+
+    push        rcx                          ; rcx is borrowed for output_row below
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[1][output_row]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[2][output_row]
+
+    pop         rcx                          ; rcx = img_width again
+
+    mov         rsi, r11
+    mov         eax, r14d                    ; rax = num_rows, zero-extended to 64 bits
+    test        eax, eax                     ; 32-bit test so a negative int num_rows sets SF (a 64-bit test never would)
+    jle         near .return                 ; num_rows <= 0: nothing to convert
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rbx, JSAMPROW [rbx]     ; outptr1
+    mov         rdx, JSAMPROW [rdx]     ; outptr2
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop        ; a full vector of columns remains; otherwise fall into .column_ld1
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:                            ; fewer than SIZEOF_YMMWORD columns remain in this row
+    push        rax                     ; rax (row counter) and rdx (outptr2) are live;
+    push        rdx                     ; borrow both as scratch and restore them below
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE (rcx = bytes left)
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, BYTE [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, WORD [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax               ; bytes gathered so far (stale rax bits if neither load above ran)
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMMWORD [rsi+rcx]  ; full 16-byte load, so use the 128-bit size macro
+    vperm2i128  ymmA, ymmA, ymmA, 1      ; swap lanes: bytes gathered so far move to the upper lane
+    vpor        ymmA, ymmA, ymmB         ; three-operand form, as everywhere else in this file
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags alone; one loop-tail subtraction then ends the row
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:                            ; full-width path: three whole vectors of packed 3-byte pixels
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH  ; ymmH=0, used below to zero-extend bytes to words
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:                            ; fewer than SIZEOF_YMMWORD columns remain; rcx counts pixels here
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA              ; pixels gathered so far
+    vperm2i128  ymmF, ymmF, ymmF, 1     ; swap lanes: gathered pixels move to the upper lane
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]  ; VEX xmm load also zeroes ymmA's upper lane
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags alone; one loop-tail subtraction then ends the row
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:                            ; full-width path: four whole vectors of packed 4-byte pixels
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF      ; ymmF=0, used below to zero-extend bytes to words
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF was zero, so each word = (byte of ymmH) << 8
+    vpunpckhbw  ymmH, ymmH, ymmH      ; each word = byte duplicated; the shift keeps one copy
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1              ; ymm6=RO
+    vpunpcklwd  ymm1, ymm1, ymm3        ; interleave RO/GO words, low half of each lane
+    vpunpckhwd  ymm6, ymm6, ymm3        ; interleave RO/GO words, high half of each lane
+    vmovdqa     ymm7, ymm1              ; second copy of the pairs for the Cb multiply
+    vmovdqa     ymm4, ymm6              ; (ymm4=BE was saved to wk(2) above)
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1        ; zero, so the unpack puts B in the upper word of each dword
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1        ; add the B*FIX(0.500) term
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5        ; add the PD_ONEHALFM1_CJ constant (defined elsewhere)
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0              ; ymm6=RE
+    vpunpcklwd  ymm0, ymm0, ymm2        ; interleave RE/GE words, low half of each lane
+    vpunpckhwd  ymm6, ymm6, ymm2        ; interleave RE/GE words, high half of each lane
+    vmovdqa     ymm5, ymm0              ; second copy of the pairs for the Cb multiply
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0        ; zero, so the unpack puts B in the upper word of each dword
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0        ; add the B*FIX(0.500) term
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1        ; add the PD_ONEHALFM1_CJ constant
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT    ; odd samples go to the high byte of each word
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0              ; ymm4=BO
+    vpunpcklwd  ymm0, ymm0, ymm3        ; interleave BO/GO words, low half of each lane
+    vpunpckhwd  ymm4, ymm4, ymm3        ; interleave BO/GO words, high half of each lane
+    vmovdqa     ymm7, ymm0              ; second copy of the pairs for the Cr multiply
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]  ; add ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]  ; add ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpaddd      ymm0, ymm0, ymm3        ; add the PD_ONEHALF constant
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3        ; zero, so the unpack puts R in the upper word of each dword
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3        ; add the R*FIX(0.500) term
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1        ; add the PD_ONEHALFM1_CJ constant
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6              ; ymm4=BE
+    vpunpcklwd  ymm6, ymm6, ymm2        ; interleave BE/GE words, low half of each lane
+    vpunpckhwd  ymm4, ymm4, ymm2        ; interleave BE/GE words, high half of each lane
+    vmovdqa     ymm1, ymm6              ; second copy of the pairs for the Cr multiply
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]  ; add REL*FIX(0.299)+GEL*FIX(0.337)
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]  ; add REH*FIX(0.299)+GEH*FIX(0.337)
+    vpaddd      ymm6, ymm6, ymm2        ; add the PD_ONEHALF constant
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT    ; odd samples go to the high byte of each word
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2        ; zero, so the unpack puts R in the upper word of each dword
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2        ; add the R*FIX(0.500) term
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0        ; add the PD_ONEHALFM1_CJ constant
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT    ; odd samples go to the high byte of each word
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_YMMWORD           ; one vector's worth of columns is done
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
+    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop                   ; another full vector of columns remains
+    test        rcx, rcx
+    jnz         near .column_ld1                   ; a partial vector remains: piecewise load
+
+    pop         rcx                     ; col
+    pop         rsi                     ; row-array pointers pushed at .rowloop
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW  ; next row pointer in each output component
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop              ; loop while rows remain
+
+.return:
+    pop         rbx
+    vzeroupper                          ; zero the upper halves of the ymm registers before returning
+    uncollect_args 5                    ; macro defined elsewhere; presumably undoes collect_args
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- unaligned rsp stored at [rbp] by the prologue
+    pop         rbp                     ; restore the caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000..b1486c0
--- /dev/null
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,485 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        rbp                          ; save caller's rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4                  ; move down first so the aligned rsp ends up below the saved rbp
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; stash the unaligned rsp; .return reloads it with pop rsp
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; reserve the wk[WK_NUM] scratch xmmwords below rbp
+    collect_args 5                           ; macro defined outside this file; presumably leaves the 5 args in r10-r14 as listed above
+    push        rbx                          ; rbx is used as the outptr1 cursor below
+
+    mov         ecx, r10d                    ; ecx = img_width (a 32-bit write zero-extends into rcx)
+    test        rcx, rcx
+    jz          near .return                 ; img_width == 0: nothing to convert
+
+    push        rcx                          ; keep img_width while rcx is borrowed for output_row
+
+    mov         rsi, r12                     ; rsi = output_buf
+    mov         ecx, r13d                    ; ecx = output_row
+    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0] (receives Y)
+    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]  ; rbx = output_buf[1] (receives Cb)
+    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]  ; rdx = output_buf[2] (receives Cr)
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]
+
+    pop         rcx                          ; rcx = img_width again
+
+    mov         rsi, r11                     ; rsi = input_buf
+    mov         eax, r14d                    ; eax = num_rows (upper half of rax is zeroed)
+    test        rax, rax
+    jle         near .return                 ; NOTE(review): rax was zero-extended, so only num_rows == 0 exits here; a negative count is not caught
+.rowloop:
+    push        rdx                     ; save the four row-pointer cursors and the column
+    push        rbx                     ;   count; they are popped in reverse order once
+    push        rdi                     ;   this row has been converted
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rbx, JSAMPROW [rbx]     ; outptr1
+    mov         rdx, JSAMPROW [rdx]     ; outptr2
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; a full group of columns is available; otherwise fall into .column_ld1
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax                     ; rax/rdx are borrowed as scratch for the sub-dword tail
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE (rcx = remaining input bytes)
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2       ; NOTE(review): rax is not loaded on this path; its stale bits end up above the valid bytes
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, BYTE [rsi+rcx]     ; peel the last byte
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, WORD [rsi+rcx]     ; peel the preceding word
+    shl         rax, WORD_BIT           ; word goes below what rax already holds
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax               ; xmmA = tail bytes gathered so far
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD      ; make room below the bytes already gathered
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD     ; make room below the bytes already gathered
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA              ; gathered tail becomes the second xmmword
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD     ; so the sub after the conversion leaves rcx == 0
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD     ; mov does not disturb the flags set by test
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA              ; gathered tail becomes the third xmmword
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16   ; rcx = remaining pixels; peel them off bit by bit
+    jz          short .column_ld2       ; NOTE(review): xmmA is not loaded on this path
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]  ; last single pixel
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]  ; preceding pair of pixels
+    pslldq      xmmA, SIZEOF_MMWORD     ; make room below what xmmA already holds
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA              ; gathered tail moves up to the second xmmword
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]  ; preceding four pixels
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD     ; mov keeps the flags from test; the sub after the conversion then leaves rcx == 0
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA              ; gathered tail moves up to the third
+    movdqa      xmmH, xmmE              ;   and fourth xmmwords
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [rbx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [rdx], xmm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_XMMWORD                ; one group of columns consumed
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
+    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; another full group remains
+    test        rcx, rcx
+    jnz         near .column_ld1        ; fewer than SIZEOF_XMMWORD columns left: partial load
+
+    pop         rcx                     ; col
+    pop         rsi                     ; restore the row-pointer cursors saved at .rowloop
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[0]
+    add         rbx, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[1]
+    add         rdx, byte SIZEOF_JSAMPROW  ; next row pointer in output_buf[2]
+    dec         rax                        ; num_rows
+    jg          near .rowloop              ; loop while rows remain
+
+.return:
+    pop         rbx                     ; matches the push rbx in the prologue
+    uncollect_args 5                    ; macro defined outside this file; presumably undoes collect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- unaligned rsp stored by the prologue (address of the saved rbp)
+    pop         rbp                     ; restore caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jccolor-avx2.asm b/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000..9642eb5
--- /dev/null
+++ b/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,123 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337  ; 8 x (0.299, 0.337) word pairs = 32 bytes
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250  ; 8 x (0.114, 0.250) word pairs
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331  ; 8 x (-0.168, -0.331) word pairs
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418  ; 8 x (-0.081, -0.418) word pairs
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)  ; 8 dwords: (half - 1) plus CENTERJSAMPLE at SCALEBITS scale
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))  ; 8 dwords: one half at SCALEBITS scale
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2 jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/simd/jccolor-sse2.asm b/simd/x86_64/jccolor-sse2.asm
similarity index 70%
copy from simd/jccolor-sse2.asm
copy to simd/x86_64/jccolor-sse2.asm
index 13124d1..7dc5bb7 100644
--- a/simd/jccolor-sse2.asm
+++ b/simd/x86_64/jccolor-sse2.asm
@@ -1,7 +1,7 @@
 ;
-; jccolor.asm - colorspace conversion (SSE2)
+; jccolor.asm - colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,38 +19,39 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
 
 EXTN(jconst_rgb_ycc_convert_sse2):
 
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        64
 
 %include "jccolext-sse2.asm"
 
diff --git a/simd/x86_64/jcgray-avx2.asm b/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000..49e61ee
--- /dev/null
+++ b/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,115 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337  ; 8 x (0.299, 0.337) word pairs = 32 bytes
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250  ; 8 x (0.114, 0.250) word pairs
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))  ; 8 dwords: one half at SCALEBITS scale
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2 jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/simd/jcgray-sse2.asm b/simd/x86_64/jcgray-sse2.asm
similarity index 78%
copy from simd/jcgray-sse2.asm
copy to simd/x86_64/jcgray-sse2.asm
index 5b0b466..13becee 100644
--- a/simd/jcgray-sse2.asm
+++ b/simd/x86_64/jcgray-sse2.asm
@@ -1,7 +1,7 @@
 ;
-; jcgray.asm - grayscale colorspace conversion (SSE2)
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
-; Copyright (C) 2011, D. R. Commander.
+; Copyright (C) 2011, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -19,31 +19,31 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
 
 EXTN(jconst_rgb_gray_convert_sse2):
 
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        64
 
 %include "jcgryext-sse2.asm"
 
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000..79e2aa0
--- /dev/null
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,439 @@
+;
+; jcgryext-avx2.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; address of ymmword wk[i]; wk[] ends at rbp
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        rbp                          ; save caller's rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4                  ; step below the saved rbp before aligning
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; round rsp down to a 256-bit boundary
+    mov         [rsp], rax                   ; stash the unaligned rsp in the aligned slot
+    mov         rbp, rsp                     ; rbp = aligned frame base ([rbp] = stashed rsp)
+    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
+    collect_args 5                           ; macro from an include; args are read from r10-r14 below
+    push        rbx
+
+    mov         ecx, r10d                    ; ecx = img_width (upper half of rcx cleared)
+    test        rcx, rcx
+    jz          near .return                 ; zero width: nothing to convert
+
+    push        rcx                          ; keep img_width while rcx is borrowed
+
+    mov         rsi, r12                     ; rsi = output_buf
+    mov         ecx, r13d                    ; ecx = output_row
+    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]             ; rdi = &output_buf[0][output_row]
+
+    pop         rcx                          ; rcx = img_width again
+
+    mov         rsi, r11                     ; rsi = input_buf
+    mov         eax, r14d                    ; eax = num_rows
+    test        rax, rax
+    jle         near .return                 ; no rows to process
+.rowloop:
+    push        rdi                     ; save output row-pointer cursor
+    push        rsi                     ; save input row-pointer cursor
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr0
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop        ; a full YMMWORD of columns remains: full-width loads
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:                            ; partial width: gather the remaining input piecewise
+    push        rax                     ; rax/rdx serve as scratch here
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; rcx *= 3 (imul ecx,RGB_PIXELSIZE): remaining input bytes
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, BYTE [rsi+rcx]     ; trailing single byte
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, WORD [rsi+rcx]     ; trailing word
+    shl         rax, WORD_BIT
+    or          rax, rdx                ; word goes below the byte loaded earlier
+.column_ld4:
+    vmovd       xmmA, eax               ; move the gathered bytes into the vector register
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1     ; swap the 128-bit lanes of ymmA
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags from test intact for the jz
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:                            ; partial width: gather the remaining pixels piecewise
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1     ; swap the 128-bit lanes of ymmF
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags from test intact for the jz
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH      ; data bytes land in the high byte of each word
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3        ; add the rounding term before the shift
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2        ; add the rounding term before the shift
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT    ; odd-column Y moves to the high byte of each word
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_YMMWORD           ; one YMMWORD of columns done
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop                   ; another full YMMWORD of columns remains
+    test        rcx, rcx
+    jnz         near .column_ld1                   ; leftover columns: partial-width loads
+
+    pop         rcx                     ; col
+    pop         rsi                     ; input row-pointer cursor
+    pop         rdi                     ; output row-pointer cursor
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW  ; next output row pointer
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper                          ; zero the upper halves of the YMM registers
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned frame base
+    pop         rsp                     ; rsp <- unaligned rsp stashed by the prologue
+    pop         rbp                     ; restore caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000..9c3ae5e
--- /dev/null
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,364 @@
+;
+; jcgryext-sse2.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; address of xmmword wk[i]; wk[] ends at rbp
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        rbp                          ; save caller's rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4                  ; step below the saved rbp before aligning
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; round rsp down to a 128-bit boundary
+    mov         [rsp], rax                   ; stash the unaligned rsp in the aligned slot
+    mov         rbp, rsp                     ; rbp = aligned frame base ([rbp] = stashed rsp)
+    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
+    collect_args 5                           ; macro from an include; args are read from r10-r14 below
+    push        rbx
+
+    mov         ecx, r10d                    ; ecx = img_width (upper half of rcx cleared)
+    test        rcx, rcx
+    jz          near .return                 ; zero width: nothing to convert
+
+    push        rcx                          ; keep img_width while rcx is borrowed
+
+    mov         rsi, r12                     ; rsi = output_buf
+    mov         ecx, r13d                    ; ecx = output_row
+    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]             ; rdi = &output_buf[0][output_row]
+
+    pop         rcx                          ; rcx = img_width again
+
+    mov         rsi, r11                     ; rsi = input_buf
+    mov         eax, r14d                    ; eax = num_rows
+    test        rax, rax
+    jle         near .return                 ; no rows to process
+.rowloop:
+    push        rdi                     ; save output row-pointer cursor
+    push        rsi                     ; save input row-pointer cursor
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr0
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop        ; a full XMMWORD of columns remains: full-width loads
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:                            ; partial width: gather the remaining input piecewise
+    push        rax                     ; rax/rdx serve as scratch here
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; rcx *= 3 (imul ecx,RGB_PIXELSIZE): remaining input bytes
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, BYTE [rsi+rcx]     ; trailing single byte
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, WORD [rsi+rcx]     ; trailing word
+    shl         rax, WORD_BIT
+    or          rax, rdx                ; word goes below the byte loaded earlier
+.column_ld4:
+    movd        xmmA, eax               ; move the gathered bytes into the vector register
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD     ; rcx = column count consumed by the pass below
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact for the jz
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:                            ; partial width: gather the remaining pixels piecewise
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact for the jz
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH      ; data bytes land in the high byte of each word
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3              ; add the rounding term before the shift
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2              ; add the rounding term before the shift
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT          ; odd-column Y moves to the high byte of each word
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_XMMWORD                ; one XMMWORD of columns done
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop                        ; another full XMMWORD of columns remains
+    test        rcx, rcx
+    jnz         near .column_ld1                        ; leftover columns: partial-width loads
+
+    pop         rcx                     ; col
+    pop         rsi                     ; input row-pointer cursor
+    pop         rdi                     ; output row-pointer cursor
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW  ; next output row pointer
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned frame base
+    pop         rsp                     ; rsp <- unaligned rsp stashed by the prologue
+    pop         rbp                     ; restore caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000..1b091ad
--- /dev/null
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,348 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based directly on jchuff.c; see jchuff.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+%include "jpeg_nbits_table.inc"
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; These macros perform the same task as the emit_bits() function in the
+; original libjpeg code.  In addition to reducing overhead by explicitly
+; inlining the code, additional performance is achieved by taking into
+; account the size of the bit buffer and waiting until it is almost full
+; before emptying it.  This mostly benefits 64-bit platforms, since 6
+; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+
+%macro EMIT_BYTE 0                      ; store the 8 oldest buffered bits as one byte; clobbers rcx, rdx
+    sub         put_bits, 8             ; put_bits -= 8;
+    mov         rdx, put_buffer         ; shift a copy so put_buffer itself is left intact
+    mov         ecx, put_bits           ; variable shift count has to live in cl
+    shr         rdx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
+    mov         byte [buffer], dl       ; *buffer++ = c;
+    add         buffer, 1
+    cmp         dl, 0xFF                ; need to stuff a zero byte?
+    jne         %%.EMIT_BYTE_END        ; no, byte was not 0xFF
+    mov         byte [buffer], 0        ; *buffer++ = 0;
+    add         buffer, 1
+%%.EMIT_BYTE_END:
+%endmacro
+
+%macro PUT_BITS 1                       ; append the code operand; caller must preload size in ecx
+    add         put_bits, ecx           ; put_bits += size;
+    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
+    or          put_buffer, %1          ; put_buffer |= code;
+%endmacro
+
+%macro CHECKBUF31 0                     ; drain 4 bytes once 32 or more bits are pending; uses rcx, rdx
+    cmp         put_bits, 32            ; if (put_bits > 31) {
+    jl          %%.CHECKBUF31_END       ; fewer than 32 bits pending: skip the flush
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+%%.CHECKBUF31_END:                      ; }
+%endmacro
+
+%macro CHECKBUF47 0                     ; drain 6 bytes once 48 or more bits are pending; uses rcx, rdx
+    cmp         put_bits, 48            ; if (put_bits > 47) {
+    jl          %%.CHECKBUF47_END       ; fewer than 48 bits pending: skip the flush
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+    EMIT_BYTE
+%%.CHECKBUF47_END:                      ; }
+%endmacro
+
+%macro EMIT_BITS 2                      ; (code, size): flush if nearly full, then append; uses rcx, rdx
+    CHECKBUF47                          ; drain 6 bytes first when more than 47 bits are pending
+    mov         ecx, %2                 ; ecx = size, the shift count PUT_BITS expects
+    PUT_BITS    %1                      ; put_buffer = (put_buffer << size) | code;
+%endmacro
+
+%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
+    pxor        xmm8, xmm8              ; __m128i neg = _mm_setzero_si128();  (words 0-7)
+    pxor        xmm9, xmm9              ; __m128i neg = _mm_setzero_si128();  (words 8-15)
+    pxor        xmm10, xmm10            ; __m128i neg = _mm_setzero_si128();  (words 16-23)
+    pxor        xmm11, xmm11            ; __m128i neg = _mm_setzero_si128();  (words 24-31)
+    pinsrw      %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
+    pinsrw      %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
+    pinsrw      %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
+    pinsrw      %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
+    pinsrw      %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
+    pinsrw      %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
+    pinsrw      %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
+    pinsrw      %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
+    pinsrw      %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
+    pinsrw      %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
+    pinsrw      %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
+    pinsrw      %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
+    pinsrw      %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
+    pinsrw      %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
+    pinsrw      %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
+    pinsrw      %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
+    pinsrw      %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
+    pinsrw      %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
+    pinsrw      %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
+    pinsrw      %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
+    pinsrw      %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
+    pinsrw      %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
+    pinsrw      %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
+    pinsrw      %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
+    pinsrw      %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
+    pinsrw      %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
+    pinsrw      %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
+    pinsrw      %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
+    pinsrw      %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
+    pinsrw      %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
+    pinsrw      %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
+%if %1 != 32
+    pinsrw      %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+%else
+    pinsrw      %37, ebx, 7             ; xmm_shadow[31] = bx (caller zeroes ebx; last slot is padding)
+%endif
+    pcmpgtw     xmm8, %34               ; neg = _mm_cmpgt_epi16(neg, x1);  0xFFFF where x1 < 0
+    pcmpgtw     xmm9, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw     xmm10, %36              ; neg = _mm_cmpgt_epi16(neg, x1);
+    pcmpgtw     xmm11, %37              ; neg = _mm_cmpgt_epi16(neg, x1);
+    paddw       %34, xmm8               ; x1 = _mm_add_epi16(x1, neg);    x1 - 1 where negative
+    paddw       %35, xmm9               ; x1 = _mm_add_epi16(x1, neg);
+    paddw       %36, xmm10              ; x1 = _mm_add_epi16(x1, neg);
+    paddw       %37, xmm11              ; x1 = _mm_add_epi16(x1, neg);
+    pxor        %34, xmm8               ; x1 = _mm_xor_si128(x1, neg);    x1 is now abs(coef)
+    pxor        %35, xmm9               ; x1 = _mm_xor_si128(x1, neg);
+    pxor        %36, xmm10              ; x1 = _mm_xor_si128(x1, neg);
+    pxor        %37, xmm11              ; x1 = _mm_xor_si128(x1, neg);
+    pxor        xmm8, %34               ; neg = _mm_xor_si128(neg, x1);   coef, or ~abs(coef) if negative
+    pxor        xmm9, %35               ; neg = _mm_xor_si128(neg, x1);
+    pxor        xmm10, %36              ; neg = _mm_xor_si128(neg, x1);
+    pxor        xmm11, %37              ; neg = _mm_xor_si128(neg, x1);
+    movdqa      XMMWORD [t1 + %1 * SIZEOF_WORD], %34           ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
+    movdqa      XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35     ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
+    movdqa      XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36    ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
+    movdqa      XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37    ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
+    movdqa      XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8          ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
+    movdqa      XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9    ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
+    movdqa      XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
+    movdqa      XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+
+; r10 = working_state *state
+; r11 = JOCTET *buffer
+; r12 = JCOEFPTR block
+; r13d = int last_dc_val
+; r14 = c_derived_tbl *dctbl
+; r15 = c_derived_tbl *actbl
+
+%define t1          rbp - (DCTSIZE2 * SIZEOF_WORD)
+%define t2          t1 - (DCTSIZE2 * SIZEOF_WORD)
+%define put_buffer  r8
+%define put_bits    r9d
+%define buffer      rax
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4                  ; step below the saved rbp so the aligned slot cannot overlap it
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; stash the pre-alignment rsp; the epilogue pops it back
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [t2]                    ; reserve the t1 and t2 work arrays below rbp
+    push_xmm    4
+    collect_args 6
+    push        rbx
+
+    mov         buffer, r11                  ; r11 is now scratch
+
+    mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
+    mov         put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
+    push        r10                          ; r10 is now scratch
+
+    ; Encode the DC coefficient difference per section F.1.2.1
+    movsx       edi, word [r12]         ; temp = temp2 = block[0] - last_dc_val;
+    sub         edi, r13d               ; r13 is not used anymore
+    mov         ebx, edi                ; ebx = temp2
+
+    ; This is a well-known technique for obtaining the absolute value
+    ; without a branch.  It is derived from an assembly language technique
+    ; presented in "How to Optimize for the Pentium Processors",
+    ; Copyright (c) 1996, 1997 by Agner Fog.
+    mov         esi, edi
+    sar         esi, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    xor         edi, esi                ; temp ^= temp3;
+    sub         edi, esi                ; temp -= temp3;
+
+    ; For a negative input, want temp2 = bitwise complement of abs(input)
+    ; This code assumes we are on a two's complement machine
+    add         ebx, esi                ; temp2 += temp3;
+
+    ; Find the number of bits needed for the magnitude of the coefficient
+    lea         r11, [rel jpeg_nbits_table]
+    movzx       rdi, byte [r11 + rdi]         ; nbits = JPEG_NBITS(temp);
+    ; Emit the Huffman-coded symbol for the number of bits
+    mov         r11d,  INT [r14 + rdi * 4]    ; code = dctbl->ehufco[nbits];
+    movzx       esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
+    EMIT_BITS   r11, esi                      ; EMIT_BITS(code, size)
+
+    ; Mask off any extra bits in code
+    mov         esi, 1
+    mov         ecx, edi
+    shl         esi, cl
+    dec         esi
+    and         ebx, esi                ; temp2 &= (((JLONG)1)<<nbits) - 1;
+
+    ; Emit that number of bits of the value, if positive,
+    ; or the complement of its magnitude, if negative.
+    EMIT_BITS   rbx, edi                ; EMIT_BITS(temp2, nbits)
+
+    ; Prepare data
+    xor         ebx, ebx                ; ebx = 0: kloop_prepare inserts it as the final padding word
+    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
+                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
+                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
+                   xmm0, xmm1, xmm2, xmm3
+    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
+                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
+                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
+                   xmm4, xmm5, xmm6, xmm7
+
+    pxor        xmm8, xmm8
+    pcmpeqw     xmm0, xmm8              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
+    pcmpeqw     xmm1, xmm8              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
+    pcmpeqw     xmm2, xmm8              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
+    pcmpeqw     xmm3, xmm8              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
+    pcmpeqw     xmm4, xmm8              ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
+    pcmpeqw     xmm5, xmm8              ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
+    pcmpeqw     xmm6, xmm8              ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
+    pcmpeqw     xmm7, xmm8              ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
+    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
+    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
+    packsswb    xmm4, xmm5              ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
+    packsswb    xmm6, xmm7              ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
+    pmovmskb    r11d, xmm0              ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
+    pmovmskb    r12d, xmm2              ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
+    pmovmskb    r13d, xmm4              ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
+    pmovmskb    r14d, xmm6              ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
+    shl         r12, 16                 ; mask bits 16-31
+    shl         r14, 16                 ; mask bits 48-63, before the shift by 32 below
+    or          r11, r12                ; r11 = mask bits 0-31
+    or          r13, r14
+    shl         r13, 32                 ; r13 = mask bits 32-63
+    or          r11, r13                ; r11 = 64-bit mask, one bit per zero word
+    not         r11                     ; index = ~index;
+
+    ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
+    ;jmp .EFN
+
+    mov         r13d,  INT [r15 + 240 * 4]     ; code_0xf0 = actbl->ehufco[0xf0];
+    movzx       r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
+    lea         rsi, [t1]                    ; rsi = &t1[k], starting at k = 0
+.BLOOP:
+    bsf         r12, r11                     ; r = __builtin_ctzl(index);
+    jz          .ELOOP                       ; index == 0: no nonzero words remain
+    mov         rcx, r12                     ; shift count has to be in cl
+    lea         rsi, [rsi+r12*2]             ; k += r;
+    shr         r11, cl                      ; index >>= r;
+    movzx       rdi, word [rsi]              ; temp = t1[k];
+    lea         rbx, [rel jpeg_nbits_table]
+    movzx       rdi, byte [rbx + rdi]        ; nbits = JPEG_NBITS(temp);
+.BRLOOP:
+    cmp         r12, 16                 ; while (r > 15) {
+    jl          .ERLOOP
+    EMIT_BITS   r13, r14d               ; EMIT_BITS(code_0xf0, size_0xf0)
+    sub         r12, 16                 ; r -= 16;
+    jmp         .BRLOOP
+.ERLOOP:
+    ; Emit Huffman symbol for run length / number of bits
+    CHECKBUF31  ; uses rcx, rdx
+
+    shl         r12, 4                        ; temp3 = (r << 4) + nbits;
+    add         r12, rdi
+    mov         ebx,  INT [r15 + r12 * 4]     ; code = actbl->ehufco[temp3];
+    movzx       ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
+    PUT_BITS    rbx                           ; PUT_BITS(code, size)
+
+    ;EMIT_CODE(code, size)
+
+    movsx       ebx, word [rsi-DCTSIZE2*2]    ; temp2 = t2[k];
+    ; Mask off any extra bits in code
+    mov         rcx, rdi                ; cl = nbits, for the mask and for PUT_BITS below
+    mov         rdx, 1
+    shl         rdx, cl
+    dec         rdx
+    and         rbx, rdx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    PUT_BITS    rbx                     ; PUT_BITS(temp2, nbits)
+
+    shr         r11, 1                  ; index >>= 1;
+    add         rsi, 2                  ; ++k;
+    jmp         .BLOOP
+.ELOOP:
+    ; If the last coef(s) were zero, emit an end-of-block code
+    lea         rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
+    cmp         rdi, rsi                      ; if (r > 0) {
+    je          .EFN                          ; k reached DCTSIZE2-1: no EOB needed
+    mov         ebx,  INT [r15]               ; code = actbl->ehufco[0];
+    movzx       r12d, byte [r15 + 1024]       ; size = actbl->ehufsi[0];
+    EMIT_BITS   rbx, r12d
+.EFN:
+    pop         r10                     ; r10 = state again
+    ; Save put_buffer & put_bits
+    mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
+    mov         DWORD  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+
+    pop         rbx
+    uncollect_args 6
+    pop_xmm     4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000..9d5a861
--- /dev/null
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,368 @@
+;
+; jcsample-avx2.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably read by collect_args - confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return            ; width_in_blocks == 0: nothing to do
+
+    mov         edx, r10d               ; rdx = image_width
+
+    ; -- expand_right_edge
+
+    push        rcx                     ; keep output_cols for the downsample stage
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx                ; rcx = columns to pad = output_cols*2 - image_width
+    jle         short .expand_end       ; input row is already wide enough
+
+    mov         rax, r11                ; rax = max_v_samp_factor (rows to pad)
+    test        rax, rax
+    jle         short .expand_end
+
+    cld                                 ; rep stosb below must advance rdi upward
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this input row
+    add         rdi, rdx                ; rdi = first column past image_width
+    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate it into the rcx padding columns
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop       ; a full YMMWORD of output columns remains
+
+.columnloop_r24:
+    ; rcx can possibly be 8, 16, 24
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; only 16 more input bytes; upper half of ymm1 is zeroed
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1        ; no second input vector in this case
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]  ; 16 input bytes; upper half of ymm0 is zeroed
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = odd-indexed samples, widened to words
+    vpand       ymm0, ymm0, ymm6        ; ymm0 = even-indexed samples, widened to words
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2        ; sum of each horizontal pair
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 0/1 bias
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1           ; divide by 2
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed within each 128-bit lane
+    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3 puts the lanes back in column order
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24    ; 8, 16 or 24 columns left: take a partial-width path
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably read by collect_args - confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return            ; width_in_blocks == 0: nothing to do
+
+    mov         edx, r10d               ; rdx = image_width
+
+    ; -- expand_right_edge
+
+    push        rcx                     ; keep output_cols for the downsample stage
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx                ; rcx = columns to pad = output_cols*2 - image_width
+    jle         short .expand_end       ; input rows are already wide enough
+
+    mov         rax, r11                ; rax = max_v_samp_factor (rows to pad)
+    test        rax, rax
+    jle         short .expand_end
+
+    cld                                 ; rep stosb below must advance rdi upward
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this input row
+    add         rdi, rdx                ; rdi = first column past image_width
+    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; replicate it into the rcx padding columns
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; copy the low 128 bits into both halves of ymm7
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdi, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop       ; a full YMMWORD of output columns remains
+
+.columnloop_r24:
+    cmp         rcx, 24                 ; rcx is 8, 16 or 24 on this path
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]  ; only 16 more input bytes per row; upper halves zeroed
+    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2        ; no second input vector in this case
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]  ; 16 input bytes per row; upper halves zeroed
+    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD     ; so the sub after .downsample leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6        ; even-indexed samples of row 0, as words
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-indexed samples of row 0, as words
+    vpand       ymm5, ymm1, ymm6        ; same split for row 1
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4        ; horizontal pair sums, row 0
+    vpaddw      ymm1, ymm1, ymm5        ; horizontal pair sums, row 1
+
+    vpand       ymm4, ymm2, ymm6        ; repeat for the second 32 input columns
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1        ; add the two rows: sum of each 2x2 group
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 1/2 bias
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2           ; divide by 4
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed within each 128-bit lane
+    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3 puts the lanes back in column order
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24    ; 8, 16 or 24 columns left: take a partial-width path
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000..1b31536
--- /dev/null
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,331 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably read by collect_args
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d               ; ecx = width_in_blocks
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, r10d               ; edx = image_width
+
+    ; -- expand_right_edge
+
+    push        rcx                     ; keep output_cols for the main loop
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx                ; rcx = output_cols*2 - image_width
+    jle         short .expand_end       ; row is already wide enough
+
+    mov         eax, r11d               ; max_v_samp_factor is an int: read 32 bits
+    test        eax, eax                ; signed 32-bit check of the row count
+    jle         short .expand_end
+
+    cld                                 ; rep stosb must walk upward
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax                     ; al is overwritten below
+    push        rcx                     ; rep stosb counts rcx down to zero
+
+    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this row
+    add         rdi, rdx                ; rdi = one past the last real sample
+    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; copy al into the rcx padding bytes
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
+    dec         rax                     ; one row fewer to pad
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6              ; xmm6 = all bits set
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx                     ; saved: output_cols
+    push        rdi                     ; saved: output_data row pointer
+    push        rsi                     ; saved: input_data row pointer
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop       ; a full output vector is available
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; tail: one input vector
+    pxor        xmm1, xmm1              ; second input vector is all zero
+    mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6              ; low byte of each word (even samples)
+    psrlw       xmm2, BYTE_BIT          ; high byte of each word (odd samples)
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2              ; sum of each horizontal sample pair
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7              ; add the alternating 0/1 bias
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1                 ; divide the pair sum by 2
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1              ; words -> 16 output bytes
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         short .columnloop_r8    ; fewer than 16 output columns remain
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably read by collect_args
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d               ; ecx = width_in_blocks
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return            ; zero output columns: nothing to do
+
+    mov         edx, r10d               ; edx = image_width
+
+    ; -- expand_right_edge
+
+    push        rcx                     ; keep output_cols for the main loop
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx                ; rcx = output_cols*2 - image_width
+    jle         short .expand_end       ; rows are already wide enough
+
+    mov         eax, r11d               ; max_v_samp_factor is an int: read 32 bits
+    test        eax, eax                ; signed 32-bit check of the row count
+    jle         short .expand_end
+
+    cld                                 ; rep stosb must walk upward
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax                     ; al is overwritten below
+    push        rcx                     ; rep stosb counts rcx down to zero
+
+    mov         rdi, JSAMPROW [rsi]     ; rdi = start of this row
+    add         rdi, rdx                ; rdi = one past the last real sample
+    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
+
+    rep stosb                           ; copy al into the rcx padding bytes
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
+    dec         rax                     ; one row fewer to pad
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6              ; xmm6 = all bits set
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx                     ; saved: output_cols
+    push        rdi                     ; saved: output_data row pointer
+    push        rsi                     ; saved: input_data row pointer
+
+    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdi, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop       ; a full output vector is available
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]  ; tail: one vector per row
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2              ; second vector of row 0 is all zero
+    pxor        xmm3, xmm3              ; second vector of row 1 is all zero
+    mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6              ; row 0: low byte of each word
+    psrlw       xmm4, BYTE_BIT          ; row 0: high byte of each word
+    pand        xmm1, xmm6              ; row 1: low byte of each word
+    psrlw       xmm5, BYTE_BIT          ; row 1: high byte of each word
+    paddw       xmm0, xmm4              ; row 0 horizontal pair sums
+    paddw       xmm1, xmm5              ; row 1 horizontal pair sums
+
+    movdqa      xmm4, xmm2              ; same steps for the second vectors
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1              ; row 0 + row 1 = 2x2 block sums
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7              ; add the alternating 1/2 bias
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2                 ; divide the block sum by 4
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2              ; words -> 16 output bytes
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r8     ; fewer than 16 output columns remain
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000..e2b96c7
--- /dev/null
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,497 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = unaligned rsp (-> saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax                   ; keep it at the aligned frame base
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx                     ; ecx is reused for input_row below
+
+    mov         rdi, r11                ; rdi = input_buf
+    mov         ecx, r12d               ; ecx = input_row
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]  ; plane 0 (Y)
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]  ; plane 1 (Cb)
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]  ; plane 2 (Cr)
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; step each plane to input_row
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx                     ; rcx = num_cols again
+
+    mov         rdi, r13                ; rdi = output_buf
+    mov         eax, r14d               ; eax = num_rows (rax zero-extended)
+    test        eax, eax                ; 32-bit test so a negative int is caught
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr0
+    mov         rbx, JSAMPROW [rbx]     ; inptr1
+    mov         rdx, JSAMPROW [rdx]     ; inptr2
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0        ; ymm0 = all bits set
+    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7 = all bits set
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7        ; ymm2=CbE-128 (0xFF80 is -128 as a word)
+    vpaddw      ymm3, ymm5, ymm7        ; ymm3=CbO-128
+    vpaddw      ymm6, ymm0, ymm7        ; ymm6=CrE-128
+    vpaddw      ymm7, ymm1, ymm7        ; ymm7=CrO-128
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3             ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6             ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7             ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]     ; +1 before the shift = rounding
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3             ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6             ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7             ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4        ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5        ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6             ; interleave Cb/Cr words for vpmaddwd
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]  ; round before the SCALEBITS shift
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [rsi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer than 32 pixels left: partial store
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD  ; rcx now counts output bytes
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF                  ; remaining bytes now come from ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD                  ; remaining bytes now come from ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31          ; target is the next instruction
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; swap lanes: upper half into xmmA
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16                 ; bring the next byte down into al
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         BYTE [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64      ; fewer than 32 pixels left: partial store
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2  ; rcx counts pixels (4 bytes each)
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC                  ; shift the remaining vectors down
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD                  ; remaining pixels now come from ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1         ; swap lanes: upper half into xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx                     ; restore in reverse order of .rowloop
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; next Y row pointer
+    add         rbx, byte SIZEOF_JSAMPROW  ; next Cb row pointer
+    add         rdx, byte SIZEOF_JSAMPROW  ; next Cr row pointer
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000..a94954b
--- /dev/null
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,440 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after push (-> saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; kept for "pop rsp" at .return
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; rsp = &wk[0] (scratch below rbp)
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return            ; zero-width image: nothing to do
+
+    push        rcx                     ; save num_cols; rcx is reused below
+
+    mov         rdi, r11                ; input_buf
+    mov         ecx, r12d               ; input_row
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; &input_buf[0][input_row]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &input_buf[1][input_row]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &input_buf[2][input_row]
+
+    pop         rcx                     ; rcx = num_cols
+
+    mov         rdi, r13                ; output_buf
+    mov         eax, r14d               ; num_rows (zero-extended into rax)
+    test        eax, eax                ; test as 32-bit int: rax is never < 0
+    jle         near .return            ; num_rows <= 0: nothing to do
+.rowloop:
+    push        rax                     ; remaining num_rows
+    push        rdi                     ; output_buf row cursor
+    push        rdx                     ; input_buf[2] row cursor
+    push        rbx                     ; input_buf[1] row cursor
+    push        rsi                     ; input_buf[0] row cursor
+    push        rcx                     ; col
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr0
+    mov         rbx, JSAMPROW [rbx]     ; inptr1
+    mov         rdx, JSAMPROW [rdx]     ; inptr2
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+    ; One pass converts one XMMWORD of Y/Cb/Cr samples (movdqa: aligned loads).
+    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4              ; xmm4=all ones
+    pcmpeqw     xmm7, xmm7              ; xmm7=all ones
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7              ; CbE += 0xFF80 (-128 as a signed word)
+    paddw       xmm5, xmm7              ; CbO += 0xFF80
+    paddw       xmm0, xmm7              ; CrE += 0xFF80
+    paddw       xmm1, xmm7              ; CrO += 0xFF80
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    paddw       xmm5, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    paddw       xmm1, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2              ; first "+ Cb" term (even)
+    paddw       xmm5, xmm3              ; first "+ Cb" term (odd)
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2              ; xmm4=CbE (copy for the high half)
+    movdqa      xmm5, xmm3              ; xmm5=CbO (copy for the high half)
+    punpcklwd   xmm2, xmm6              ; (CbE,CrE) word pairs, low half
+    punpckhwd   xmm4, xmm6              ; (CbE,CrE) word pairs, high half
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]  ; Cb*-FIX(0.344)+Cr*FIX(0.285)
+    pmaddwd     xmm4, [rel PW_MF0344_F0285]  ; (dword results)
+    punpcklwd   xmm3, xmm7              ; (CbO,CrO) word pairs, low half
+    punpckhwd   xmm5, xmm7              ; (CbO,CrO) word pairs, high half
+    pmaddwd     xmm3, [rel PW_MF0344_F0285]  ; Cb*-FIX(0.344)+Cr*FIX(0.285)
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]  ; (dword results)
+
+    paddd       xmm2, [rel PD_ONEHALF]  ; round before dropping SCALEBITS
+    paddd       xmm4, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [rel PD_ONEHALF]  ; round before dropping SCALEBITS
+    paddd       xmm5, [rel PD_ONEHALF]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+    ; NOTE(review): xmmA..xmmH are aliases defined outside this file - confirm.
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD    ; fewer columns left than one pass?
+    jb          short .column_st32          ; yes: partial store of the tail
+
+    test        rdi, SIZEOF_XMMWORD-1       ; is outptr XMMWORD-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow               ; row finished exactly
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD  ; rcx now counts output bytes
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF                  ; remaining bytes come from xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD                  ; remaining bytes come from xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD         ; drop the bytes just stored
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD          ; drop the bytes just stored
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA                   ; eax = lowest 4 bytes of xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16                     ; next pending byte moves into al
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         BYTE [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+    ; The fourth (X) byte of every pixel is a constant filler: 0xFF or 0x00.
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD    ; fewer columns left than one pass?
+    jb          short .column_st32          ; yes: partial store of the tail
+
+    test        rdi, SIZEOF_XMMWORD-1       ; is outptr XMMWORD-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow               ; row finished exactly
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2  ; rcx counts pixels in this branch
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC                  ; remaining pixels: xmmC then xmmH
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD                  ; remaining pixels come from xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4    ; drop the two pixels just stored
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx                     ; col (num_cols for the next row)
+    pop         rsi                     ; input_buf[0] row cursor
+    pop         rbx                     ; input_buf[1] row cursor
+    pop         rdx                     ; input_buf[2] row cursor
+    pop         rdi                     ; output_buf row cursor
+    pop         rax                     ; remaining num_rows
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; advance each cursor by one row
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- value saved at [rbp] (-> saved rbp)
+    pop         rbp                     ; restore the caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdcolor-avx2.asm b/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000..84f4aa8
--- /dev/null
+++ b/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,120 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16           ; FIX(x) = x scaled by 2^SCALEBITS
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+; Each table entry below is 32 bytes (16 words or 8 dwords).
+PW_F0402        times 16 dw  F_0_402            ; FIX(0.40200) in every word
+PW_MF0228       times 16 dw -F_0_228            ; -FIX(0.22800) in every word
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285   ; (-FIX(0.344), FIX(0.285)) pairs
+PW_ONE          times 16 dw  1                  ; 1 in every word
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)  ; 0.5 at SCALEBITS precision
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+; First pass: the RGB_* macros are used as they stand (no override in this file).
+%include "jdcolext-avx2.asm"
+; EXT_RGB layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_RGBX layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_BGR layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_BGRX layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_XBGR layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+; EXT_XRGB layout: template re-included with RGB_* and the entry name remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2 jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/simd/jdcolor-sse2.asm b/simd/x86_64/jdcolor-sse2.asm
similarity index 79%
copy from simd/jdcolor-sse2.asm
copy to simd/x86_64/jdcolor-sse2.asm
index 7ff5d05..694a44f 100644
--- a/simd/jdcolor-sse2.asm
+++ b/simd/x86_64/jdcolor-sse2.asm
@@ -1,8 +1,8 @@
 ;
-; jdcolor.asm - colorspace conversion (SSE2)
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
 
 EXTN(jconst_ycc_rgb_convert_sse2):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 8 dw -F_0_228
 PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        64
 
 %include "jdcolext-sse2.asm"
 
diff --git a/simd/x86_64/jdmerge-avx2.asm b/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000..1f97c75
--- /dev/null
+++ b/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,126 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16           ; FIX(x) = x scaled by 2^SCALEBITS
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+; Each table entry below is 32 bytes (16 words or 8 dwords).
+PW_F0402        times 16 dw  F_0_402            ; FIX(0.40200) in every word
+PW_MF0228       times 16 dw -F_0_228            ; -FIX(0.22800) in every word
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285   ; (-FIX(0.344), FIX(0.285)) pairs
+PW_ONE          times 16 dw  1                  ; 1 in every word
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)  ; 0.5 at SCALEBITS precision
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+; First pass: the RGB_* macros are used as they stand (no override in this file).
+%include "jdmrgext-avx2.asm"
+; EXT_RGB layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGB_RED
+%define RGB_GREEN EXT_RGB_GREEN
+%define RGB_BLUE EXT_RGB_BLUE
+%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_RGBX layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_RGBX_RED
+%define RGB_GREEN EXT_RGBX_GREEN
+%define RGB_BLUE EXT_RGBX_BLUE
+%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_BGR layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGR_RED
+%define RGB_GREEN EXT_BGR_GREEN
+%define RGB_BLUE EXT_BGR_BLUE
+%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_BGRX layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_BGRX_RED
+%define RGB_GREEN EXT_BGRX_GREEN
+%define RGB_BLUE EXT_BGRX_BLUE
+%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_XBGR layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XBGR_RED
+%define RGB_GREEN EXT_XBGR_GREEN
+%define RGB_BLUE EXT_XBGR_BLUE
+%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+; EXT_XRGB layout: template re-included with RGB_* and both entry names remapped.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED EXT_XRGB_RED
+%define RGB_GREEN EXT_XRGB_GREEN
+%define RGB_BLUE EXT_XRGB_BLUE
+%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/simd/jdmerge-sse2.asm b/simd/x86_64/jdmerge-sse2.asm
similarity index 81%
copy from simd/jdmerge-sse2.asm
copy to simd/x86_64/jdmerge-sse2.asm
index 236de5a..e08ffcf 100644
--- a/simd/jdmerge-sse2.asm
+++ b/simd/x86_64/jdmerge-sse2.asm
@@ -1,8 +1,8 @@
 ;
-; jdmerge.asm - merged upsampling/color conversion (SSE2)
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
+; Copyright (C) 2009, 2016, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,21 +20,21 @@
 
 ; --------------------------------------------------------------------------
 
-%define SCALEBITS       16
+%define SCALEBITS  16
 
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_CONST
+    SECTION     SEG_CONST
 
-        alignz  16
-        global  EXTN(jconst_merged_upsample_sse2)
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
 
 EXTN(jconst_merged_upsample_sse2):
 
@@ -42,13 +42,13 @@
 PW_MF0228       times 8 dw -F_0_228
 PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
 PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
 
-        alignz  16
+    alignz      32
 
 ; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
+    SECTION     SEG_TEXT
+    BITS        64
 
 %include "jdmrgext-sse2.asm"
 
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000..04e8a94
--- /dev/null
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,595 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after push (-> saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax                   ; keep unaligned rsp at frame base
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; reserve WK_NUM ymmword slots
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return            ; output_width == 0: nothing to do
+
+    push        rcx                     ; save col; rcx is reused as row index
+
+    mov         rdi, r11                ; rdi = input_buf
+    mov         ecx, r12d               ; rcx = in_row_group_ctr
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13                ; rdi = output_buf
+    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdi, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    vmovdqu     ymm6, YMMWORD [rbx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [rdx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [rel PW_MF0228]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [rel PW_F0402]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [rel PW_ONE]
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpsraw      ymm6, ymm6, 1                ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [rel PW_ONE]
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpsraw      ymm7, ymm7, 1                ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5             ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1             ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3             ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6        ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7        ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm6, ymm6, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm7, ymm7, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpaddd      ymm6, ymm6, [rel PD_ONEHALF]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm7, ymm7, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [rsi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16                 ; next byte to store is now in al
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         BYTE [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- unaligned rsp saved in prologue
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d               ; eax = output_width
+
+    mov         rdi, r11                ; rdi = input_buf
+    mov         ecx, r12d               ; rcx = in_row_group_ctr
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13                ; rdi = output_buf
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; rsi = &input_buf[0][rcx]
+
+    push        rdx                     ; inptr2
+    push        rbx                     ; inptr1
+    push        rsi                     ; inptr00
+    mov         rbx, rsp                ; rbx -> 3-entry input_buf copy on stack
+
+    push        rdi                     ; save output_buf across the call
+    push        rcx                     ; save in_row_group_ctr across the call
+    push        rax                     ; save output_width across the call
+
+    %ifdef WIN64
+    mov         r8, rcx                 ; in_row_group_ctr
+    mov         r9, rdi                 ; output_buf
+    mov         rcx, rax                ; output_width
+    mov         rdx, rbx                ; input_buf (stack copy)
+    %else
+    mov         rdx, rcx                ; in_row_group_ctr
+    mov         rcx, rdi                ; output_buf
+    mov         rdi, rax                ; output_width
+    mov         rsi, rbx                ; input_buf (stack copy)
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax                     ; output_width
+    pop         rcx                     ; in_row_group_ctr
+    pop         rdi                     ; output_buf
+    pop         rsi                     ; inptr00
+    pop         rbx                     ; inptr1
+    pop         rdx                     ; inptr2
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    push        rdx                     ; inptr2
+    push        rbx                     ; inptr1
+    push        rsi                     ; inptr01
+    mov         rbx, rsp                ; rbx -> 3-entry input_buf copy on stack
+
+    push        rdi                     ; save advanced output_buf across the call
+    push        rcx                     ; save in_row_group_ctr across the call
+    push        rax                     ; save output_width across the call
+
+    %ifdef WIN64
+    mov         r8, rcx                 ; in_row_group_ctr
+    mov         r9, rdi                 ; output_buf (second row)
+    mov         rcx, rax                ; output_width
+    mov         rdx, rbx                ; input_buf (stack copy)
+    %else
+    mov         rdx, rcx                ; in_row_group_ctr
+    mov         rcx, rdi                ; output_buf (second row)
+    mov         rdi, rax                ; output_width
+    mov         rsi, rbx                ; input_buf (stack copy)
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax                     ; discard the six values pushed above
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    pop         rdx
+
+    pop         rbx                     ; matches push rbx in the prologue
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000..1cc3345
--- /dev/null
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,537 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of saved rbp)
+    sub         rsp, byte 4                  ; step below the saved rbp before rounding down
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; keep the unaligned rsp for the epilogue
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] just below rbp
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col = output_width
+    test        rcx, rcx
+    jz          near .return            ; zero width: nothing to convert
+
+    push        rcx                     ; park col; rcx indexes rows below
+
+    mov         rdi, r11                ; input_buf
+    mov         ecx, r12d               ; in_row_group_ctr
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         rdi, r13                                   ; output_buf
+    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdi, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3              ; xmm3=(all 1's)
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3              ; CbH += 0xFF80, i.e. CbH - 128
+    paddw       xmm4, xmm3              ; CbL += 0xFF80, i.e. CbL - 128
+    paddw       xmm7, xmm3              ; CrH += 0xFF80, i.e. CrH - 128
+    paddw       xmm0, xmm3              ; CrL += 0xFF80, i.e. CrL - 128
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    paddw       xmm4, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    paddw       xmm0, [rel PW_ONE]      ; +1 so the >>1 below rounds
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5              ; first "+ Cb" term (H)
+    paddw       xmm4, xmm2              ; first "+ Cb" term (L)
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5              ; xmm6=CbH
+    movdqa      xmm7, xmm2              ; xmm7=CbL
+    punpcklwd   xmm5, xmm1              ; interleave CbH/CrH words (low half)
+    punpckhwd   xmm6, xmm1              ; interleave CbH/CrH words (high half)
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]  ; per dword: Cb*-FIX(0.344)+Cr*FIX(0.285)
+    pmaddwd     xmm6, [rel PW_MF0344_F0285]
+    punpcklwd   xmm2, xmm3              ; interleave CbL/CrL words (low half)
+    punpckhwd   xmm7, xmm3              ; interleave CbL/CrL words (high half)
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm7, [rel PW_MF0344_F0285]
+
+    paddd       xmm5, [rel PD_ONEHALF]  ; round before dropping SCALEBITS
+    paddd       xmm6, [rel PD_ONEHALF]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm7, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st        ; 1st pass: xmm0/xmm2/xmm4 already hold the L halves
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD  ; fewer than SIZEOF_XMMWORD columns left?
+    jb          short .column_st32        ; yes: store only the remaining bytes
+
+    test        rdi, SIZEOF_XMMWORD-1     ; is outptr xmmword-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn           ; no columns left
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd           ; 2nd pass reuses the chroma H halves in wk()
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD  ; rcx now counts bytes, not columns
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF                  ; remaining bytes come from the 3rd xmmword
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD                  ; remaining bytes come from the 2nd xmmword
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         WORD [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         BYTE [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD  ; fewer than SIZEOF_XMMWORD columns left?
+    jb          short .column_st32        ; yes: store only the remaining pixels
+
+    test        rdi, SIZEOF_XMMWORD-1     ; is outptr xmmword-aligned?
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn           ; no columns left
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd           ; 2nd pass reuses the chroma H halves in wk()
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2  ; rcx still counts pixels here (4 bytes each)
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC                  ; shift the upper two xmmwords down
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer (movntdq stores above)
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- unaligned rsp saved in the prologue
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d               ; output_width
+
+    mov         rdi, r11                ; input_buf
+    mov         ecx, r12d               ; in_row_group_ctr
+    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]  ; input_buf[0]
+    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]  ; input_buf[1]
+    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]  ; input_buf[2]
+    mov         rdi, r13                                   ; output_buf
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; &input_buf[0][ctr]; callee adds ctr again
+
+    push        rdx                     ; inptr2
+    push        rbx                     ; inptr1
+    push        rsi                     ; inptr00
+    mov         rbx, rsp                ; rbx -> {inptr00, inptr1, inptr2} built on the stack
+
+    push        rdi                     ; save output_buf across the call
+    push        rcx                     ; save in_row_group_ctr across the call
+    push        rax                     ; save output_width across the call
+
+    %ifdef WIN64
+    mov         r8, rcx                 ; arg 3 = in_row_group_ctr
+    mov         r9, rdi                 ; arg 4 = output_buf
+    mov         rcx, rax                ; arg 1 = output_width
+    mov         rdx, rbx                ; arg 2 = stack-built input array
+    %else
+    mov         rdx, rcx                ; arg 3 = in_row_group_ctr
+    mov         rcx, rdi                ; arg 4 = output_buf
+    mov         rdi, rax                ; arg 1 = output_width
+    mov         rsi, rbx                ; arg 2 = stack-built input array
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax                     ; output_width
+    pop         rcx                     ; in_row_group_ctr
+    pop         rdi                     ; output_buf
+    pop         rsi                     ; inptr00
+    pop         rbx                     ; inptr1
+    pop         rdx                     ; inptr2
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    push        rdx                     ; inptr2
+    push        rbx                     ; inptr1
+    push        rsi                     ; inptr01
+    mov         rbx, rsp                ; rbx -> {inptr01, inptr1, inptr2} built on the stack
+
+    push        rdi                     ; save (advanced) output_buf across the call
+    push        rcx                     ; save in_row_group_ctr across the call
+    push        rax                     ; save output_width across the call
+
+    %ifdef WIN64
+    mov         r8, rcx                 ; arg 3 = in_row_group_ctr
+    mov         r9, rdi                 ; arg 4 = output_buf + 1 row
+    mov         rcx, rax                ; arg 1 = output_width
+    mov         rdx, rbx                ; arg 2 = stack-built input array
+    %else
+    mov         rdx, rcx                ; arg 3 = in_row_group_ctr
+    mov         rcx, rdi                ; arg 4 = output_buf + 1 row
+    mov         rdi, rax                ; arg 1 = output_width
+    mov         rsi, rbx                ; arg 2 = stack-built input array
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax                     ; unwind the six pushes made for the 2nd call
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    pop         rdx
+
+    pop         rbx                     ; restore the rbx saved in the prologue
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000..10fa5c4
--- /dev/null
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,697 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+; Word constants for the fancy upsamplers; each entry is 16 words (32 bytes).
+PW_ONE   times 16 dw 1  ; 16 x 0x0001
+PW_TWO   times 16 dw 2  ; 16 x 0x0002
+PW_THREE times 16 dw 3  ; 16 x 0x0003
+PW_SEVEN times 16 dw 7  ; 16 x 0x0007
+PW_EIGHT times 16 dw 8  ; 16 x 0x0008
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    push_xmm    3
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return            ; downsampled_width == 0: nothing to do
+
+    mov         rcx, r10                ; rowctr (NOTE(review): reads all 64 bits of an int arg -- confirm)
+    test        rcx, rcx
+    jz          near .return            ; max_v_samp_factor == 0: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+
+    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9                 ; xmm9=(all 1's)
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)   ; ff kept only in the top byte of xmm9
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
+
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi                     ; output_data row cursor
+    push        rsi                     ; input_data row cursor
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+
+    test        rax, SIZEOF_YMMWORD-1   ; is colctr a whole number of ymmwords?
+    jz          short .skip             ; yes: no dummy sample needed
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample (assumes row has room past colctr -- TODO confirm)
+.skip:
+    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm7=( 0 -- -- ... --): left neighbor of column 0
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD  ; round colctr up to whole ymmwords
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop          ; more than one block: take right neighbor from the next block
+
+.columnloop_last:
+    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm6=(-- ... -- 31): right neighbor of the last column
+    jmp         short .upsample
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm6=(32 33 34 ... 61 62 63)
+    vperm2i128  ymm6, ymm0, ymm6, 0x20                ; low lane=0, high lane=(32 ... 47)
+    vpslldq     ymm6, ymm6, 15                        ; ymm6=(-- -- ... -- 32)
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpmullw     ymm1, ymm1, [rel PW_THREE]      ; 3 * current sample (low 16)
+    vpmullw     ymm4, ymm4, [rel PW_THREE]      ; 3 * current sample (high 16)
+    vpaddw      ymm2, ymm2, [rel PW_ONE]        ; left neighbor + 1
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpaddw      ymm3, ymm3, [rel PW_TWO]        ; right neighbor + 2
+    vpaddw      ymm6, ymm6, [rel PW_TWO]
+
+    vpaddw      ymm2, ymm2, ymm1                ; 3*cur + left + 1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1                ; 3*cur + right + 2
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT            ; move odd outputs into the high byte of each word
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         rax, byte SIZEOF_YMMWORD    ; one input block consumed
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop            ; more than one block still to go
+    test        eax, eax
+    jnz         near .columnloop_last       ; exactly one block left
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper                          ; clear the upper ymm halves before returning
+    uncollect_args 4
+    pop_xmm     3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    push_xmm    3
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm8, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm8, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm0, ymm8, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm8, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm8, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm8, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [rel PW_THREE]
+    vpmullw     ymm3, ymm3, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
+    vpaddw      ymm2, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm7, ymm8, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm8, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm8, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm8, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
+    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
+    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter: every input sample is written out twice in a row.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp at this point; presumably consumed by collect_args (confirm in jsimdext.inc)
+    mov         rbp, rsp
+    collect_args 4                      ; macro; expected to leave the 4 arguments in r10-r13 as listed above
+
+    mov         edx, r11d               ; rdx = output_width (32-bit move zero-extends)
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD    ; round the width up to a multiple of SIZEOF_YMMWORD
+    jz          near .return            ; zero columns: nothing to do
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return           ; zero rows: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rdi                     ; save the row-pointer cursors; rsi/rdi are reused as sample pointers below
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .above_16          ; more than one YMMWORD of output left: take the full-width path
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]  ; final block: load one XMMWORD of input
+    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 input bytes, each duplicated
+    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 input bytes, each duplicated
+
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow          ; this path only runs for the last block of a row
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; load one YMMWORD of input
+
+    vpermq      ymm0, ymm0, 0xd8        ; reorder qwords to (0 2 1 3) so the per-lane unpacks below emit bytes in order
+    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
+    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes  0..15, each duplicated
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output were produced
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD    ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi                     ; restore the row-pointer cursors
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper                          ; clear upper YMM halves before returning to non-VEX code
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter: each input sample is doubled and stored to two rows.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp at this point; presumably consumed by collect_args (confirm in jsimdext.inc)
+    mov         rbp, rsp
+    collect_args 4                      ; macro; expected to leave the 4 arguments in r10-r13 as listed above
+    push        rbx                     ; rbx is used as outptr0 below; restored at .return
+
+    mov         edx, r11d               ; rdx = output_width (32-bit move zero-extends)
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD    ; round the width up to a multiple of SIZEOF_YMMWORD
+    jz          near .return            ; zero columns: nothing to do
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rdi                     ; save the row-pointer cursors; rsi/rdi are reused as sample pointers below
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]                    ; inptr
+    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .above_16         ; more than one YMMWORD of output left: take the full-width path
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; final block: load one XMMWORD of input
+    vpunpckhbw  xmm1, xmm0, xmm0        ; xmm1 = upper 8 input bytes, each duplicated
+    vpunpcklbw  xmm0, xmm0, xmm0        ; xmm0 = lower 8 input bytes, each duplicated
+
+    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0  ; same data goes to both output rows
+    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow           ; this path only runs for the last block of a row
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; load one YMMWORD of input
+
+    vpermq      ymm0, ymm0, 0xd8        ; reorder qwords to (0 2 1 3) so the per-lane unpacks below emit bytes in order
+    vpunpckhbw  ymm1, ymm0, ymm0        ; ymm1 = input bytes 16..31, each duplicated
+    vpunpcklbw  ymm0, ymm0, ymm0        ; ymm0 = input bytes  0..15, each duplicated
+
+    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0  ; same data goes to both output rows
+    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD  ; two YMMWORDs of output were produced per row
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr
+    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi                     ; restore the row-pointer cursors
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper                          ; clear upper YMM halves before returning to non-VEX code
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000..d8ccda9
--- /dev/null
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,666 @@
+;
+; jdsample-sse2.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST               ; constant data for the fancy upsamplers in this file
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1                   ; 8 words of 1: rounding term for even h2v1 outputs (before >>2)
+PW_TWO   times 8 dw 2                   ; 8 words of 2: rounding term for odd h2v1 outputs (before >>2)
+PW_THREE times 8 dw 3                   ; 8 words of 3: weight applied to the nearer sample
+PW_SEVEN times 8 dw 7                   ; 8 words of 7: rounding term for odd h2v2 outputs (before >>4)
+PW_EIGHT times 8 dw 8                   ; 8 words of 8: rounding term for even h2v2 outputs (before >>4)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp at this point; presumably consumed by collect_args (confirm in jsimdext.inc)
+    mov         rbp, rsp
+    collect_args 4                      ; macro; expected to leave the 4 arguments in r10-r13 as listed above
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return            ; zero columns: nothing to do
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip             ; width is a whole number of XMMWORDs: no padding sample needed
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample of the row
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; keep 0xFF in the lowest byte only
+    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm7=sample 0 in byte 0: left neighbor ("-1") of the first block
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD  ; round colctr up to a multiple of SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop       ; more than one block: the right neighbor comes from the next block
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6              ; xmm6=(all 1's)
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; keep 0xFF in the highest byte only
+    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm6=byte 15 of this block, reused as its own right neighbor
+    jmp         short .upsample
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; peek at the next block
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; xmm6=next block's sample 0, moved to byte 15
+
+.upsample:
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1              ; widen bytes to words by interleaving with zero (xmm0)
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [rel PW_THREE]    ; 3 * current sample (low half)
+    pmullw      xmm4, [rel PW_THREE]    ; 3 * current sample (high half)
+    paddw       xmm2, [rel PW_ONE]      ; left neighbor + 1 (rounding for even outputs)
+    paddw       xmm5, [rel PW_ONE]
+    paddw       xmm3, [rel PW_TWO]      ; right neighbor + 2 (rounding for odd outputs)
+    paddw       xmm6, [rel PW_TWO]
+
+    paddw       xmm2, xmm1              ; even output = (3*cur + left + 1) >> 2
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1              ; odd output = (3*cur + right + 2) >> 2
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT          ; move odd results into the high byte of each word
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2  ; aligned stores: outptr must be XMMWORD-aligned
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop        ; at least two blocks remain
+    test        eax, eax
+    jnz         near .columnloop_last   ; exactly one block remains
+
+    pop         rsi                     ; restore the row-pointer cursors and colctr
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [rel PW_THREE]
+    pmullw      xmm3, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm5, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_SEVEN]
+    paddw       xmm2, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_EIGHT]
+    paddw       xmm7, [rel PW_SEVEN]
+    paddw       xmm5, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp; not read again in this body (presumably for collect_args - confirm)
+    mov         rbp, rsp
+    collect_args 4                      ; fetch the 4 arguments into r10-r13 as listed above
+
+    mov         edx, r11d               ; rdx = output_width (32-bit move zero-extends)
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1  ; NOTE(review): rounding up implies rows padded to 2 xmmwords - confirm
+    and         rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = width rounded up to a multiple of 2 xmmwords
+    jz          near .return            ; zero width: nothing to do
+
+    mov         rcx, r10                ; rowctr (NOTE(review): 64-bit read of an int arg - confirm upper bits are clean)
+    test        rcx, rcx
+    jz          short .return           ; zero rows: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13                ; output_data_ptr
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rdi                     ; keep the row-array cursors; rsi/rdi become sample pointers below
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]     ; inptr
+    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rax, rdx                ; colctr (output bytes left in this row)
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; aligned load of one input xmmword
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0              ; low 8 bytes, each byte doubled
+    punpckhbw   xmm1, xmm1              ; high 8 bytes, each byte doubled
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD  ; two output xmmwords done
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; second input xmmword (loop body is unrolled twice)
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2              ; low 8 bytes, each byte doubled
+    punpckhbw   xmm3, xmm3              ; high 8 bytes, each byte doubled
+
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD  ; two more output xmmwords done
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi                     ; restore the row-array cursors
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop             ; one output row per input row
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp; not read again in this body (presumably for collect_args - confirm)
+    mov         rbp, rsp
+    collect_args 4                      ; fetch the 4 arguments into r10-r13 as listed above
+    push        rbx                     ; rbx is used as outptr0 below; popped at .return
+
+    mov         edx, r11d               ; rdx = output_width (32-bit move zero-extends)
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1  ; NOTE(review): rounding up implies rows padded to 2 xmmwords - confirm
+    and         rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = width rounded up to a multiple of 2 xmmwords
+    jz          near .return            ; zero width: nothing to do
+
+    mov         rcx, r10                ; rowctr (NOTE(review): 64-bit read of an int arg - confirm upper bits are clean)
+    test        rcx, rcx
+    jz          near .return            ; zero rows: nothing to do
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13                ; output_data_ptr
+    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+.rowloop:
+    push        rdi                     ; keep the row-array cursors; rsi/rdi become sample pointers below
+    push        rsi
+
+    mov         rsi, JSAMPROW [rsi]                    ; inptr
+    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; aligned load of one input xmmword
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0              ; low 8 bytes, each byte doubled
+    punpckhbw   xmm1, xmm1              ; high 8 bytes, each byte doubled
+
+    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0  ; same data goes to both output rows
+    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD  ; two output xmmwords per row done
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; second input xmmword (loop body is unrolled twice)
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2              ; low 8 bytes, each byte doubled
+    punpckhbw   xmm3, xmm3              ; high 8 bytes, each byte doubled
+
+    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2  ; same data goes to both output rows
+    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD  ; two more output xmmwords per row done
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi                     ; restore the row-array cursors
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr (two output rows per input row)
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jfdctflt-sse.asm b/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000..26f9fb6
--- /dev/null
+++ b/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,357 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44  ; imm 01_00_01_00b: elements 0,1 of %1, then elements 0,1 of %2
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE  ; imm 11_10_11_10b: elements 2,3 of %1, then elements 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460  ; cos(6*pi/16), replicated in all 4 lanes
+PD_0_707 times 4 dd 0.707106781186547524400844  ; cos(4*pi/16)
+PD_0_541 times 4 dd 0.541196100146196984399723  ; cos(2*pi/16) - cos(6*pi/16)
+PD_1_306 times 4 dd 1.306562964876376527856643  ; cos(2*pi/16) + cos(6*pi/16)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; stash the unaligned rsp at the aligned frame base
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
+    collect_args 1                           ; r10 = data
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4          ; loop counter: 4 rows per iteration
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10 (=tmp4+tmp5)
+    addps       xmm3, xmm6              ; xmm3=tmp11 (=tmp5+tmp6)
+    addps       xmm6, xmm0              ; xmm6=tmp12 (=tmp6+tmp7), xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT  ; step over 4 rows of DCTSIZE floats
+    dec         rcx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4          ; loop counter: 4 columns per iteration
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10 (=tmp4+tmp5)
+    addps       xmm3, xmm6              ; xmm3=tmp11 (=tmp5+tmp6)
+    addps       xmm6, xmm0              ; xmm6=tmp12 (=tmp6+tmp7), xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, byte 4*SIZEOF_FAST_FLOAT  ; step over 4 floats (next 4 columns)
+    dec         rcx
+    jnz         near .columnloop
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- stashed unaligned rsp (address of the saved rbp)
+    pop         rbp                     ; restore the caller's rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jfdctfst-sse2.asm b/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000..aaf8b9e
--- /dev/null
+++ b/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,391 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fractional bits of the F_* constants below; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the values below are given pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; right shift by n, rounding to nearest
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)  ; = 6 when CONST_BITS is 8
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT  ; each constant is replicated in all 8 word lanes
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [rel PW_F0707]    ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z3
+
+    movdqa      xmm4, xmm2              ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [rel PW_F0382]    ; xmm2=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [rel PW_F1306]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2              ; xmm4=z2
+    paddw       xmm0, xmm2              ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [rel PW_F0707]    ; xmm0=z3
+
+    movdqa      xmm4, xmm7              ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [rel PW_F0382]    ; xmm7=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [rel PW_F1306]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7              ; xmm4=z2
+    paddw       xmm1, xmm7              ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000..448f47d
--- /dev/null
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,322 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1): interleave 16-bit words within each lane
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2): interleave 32-bit dwords within each lane
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3): reorder 64-bit qwords across both lanes
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)=data1_0
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)=data3_2
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)=data4_5
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)=data6_7
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit slow integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [rel PW_DESCALE_P2X]  ; add rounding term for the shift
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7              ; pair tmp13_12 with tmp12_13 (low words)
+    vpunpckhwd  %6, %6, %7              ; pair tmp13_12 with tmp12_13 (high words)
+    vpmaddwd    %2, %2, [rel PW_F130_F054_MF130_F054]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpaddd      %6, %6, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpsrad      %2, %2, DESCALE_P %+ %9  ; descale for this pass
+    vpsrad      %6, %6, DESCALE_P %+ %9  ; descale for this pass
+
+    vpackssdw   %3, %2, %6              ; %3=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2              ; pair z3_4 with z4_3 (low words)
+    vpunpckhwd  %7, %7, %2              ; pair z3_4 with z4_3 (high words)
+    vpmaddwd    %6, %6, [rel PW_MF078_F117_F078_F117]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4              ; pair tmp4_5 with tmp7_6 (low words)
+    vpunpckhwd  %4, %8, %4              ; pair tmp4_5 with tmp7_6 (high words)
+    vpmaddwd    %2, %2, [rel PW_MF060_MF089_MF050_MF256]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpsrad      %2, %2, DESCALE_P %+ %9  ; descale for this pass
+    vpsrad      %4, %4, DESCALE_P %+ %9  ; descale for this pass
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2              ; pair tmp6_7 with tmp5_4 (low words)
+    vpunpckhwd  %5, %5, %2              ; pair tmp6_7 with tmp5_4 (high words)
+    vpmaddwd    %8, %8, [rel PW_F050_MF256_F060_MF089]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [rel PW_F050_MF256_F060_MF089]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpaddd      %5, %5, [rel PD_DESCALE_P %+ %9]  ; add rounding term
+    vpsrad      %8, %8, DESCALE_P %+ %9  ; descale for this pass
+    vpsrad      %5, %5, DESCALE_P %+ %9  ; descale for this pass
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; data2
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541  ; data6
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; z3
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175  ; z4
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; tmp4
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; tmp5
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562  ; tmp6
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899  ; tmp7
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)  ; pass 1 rounding
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)  ; pass 2 rounding
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)  ; pass 2 data0_4
+PW_1_NEG1                  times 8  dw  1   ; low lane: keep sign
+                           times 8  dw -1   ; high lane: negate
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        rbp
+    mov         rax, rsp                ; rax = rsp after saving rbp
+    mov         rbp, rsp
+    collect_args 1                      ; per the note above, r10 = data
+
+    ; ---- Pass 1: process rows.
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]  ; rows 0-1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]  ; rows 2-3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]  ; rows 4-5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]  ; rows 6-7
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; regroup lanes: row n with row n+4
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3  ; rows 0-1
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5  ; rows 2-3
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6  ; rows 4-5
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7  ; rows 6-7
+
+    vzeroupper                          ; zero upper ymm halves before return
+    uncollect_args 1
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000..ef16a52
--- /dev/null
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,621 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+    ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [rel PW_F130_F054]   ; xmm7=data2L
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=data2H
+    pmaddwd     xmm4, [rel PW_F054_MF130]  ; xmm4=data6L
+    pmaddwd     xmm0, [rel PW_F054_MF130]  ; xmm0=data6H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm6, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm0, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3L
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3H
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4L
+    pmaddwd     xmm0, [rel PW_F117_F078]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [rel PW_MF089_F060]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [rel PW_MF050_MF256]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [rel PW_MF256_F050]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [rel PD_DESCALE_P1]
+    paddd       xmm3, [rel PD_DESCALE_P1]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [rel PW_DESCALE_P2X]
+    paddw       xmm5, [rel PW_DESCALE_P2X]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=data2L
+    pmaddwd     xmm2, [rel PW_F130_F054]   ; xmm2=data2H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=data6L
+    pmaddwd     xmm6, [rel PW_F054_MF130]  ; xmm6=data6H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm2, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm6, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3L
+    pmaddwd     xmm1, [rel PW_MF078_F117]  ; xmm1=z3H
+    pmaddwd     xmm2, [rel PW_F117_F078]   ; xmm2=z4L
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [rel PW_MF060_MF089]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [rel PW_MF089_F060]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [rel PW_MF089_F060]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [rel PW_MF050_MF256]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [rel PW_MF256_F050]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [rel PD_DESCALE_P2]
+    paddd       xmm7, [rel PD_DESCALE_P2]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000..b676ef3
--- /dev/null
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,483 @@
+;
+; jidctflt-sse2.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44        ; imm8 0x44: %1[0], %1[1], %2[0], %2[1]
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE        ; imm8 0xEE: %1[2], %1[3], %2[2], %2[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+; Each PD_* entry is one single-precision value repeated in 4 dwords.
+PD_1_414        times 4  dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847        times 4  dd  1.847759065022573512256366  ; (z10+z12) -> z5
+PD_1_082        times 4  dd  1.082392200292393968799446  ; multiplies z12
+PD_M2_613       times 4  dd -2.613125929752753055713286  ; multiplies z10
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; CENTERJSAMPLE in all 16 bytes
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM], ends at rbp
+%define WK_NUM        2
+%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2],
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [workspace]
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rcx, DCTSIZE/4          ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         rcx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+    mov         rcx, DCTSIZE/4          ; ctr
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
+    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         rdi, byte 4*SIZEOF_JSAMPROW
+    dec         rcx                            ; ctr
+    jnz         near .rowloop
+
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000..c6c42f9
--- /dev/null
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,492 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; fraction bits of the F_* constants below; 14 is also OK.
+%define PASS1_BITS  2  ; must match IFAST_SCALE_BITS (enforced just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time floating-point arithmetic, so the literals below are the constants pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; Operands are pre-shifted left by PRE_MULTIPLY_SCALE_BITS, which must be <= 2 (to avoid overflow).
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16, so the >>16 done by pmulhw yields x * constant.
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT  ; 8 words:  1.414213562 as a pmulhw multiplier
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT  ; 8 words:  1.847759065 as a pmulhw multiplier
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT  ; 8 words: -1.613125930 as a pmulhw multiplier
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT  ; 8 words:  1.082392200 as a pmulhw multiplier
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE           ; 16 bytes: added to every packed output sample in pass 2
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM], located just below the aligned rbp
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax                   ; stash it at [original_rbp]; the epilogue pops it back into rsp
+    mov         rbp, rsp                     ; rbp = aligned frame base
+    lea         rsp, [wk(0)]                 ; move rsp below the wk[] scratch slots
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT         ; cheap early out: a nonzero AC term was already seen
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0              ; xmm1=bitwise OR of rows 1..7
+    packsswb    xmm1, xmm1              ; 8 words -> 8 bytes (a nonzero word stays nonzero)
+    packsswb    xmm1, xmm1              ; ...and again, so the low dword summarizes all 8 words
+    movd        eax, xmm1
+    test        rax, rax                ; movd zero-extends, so this tests only the low dword
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero: every column is its dequantized DC term repeated.
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end        ; same register/wk layout as the full pass 1 leaves
+%endif
+.columnDCT:                             ; full pass 1: dequantize, 1-D IDCT on 8 columns at once, transpose
+
+    ; -- Even part (rows 0, 2, 4, 6, each multiplied by the matching quantptr row)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3              ; xmm1=in2-in6
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [rel PW_F1414]    ; xmm1=(in2-in6)*1.414213562
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part (rows 1, 3, 5, 7, each multiplied by the matching quantptr row)
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0              ; xmm4=z11-z13
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2              ; xmm5=z10+z12 (both pre-shifted)
+    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
+    pmulhw      xmm0, [rel PW_MF1613]   ; xmm0=z10*(-1.613125930)
+    pmulhw      xmm2, [rel PW_F1082]    ; xmm2=z12*1.082392200
+    psubw       xmm0, xmm1              ; xmm0=z10*(-2.613125930)
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows (held in xmm registers and wk[]), store into output array.
+
+    ; rdi walks the output row pointers; rax is the column offset applied to each row.
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d               ; rax=output_col (a 32-bit write zero-extends into rax)
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3              ; xmm5=col2-col6
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F1414]    ; xmm5=(col2-col6)*1.414213562
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6              ; xmm2=z11-z13
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0              ; xmm4=z10+z12 (both pre-shifted)
+    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
+    pmulhw      xmm6, [rel PW_MF1613]   ; xmm6=z10*(-1.613125930)
+    pmulhw      xmm0, [rel PW_F1082]    ; xmm0=z12*1.082392200
+    psubw       xmm6, xmm7              ; xmm6=z10*(-2.613125930)
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2        ; add CENTERJSAMPLE to every signed-saturated byte
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1  ; output row 0 <- low 8 bytes
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3  ; output row 2
+    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4  ; output row 4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7  ; output row 6
+
+    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5  ; output row 1
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0  ; output row 3
+    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6  ; output row 5
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2  ; output row 7
+
+    ; -- Epilogue: undo the prologue's stack realignment and return.
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- value the prologue saved at [original_rbp]
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000..b60b44f
--- /dev/null
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,419 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13  ; fractional bits of the F_* fixed-point constants
+%define PASS1_BITS  2   ; extra fractional bits kept in the pass 1 output
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)      ; right shift ending pass 1
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)  ; right shift ending pass 2
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounded x >> n (inputs below are scaled by 2^30)
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)  (on entry)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)  (on entry)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)  (on entry)
+    ; %4=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)  (on entry)
+
+    vpermq      %5, %1, 0xD8            ; source qwords taken in order 0,2,1,3
+    vpermq      %6, %2, 0x72            ; source qwords taken in order 2,0,3,1
+    vpermq      %7, %3, 0xD8            ; source qwords taken in order 0,2,1,3
+    vpermq      %8, %4, 0x72            ; source qwords taken in order 2,0,3,1
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit slow integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers (on entry: in0_4, in3_1, in2_6, in7_5)
+; %5-%12: Temp registers
+; %13:    Pass (1 or 2); selects PD_DESCALE_P1/DESCALE_P1 or PD_DESCALE_P2/DESCALE_P2
+; On exit: %1=data0_1, %2=data3_2, %3=data4_5, %4=data7_6
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [rel PW_F130_F054_MF130_F054]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=in0_(-in4)
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1              ; %1=0
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %11, %8, %5             ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %9, %8, %5              ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %12, %1, %6             ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %10, %1, %6             ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [rel PW_MF078_F117_F078_F117]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [rel PW_MF060_MF089_MF050_MF256]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [rel PW_MF089_F060_MF256_F050]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [rel PW_MF089_F060_MF256_F050]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vpaddd      %1, %9, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %10, %8             ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %9, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %10, %8             ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vpaddd      %7, %11, %5             ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %8, %12, %6             ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %2, %7, %8              ; %2=data3_2
+
+    vpsubd      %7, %11, %5             ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %8, %12, %6             ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %3, %7, %8              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+; Each two-line PW_* constant is 32 bytes: first line = low 128-bit lane, second = high lane.
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; low lane:  even-part tmp3
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541  ; high lane: even-part tmp2
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; low lane:  z3
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175  ; high lane: z4
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; low lane:  odd-part tmp0
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; high lane: odd-part tmp1
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)  ; low lane:  odd-part tmp3
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)  ; high lane: odd-part tmp2
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)  ; rounding term for the pass 1 shift
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)  ; rounding term for the pass 2 shift
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE  ; added to every packed output byte
+PW_1_NEG1                  times 8  dw  1   ; vpsignw operand: low lane is left unchanged,
+                           times 8  dw -1   ; high lane is negated
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = rsp (points at the saved rbp)
+    mov         rbp, rsp                     ; rbp = rsp (this function does not realign the stack)
+    push_xmm    4
+    collect_args 4
+
+    ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, DWORD [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0        ; xmm1=OR of coefficient blocks 1-7
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(00 00 01 01 02 02 03 03  04 04 05 05 06 06 07 07)
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm4=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [rel PB_CENTERJSAMP]
+    vpaddb      ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm6=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm4=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm2=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper                          ; clear upper YMM halves before the non-VEX movq stores
+
+    mov         eax, r13d               ; rax=output_col (32-bit mov zero-extends)
+
+    mov         rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+    mov         rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    mov         rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    uncollect_args 4
+    pop_xmm     4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000..83fc344
--- /dev/null
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,848 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13  ; fractional bits of the F_* fixed-point constants
+%define PASS1_BITS  2   ; extra fractional bits kept in the pass 1 output
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)  ; right shift ending pass 1
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounded x >> n (inputs below are scaled by 2^30)
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+; Each PW_* constant is a pmaddwd operand; the note names the value it yields in the column pass.
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; even-part tmp3
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)  ; even-part tmp2
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; z3
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)  ; z4
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; odd-part tmp0
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)  ; odd-part tmp3
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; odd-part tmp1
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)  ; odd-part tmp2
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)  ; rounding term added before the DESCALE_P1 shift
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0          ; [rbp] holds the rsp value the prologue saved before aligning
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM], located just below rbp
+%define WK_NUM        12               ; number of xmmword scratch slots
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000..af64fdc
--- /dev/null
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,575 @@
+;
+; jidctred-sse2.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13  ; fraction bits of the FIX() multiplier constants below
+%define PASS1_BITS    2  ; pass-1 results keep this many extra scale bits (cf. the DESCALE_* shifts)
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)  ; psrad count that ends pass 1 of the 4x4 IDCT
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)  ; presumably the pass-2 count (4x4); use not in view
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)  ; presumably the pass-1 count (2x2); use not in view
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)  ; presumably the pass-2 count (2x2); use not in view
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so the values below are the constants pre-scaled by 2^30.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; rounding right shift of x by n bits
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST  ; read-only constants shared by the reduced-size IDCT routines
+
+    alignz      32  ; start the table on a 32-byte boundary (alignz is defined outside this view)
+    GLOBAL_DATA(jconst_idct_red_sse2)  ; macro defined outside this view; presumably exports the symbol
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765  ; pmaddwd operand: 4 word pairs = 1 xmmword; 4x4 even part, (in2,in6) pairs
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899  ; 4x4 odd part: (in1,in3) pairs, contribution to tmp2
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172  ; 4x4 odd part: (in1,in3) pairs, contribution to tmp0
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509  ; 4x4 odd part: (in5,in7) pairs, contribution to tmp2
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211  ; 4x4 odd part: (in5,in7) pairs, contribution to tmp0
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272  ; not referenced in the visible 4x4 code; presumably for the 2x2 IDCT
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720  ; not referenced in the visible 4x4 code; presumably for the 2x2 IDCT
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)  ; rounding bias added (paddd) before psrad by DESCALE_P1_4
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)  ; same half-unit bias for a shift by DESCALE_P2_4
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)  ; same half-unit bias for a shift by DESCALE_P1_2
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)  ; same half-unit bias for a shift by DESCALE_P2_2
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; 16 bytes, each CENTERJSAMPLE (value defined outside this view)
+
+    alignz      32  ; pad the end of the table to a 32-byte boundary
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0  ; [rbp] holds the pre-alignment rsp that the prologue stores via mov [rsp], rax
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]: scratch slots below the aligned rbp; rsp is set to wk(0)
+%define WK_NUM        2  ; number of wk() slots; used by wk(i) above, resolved when wk() is expanded
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [rel PW_F256_F089]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [rel PW_F256_F089]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [rel PW_F106_MF217]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [rel PW_F106_MF217]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [rel PW_MF060_MF050]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [rel PW_F145_MF021]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [rel PW_F184_MF076]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [rel PW_F184_MF076]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [rel PD_DESCALE_P1_4]  ; xmm6=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1_4]  ; xmm7=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [rel PW_F256_F089]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [rel PW_F106_MF217]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [rel PW_F184_MF076]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [rel PD_DESCALE_P2_4]  ; xmm1=[rel PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [rel PB_CENTERJSAMP]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+    mov         rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores one 16-bit WORD into each of output_buf[0] and output_buf[1] at output_col.
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx                     ; rbx is overwritten by pextrw below; restored before ret
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+    ; Load rows 1, 3, 5, 7 and multiply each by the matching dct_table row.
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7              ; xmm7=(all 1's)
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [rel PW_F362_MF127]  ; (row1,row3) word pairs -> 32-bit sums
+    pmaddwd     xmm5, [rel PW_F085_MF072]  ; (row5,row7) word pairs -> 32-bit sums
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [rel PW_F362_MF127]  ; same constants, odd columns 1/3/5/7
+    pmaddwd     xmm2, [rel PW_F085_MF072]  ; same constants, odd columns 1/3/5/7
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+    ; Only row 0 is used here: multiply by the table row, then shift into tmp10.
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6              ; xmm3=tmp10 copy (col0) for the subtraction
+    movdqa      xmm5, xmm1              ; xmm5=tmp10 copy (odd cols) for the subtraction
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [rel PD_DESCALE_P1_2]  ; xmm2=[rel PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1              ; the 0xFFFF mask in xmm7 is no longer needed
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2              ; add PD_DESCALE_P1_2 before the shift
+    psrad       xmm6, DESCALE_P1_2      ; xmm6=(A0 B0 ** **) descaled
+
+    paddd       xmm1, xmm2              ; add PD_DESCALE_P1_2 before the shift
+    paddd       xmm7, xmm2              ; add PD_DESCALE_P1_2 before the shift
+    psrad       xmm1, DESCALE_P1_2      ; xmm1=(A1 A3 B1 B3) descaled
+    psrad       xmm7, DESCALE_P1_2      ; xmm7=(A5 A7 B5 B7) descaled
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d               ; output_col; the 32-bit mov zeroes the upper half of rax
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [rel PW_F362_MF127]  ; (x1,x3) word pairs -> 32-bit sums
+    pmaddwd     xmm7, [rel PW_F085_MF072]  ; (x5,x7) word pairs -> 32-bit sums
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6              ; xmm4=tmp10 copy for the subtraction
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [rel PD_DESCALE_P2_2]  ; add PD_DESCALE_P2_2 before the shift
+    psrad       xmm6, DESCALE_P2_2           ; descale all four results
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [rel PB_CENTERJSAMP]  ; add PB_CENTERJSAMP to every byte
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; rdx = output_buf[0]
+    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; rsi = output_buf[1]
+    mov         WORD [rdx+rax*SIZEOF_JSAMPLE], bx      ; output_buf[0] at output_col <- (C0 D0)
+    mov         WORD [rsi+rax*SIZEOF_JSAMPLE], cx      ; output_buf[1] at output_col <- (C1 D1)
+
+    pop         rbx                     ; matches the push rbx in the prologue
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000..4600eec
--- /dev/null
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,156 @@
+;
+; jquantf-sse2.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+; Two rows (8 bytes loaded from each) are converted per loop pass, DCTSIZE/2 passes.
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx                     ; rbx holds a row pointer inside the loop; restored before ret
+
+    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 ..}
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         rsi, r10                ; rsi = sample_data (row pointer array)
+    mov         eax, r11d               ; start_col; the 32-bit mov zeroes the upper half of rax
+    mov         rdi, r12                ; rdi = workspace
+    mov         rcx, DCTSIZE/2          ; loop counter: two rows per pass
+.convloop:
+    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; 8 bytes of the first row
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; 8 bytes of the second row
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567), 0x80 subtracted from each byte
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF), 0x80 subtracted from each byte
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+    ; '*' = don't-care bytes (xmm2/xmm3 enter with stale contents); psrad below drops them.
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123), top byte sign-extended to a dword
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567), top byte sign-extended to a dword
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123) as packed floats
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567) as packed floats
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB), top byte sign-extended to a dword
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF), top byte sign-extended to a dword
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB) as packed floats
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF) as packed floats
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW            ; step past the two row pointers just used
+    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; step past the two workspace rows just written
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx                     ; matches the push rbx in the prologue
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+; Sixteen coefficients are handled per loop pass, DCTSIZE2/16 passes in total.
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12                ; rsi = workspace (input floats)
+    mov         rdx, r11                ; rdx = divisors
+    mov         rdi, r10                ; rdi = coef_block (output)
+    mov         rax, DCTSIZE2/16        ; loop counter
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]  ; multiplied, not divided:
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]  ; NOTE(review): table presumably holds reciprocals
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0              ; float -> int32, rounded per the current MXCSR mode
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1              ; int32 -> int16 with signed saturation (first 8 values)
+    packssdw    xmm2, xmm3              ; int32 -> int16 with signed saturation (next 8 values)
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+    add         rsi, byte 16*SIZEOF_FAST_FLOAT  ; advance all three pointers by 16 elements
+    add         rdx, byte 16*SIZEOF_FAST_FLOAT
+    add         rdi, byte 16*SIZEOF_JCOEF
+    dec         rax
+    jnz         short .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000..b7243e4
--- /dev/null
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,164 @@
+;
+; jquanti-avx2.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+; All eight rows are converted in one straight-line pass (no loop).
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         eax, r11d               ; start_col; the 32-bit mov zeroes the upper half of rax
+
+    mov         rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]     ; low qword  = 8 bytes of row 0
+    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1  ; high qword = 8 bytes of row 1
+
+    mov         rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]     ; low qword  = 8 bytes of row 2
+    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1  ; high qword = 8 bytes of row 3
+
+    mov         rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]     ; low qword  = 8 bytes of row 4
+    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1  ; high qword = 8 bytes of row 5
+
+    mov         rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]     ; low qword  = 8 bytes of row 6
+    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1  ; high qword = 8 bytes of row 7
+
+    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7=(all 1's)
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7        ; adding 0xFF80 subtracts 128 from each zero-extended sample
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm3, ymm3, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper                          ; zero the upper halves of all YMM registers before returning
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+; divisors is read as three tables, DCTSIZE rows apart: reciprocal, correction, scale.
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]  ; ymm4-ymm7 keep the signed inputs
+    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]  ; (rows 0-1, 2-3, 4-5, 6-7) for vpsignw
+    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4              ; ymm0-ymm3 = |input| per word
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+    ; Per word: ((((|x| + correction) * reciprocal) >> 16) * scale) >> 16, unsigned.
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
+
+    vpsignw     ymm0, ymm0, ymm4        ; give each result the sign of its original input
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper                          ; zero the upper halves of all YMM registers before returning
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000..7ff7275
--- /dev/null
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,189 @@
+;
+; jquanti-sse2.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+; Four rows are converted per loop pass, DCTSIZE/4 passes in total.
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx                     ; rbx holds a row pointer inside the loop; restored before ret
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         rsi, r10                ; rsi = sample_data (row pointer array)
+    mov         eax, r11d               ; start_col; the 32-bit mov zeroes the upper half of rax
+    mov         rdi, r12                ; rdi = workspace
+    mov         rcx, DCTSIZE/4          ; loop counter: four rows per pass
+.convloop:
+    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567), bytes zero-extended to words
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF), bytes zero-extended to words
+    paddw       xmm0, xmm7              ; adding 0xFF80 subtracts 128 from each word
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN), bytes zero-extended to words
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV), bytes zero-extended to words
+    paddw       xmm2, xmm7              ; adding 0xFF80 subtracts 128 from each word
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 4*SIZEOF_JSAMPROW         ; step past the four row pointers just used
+    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; step past the four workspace rows just written
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx                     ; matches the push rbx in the prologue
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+; divisors is read as three tables, DCTSIZE rows apart: reciprocal, correction, scale.
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        rbp
+    mov         rax, rsp                ; NOTE(review): presumably consumed by collect_args; confirm in jsimdext.inc
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12                ; rsi = workspace (input)
+    mov         rdx, r11                ; rdx = divisors
+    mov         rdi, r10                ; rdi = coef_block (output)
+    mov         rax, DCTSIZE2/32        ; loop counter: 32 coefficients per pass
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4              ; xmm0-xmm3 = working copies of the four rows
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)      ; xmm4-xmm7 = sign masks (all 1's where input < 0, else 0)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4              ; xor with the mask, then subtract it: negates negative words
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+    ; Per word: ((((|x| + correction) * reciprocal) >> 16) * scale) >> 16, unsigned.
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+    pxor        xmm0, xmm4              ; same xor/subtract with the saved masks:
+    pxor        xmm1, xmm5              ; restores the sign of each original input
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 32*SIZEOF_DCTELEM  ; advance all three pointers by 32 elements
+    add         rdx, byte 32*SIZEOF_DCTELEM
+    add         rdi, byte 32*SIZEOF_JCOEF
+    dec         rax
+    jnz         near .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
new file mode 100644
index 0000000..03a4da3
--- /dev/null
+++ b/simd/x86_64/jsimd.c
@@ -0,0 +1,1018 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2015, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep their
+ * alignment.  IS_ALIGNED() tests at run time for a 2^order-byte boundary.
+ */
+#define IS_ALIGNED(ptr, order) (((size_t)(ptr) & ((1 << (order)) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = ~0;  /* ~0 = CPU not probed yet */
+static unsigned int simd_huffman = 1;   /* 0 = SIMD Huffman encoder disabled */
+
+/*
+ * Probe the CPU once and cache the result in simd_support, then apply the
+ * JSIMD_* environment overrides (each takes effect only when set to "1").
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+  char *env = NULL;
+
+  if (simd_support != ~0U)  /* already initialized by an earlier call */
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCESSE2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_SSE2;  /* mask off everything except JSIMD_SSE2 */
+  env = getenv("JSIMD_FORCEAVX2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support &= JSIMD_AVX2;  /* mask off everything except JSIMD_AVX2 */
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;  /* every jsimd_can_*() support test now fails */
+  env = getenv("JSIMD_NOHUFFENC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_huffman = 0;  /* read by jsimd_can_huff_encode_one_block() */
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* RGB->YCC SIMD needs 8-bit samples, 4-byte JDIMENSION, 3/4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* RGB->gray SIMD needs 8-bit samples, 4-byte JDIMENSION, 3/4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* YCC->RGB SIMD needs 8-bit samples, 4-byte JDIMENSION, 3/4-byte pixels */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;  /* always unavailable: jsimd_ycc_rgb565_convert() is a no-op */
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {  /* choose kernels per input colorspace */
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:  /* the X and A variants share one kernel pair */
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    break;
+  default:  /* every other colorspace value uses the non-"ext" kernels */
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; SSE2 is not re-checked */
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {  /* choose kernels per input colorspace */
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:  /* the X and A variants share one kernel pair */
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    break;
+  default:  /* every other colorspace value uses the non-"ext" kernels */
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; SSE2 is not re-checked */
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {  /* choose kernels per output colorspace */
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:  /* the X and A variants share one kernel pair */
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    break;
+  default:  /* every other colorspace value uses the non-"ext" kernels */
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; SSE2 is not re-checked */
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{  /* no-op stub: jsimd_can_ycc_rgb565() always returns 0 */
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* SIMD h2v2 downsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* SIMD h2v1 downsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* SIMD h2v2 upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* SIMD h2v1 upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; compptr is not used */
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; compptr is not used */
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* SIMD fancy upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* SIMD fancy upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* SIMD merged upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* SIMD merged upsampling needs 8-bit samples and a 4-byte JDIMENSION */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {  /* choose kernels per output colorspace */
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:  /* the X and A variants share one kernel pair */
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    break;
+  default:  /* every other colorspace value uses the non-"ext" kernels */
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; SSE2 is not re-checked */
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {  /* choose kernels per output colorspace */
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:  /* the X and A variants share one kernel pair */
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    break;
+  default:  /* every other colorspace value uses the non-"ext" kernels */
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; SSE2 is not re-checked */
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples, 4-byte JDIMENSION, 2-byte DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples, 4-byte JDIMENSION, 4-byte FAST_FLOAT */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)  /* only an SSE2 kernel is considered */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{  /* unconditional call: SSE2 is the only kernel dispatched here */
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The islow FDCT SIMD code needs DCTSIZE == 8 and a 2-byte DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The ifast FDCT SIMD code needs DCTSIZE == 8 and a 2-byte DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;  /* only an SSE2 kernel is considered */
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The float FDCT SIMD code needs DCTSIZE == 8 and a 4-byte FAST_FLOAT */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;  /* this kernel is gated on JSIMD_SSE, not JSIMD_SSE2 */
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_fdct_islow_avx2(data);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_sse2(data);  /* SSE2 is the only kernel dispatched */
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  jsimd_fdct_float_sse(data);  /* SSE is the only kernel dispatched */
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* SIMD quantization needs DCTSIZE == 8 and 2-byte JCOEF and DCTELEM */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;  /* neither AVX2 nor SSE2 is enabled in simd_support */
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE == 8, a 2-byte JCOEF and a 4-byte FAST_FLOAT */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)  /* only an SSE2 kernel is considered */
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 kernel preferred over SSE2 */
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{  /* unconditional call: SSE2 is the only kernel dispatched here */
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples and the type sizes checked below */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;  /* only SSE2; shares jconst_idct_red_sse2 with the 4x4 IDCT */
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples and the type sizes checked below */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;  /* only SSE2; shares jconst_idct_red_sse2 with the 2x2 IDCT */
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* SSE2 is the only kernel dispatched; cinfo is not used */
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{  /* SSE2 is the only kernel dispatched; cinfo is not used */
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples and the type sizes checked below */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+
+  return 0;  /* no instruction set with properly aligned constants */
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* Needs DCTSIZE 8, 8-bit samples, IFAST_SCALE_BITS 2 and the sizes below */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;  /* only an SSE2 kernel is considered */
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)  /* first of six prerequisites; any mismatch returns 0 */
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;  /* only an SSE2 kernel is considered */
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_AVX2)  /* AVX2 preferred; cinfo is not used */
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else  /* no AVX2 bit: SSE2 kernel is called without re-checking support */
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* SSE2 is the only kernel dispatched; cinfo is not used */
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{  /* SSE2 is the only kernel dispatched; cinfo is not used */
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)  /* prerequisites: DCTSIZE == 8 and a 2-byte JCOEF */
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;  /* SSE2 present, simd_huffman still set, constants aligned */
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{  /* forwards all arguments to the SSE2 kernel and returns its result */
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
diff --git a/simd/x86_64/jsimdcpu.asm b/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000..42979be
--- /dev/null
+++ b/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,78 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        rbx                     ; cpuid overwrites rbx
+    push        rdi
+
+    xor         rdi, rdi                ; simd support flag
+
+    ; Check for AVX2 instruction support
+    mov         rax, 7
+    xor         rcx, rcx
+    cpuid
+    mov         rax, rbx                ; rax = Extended feature flags
+
+    or          rdi, JSIMD_SSE2         ; SSE2 and SSE are always reported
+    or          rdi, JSIMD_SSE
+    test        rax, 1<<5               ; bit5:AVX2
+    jz          short .return
+
+    ; Check for AVX2 O/S support
+    mov         rax, 1
+    xor         rcx, rcx
+    cpuid
+    test        rcx, 1<<27
+    jz          short .return           ; O/S does not support XSAVE
+    test        rcx, 1<<28
+    jz          short .return           ; CPU does not support AVX
+
+    xor         rcx, rcx
+    xgetbv
+    and         rax, 6                  ; XCR0 bit1 (XMM) and bit2 (YMM) must
+    cmp         rax, 6                  ; BOTH be set, else the O/S does not
+    jnz         short .return           ; manage XMM/YMM state using XSAVE
+
+    or          rdi, JSIMD_AVX2
+
+.return:
+    mov         rax, rdi                ; return value = accumulated flags
+
+    pop         rdi
+    pop         rbx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/testimages/test1.icc b/testimages/test1.icc
new file mode 100644
index 0000000..d0245c8
--- /dev/null
+++ b/testimages/test1.icc
Binary files differ
diff --git a/testimages/test1.icc.txt b/testimages/test1.icc.txt
new file mode 100644
index 0000000..57fc52f
--- /dev/null
+++ b/testimages/test1.icc.txt
@@ -0,0 +1,20 @@
+Little CMS
+Copyright (c) 1998-2011 Marti Maria Saguer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/testimages/test2.icc b/testimages/test2.icc
new file mode 100644
index 0000000..73f1b5a
--- /dev/null
+++ b/testimages/test2.icc
Binary files differ
diff --git a/testimages/test2.icc.txt b/testimages/test2.icc.txt
new file mode 100644
index 0000000..57fc52f
--- /dev/null
+++ b/testimages/test2.icc.txt
@@ -0,0 +1,20 @@
+Little CMS
+Copyright (c) 1998-2011 Marti Maria Saguer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/tjbench.c b/tjbench.c
index 76b61cd..e17fd7d 100644
--- a/tjbench.c
+++ b/tjbench.c
@@ -33,978 +33,977 @@
 #include <math.h>
 #include <errno.h>
 #include <cdjpeg.h>
-#include "./bmp.h"
 #include "./tjutil.h"
 #include "./turbojpeg.h"
 
 
-#define _throw(op, err) {  \
-	printf("ERROR in line %d while %s:\n%s\n", __LINE__, op, err);  \
-	retval=-1;  goto bailout;}
+#define _throw(op, err) { \
+  printf("ERROR in line %d while %s:\n%s\n", __LINE__, op, err); \
+  retval = -1;  goto bailout; \
+}
 #define _throwunix(m) _throw(m, strerror(errno))
-#define _throwtj(m) _throw(m, tjGetErrorStr())
-#define _throwbmp(m) _throw(m, bmpgeterr())
 
-int flags=TJFLAG_NOREALLOC, componly=0, decomponly=0, doyuv=0, quiet=0,
-	dotile=0, pf=TJPF_BGR, yuvpad=1, dowrite=1;
-char *ext="ppm";
-const char *pixFormatStr[TJ_NUMPF]=
-{
-	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY", "", "", "", "", "CMYK"
+char tjErrorStr[JMSG_LENGTH_MAX] = "\0", tjErrorMsg[JMSG_LENGTH_MAX] = "\0";
+int tjErrorLine = -1, tjErrorCode = -1;
+
+#define _throwtjg(m) { \
+  printf("ERROR in line %d while %s:\n%s\n", __LINE__, m, \
+         tjGetErrorStr2(NULL)); \
+  retval = -1;  goto bailout; \
+}
+
+#define _throwtj(m) { \
+  int _tjErrorCode = tjGetErrorCode(handle); \
+  char *_tjErrorStr = tjGetErrorStr2(handle); \
+  \
+  if (!(flags & TJFLAG_STOPONWARNING) && _tjErrorCode == TJERR_WARNING) { \
+    if (strncmp(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX) || \
+        strncmp(tjErrorMsg, m, JMSG_LENGTH_MAX) || \
+        tjErrorCode != _tjErrorCode || tjErrorLine != __LINE__) { \
+      strncpy(tjErrorStr, _tjErrorStr, JMSG_LENGTH_MAX); \
+      strncpy(tjErrorMsg, m, JMSG_LENGTH_MAX); \
+      tjErrorCode = _tjErrorCode; \
+      tjErrorLine = __LINE__; \
+      printf("WARNING in line %d while %s:\n%s\n", __LINE__, m, _tjErrorStr); \
+    } \
+  } else { \
+    printf("%s in line %d while %s:\n%s\n", \
+           _tjErrorCode == TJERR_WARNING ? "WARNING" : "ERROR", __LINE__, m, \
+           _tjErrorStr); \
+    retval = -1;  goto bailout; \
+  } \
+}
+
+int flags = TJFLAG_NOREALLOC, compOnly = 0, decompOnly = 0, doYUV = 0,
+  quiet = 0, doTile = 0, pf = TJPF_BGR, yuvPad = 1, doWrite = 1;
+char *ext = "ppm";
+const char *pixFormatStr[TJ_NUMPF] = {
+  "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "GRAY", "", "", "", "", "CMYK"
 };
-const char *subNameLong[TJ_NUMSAMP]=
-{
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
+const char *subNameLong[TJ_NUMSAMP] = {
+  "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *csName[TJ_NUMCS]=
-{
-	"RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+const char *csName[TJ_NUMCS] = {
+  "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
 };
-const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
-tjscalingfactor *scalingfactors=NULL, sf={1, 1};  int nsf=0;
-int xformop=TJXOP_NONE, xformopt=0;
-int (*customFilter)(short *, tjregion, tjregion, int, int, tjtransform *);
-double benchtime=5.0, warmup=1.0;
+const char *subName[TJ_NUMSAMP] = {
+  "444", "422", "420", "GRAY", "440", "411"
+};
+tjscalingfactor *scalingFactors = NULL, sf = { 1, 1 };
+int nsf = 0, xformOp = TJXOP_NONE, xformOpt = 0;
+int (*customFilter) (short *, tjregion, tjregion, int, int, tjtransform *);
+double benchTime = 5.0, warmup = 1.0;
 
 
 char *formatName(int subsamp, int cs, char *buf)
 {
-	if(cs==TJCS_YCbCr) return (char *)subNameLong[subsamp];
-	else if(cs==TJCS_YCCK)
-	{
-		snprintf(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
-		return buf;
-	}
-	else return (char *)csName[cs];
+  if (cs == TJCS_YCbCr)
+    return (char *)subNameLong[subsamp];
+  else if (cs == TJCS_YCCK) {
+    snprintf(buf, 80, "%s %s", csName[cs], subNameLong[subsamp]);
+    return buf;
+  } else
+    return (char *)csName[cs];
 }
 
 
 char *sigfig(double val, int figs, char *buf, int len)
 {
-	char format[80];
-	int digitsafterdecimal=figs-(int)ceil(log10(fabs(val)));
-	if(digitsafterdecimal<1) snprintf(format, 80, "%%.0f");
-	else snprintf(format, 80, "%%.%df", digitsafterdecimal);
-	snprintf(buf, len, format, val);
-	return buf;
+  char format[80];
+  int digitsAfterDecimal = figs - (int)ceil(log10(fabs(val)));
+
+  if (digitsAfterDecimal < 1)
+    snprintf(format, 80, "%%.0f");
+  else
+    snprintf(format, 80, "%%.%df", digitsAfterDecimal);
+  snprintf(buf, len, format, val);
+  return buf;
 }
 
 
 /* Custom DCT filter which produces a negative of the image */
 int dummyDCTFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
-	int componentIndex, int transformIndex, tjtransform *transform)
+                   int componentIndex, int transformIndex,
+                   tjtransform *transform)
 {
-	int i;
-	for(i=0; i<arrayRegion.w*arrayRegion.h; i++) coeffs[i]=-coeffs[i];
-	return 0;
+  int i;
+
+  for (i = 0; i < arrayRegion.w * arrayRegion.h; i++)
+    coeffs[i] = -coeffs[i];
+  return 0;
 }
 
 
 /* Decompression test */
-int decomp(unsigned char *srcbuf, unsigned char **jpegbuf,
-	unsigned long *jpegsize, unsigned char *dstbuf, int w, int h,
-	int subsamp, int jpegqual, char *filename, int tilew, int tileh)
+int decomp(unsigned char *srcBuf, unsigned char **jpegBuf,
+           unsigned long *jpegSize, unsigned char *dstBuf, int w, int h,
+           int subsamp, int jpegQual, char *fileName, int tilew, int tileh)
 {
-	char tempstr[1024], sizestr[20]="\0", qualstr[6]="\0", *ptr;
-	FILE *file=NULL;  tjhandle handle=NULL;
-	int row, col, iter=0, dstbufalloc=0, retval=0;
-	double elapsed, elapsedDecode;
-	int ps=tjPixelSize[pf];
-	int scaledw=TJSCALED(w, sf);
-	int scaledh=TJSCALED(h, sf);
-	int pitch=scaledw*ps;
-	int ntilesw=(w+tilew-1)/tilew, ntilesh=(h+tileh-1)/tileh;
-	unsigned char *dstptr, *dstptr2, *yuvbuf=NULL;
+  char tempStr[1024], sizeStr[20] = "\0", qualStr[6] = "\0", *ptr;
+  FILE *file = NULL;
+  tjhandle handle = NULL;
+  int row, col, iter = 0, dstBufAlloc = 0, retval = 0;
+  double elapsed, elapsedDecode;
+  int ps = tjPixelSize[pf];
+  int scaledw = TJSCALED(w, sf);
+  int scaledh = TJSCALED(h, sf);
+  int pitch = scaledw * ps;
+  int ntilesw = (w + tilew - 1) / tilew, ntilesh = (h + tileh - 1) / tileh;
+  unsigned char *dstPtr, *dstPtr2, *yuvBuf = NULL;
 
-	if(jpegqual>0)
-	{
-		snprintf(qualstr, 6, "_Q%d", jpegqual);
-		qualstr[5]=0;
-	}
+  if (jpegQual > 0) {
+    snprintf(qualStr, 6, "_Q%d", jpegQual);
+    qualStr[5] = 0;
+  }
 
-	if((handle=tjInitDecompress())==NULL)
-		_throwtj("executing tjInitDecompress()");
+  if ((handle = tjInitDecompress()) == NULL)
+    _throwtj("executing tjInitDecompress()");
 
-	if(dstbuf==NULL)
-	{
-		if((dstbuf=(unsigned char *)malloc(pitch*scaledh))==NULL)
-			_throwunix("allocating destination buffer");
-		dstbufalloc=1;
-	}
-	/* Set the destination buffer to gray so we know whether the decompressor
-	   attempted to write to it */
-	memset(dstbuf, 127, pitch*scaledh);
+  if (dstBuf == NULL) {
+    if ((dstBuf = (unsigned char *)malloc(pitch * scaledh)) == NULL)
+      _throwunix("allocating destination buffer");
+    dstBufAlloc = 1;
+  }
+  /* Set the destination buffer to gray so we know whether the decompressor
+     attempted to write to it */
+  memset(dstBuf, 127, pitch * scaledh);
 
-	if(doyuv)
-	{
-		int width=dotile? tilew:scaledw;
-		int height=dotile? tileh:scaledh;
-		int yuvsize=tjBufSizeYUV2(width, yuvpad, height, subsamp);
-		if((yuvbuf=(unsigned char *)malloc(yuvsize))==NULL)
-			_throwunix("allocating YUV buffer");
-		memset(yuvbuf, 127, yuvsize);
-	}
+  if (doYUV) {
+    int width = doTile ? tilew : scaledw;
+    int height = doTile ? tileh : scaledh;
+    int yuvSize = tjBufSizeYUV2(width, yuvPad, height, subsamp);
 
-	/* Benchmark */
-	iter=-1;
-	elapsed=elapsedDecode=0.;
-	while(1)
-	{
-		int tile=0;
-		double start=gettime();
-		for(row=0, dstptr=dstbuf; row<ntilesh; row++, dstptr+=pitch*tileh)
-		{
-			for(col=0, dstptr2=dstptr; col<ntilesw; col++, tile++, dstptr2+=ps*tilew)
-			{
-				int width=dotile? min(tilew, w-col*tilew):scaledw;
-				int height=dotile? min(tileh, h-row*tileh):scaledh;
-				if(doyuv)
-				{
-					double startDecode;
-					if(tjDecompressToYUV2(handle, jpegbuf[tile], jpegsize[tile], yuvbuf,
-						width, yuvpad, height, flags)==-1)
-						_throwtj("executing tjDecompressToYUV2()");
-					startDecode=gettime();
-					if(tjDecodeYUV(handle, yuvbuf, yuvpad, subsamp, dstptr2, width,
-						pitch, height, pf, flags)==-1)
-						_throwtj("executing tjDecodeYUV()");
-					if(iter>=0) elapsedDecode+=gettime()-startDecode;
-				}
-				else
-					if(tjDecompress2(handle, jpegbuf[tile], jpegsize[tile], dstptr2,
-						width, pitch, height, pf, flags)==-1)
-						_throwtj("executing tjDecompress2()");
-			}
-		}
-		elapsed+=gettime()-start;
-		if(iter>=0)
-		{
-			iter++;
-			if(elapsed>=benchtime) break;
-		}
-		else if(elapsed>=warmup)
-		{
-			iter=0;
-			elapsed=elapsedDecode=0.;
-		}
-	}
-	if(doyuv) elapsed-=elapsedDecode;
+    if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+      _throwunix("allocating YUV buffer");
+    memset(yuvBuf, 127, yuvSize);
+  }
 
-	if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
-	handle=NULL;
+  /* Benchmark */
+  iter = -1;
+  elapsed = elapsedDecode = 0.;
+  while (1) {
+    int tile = 0;
+    double start = getTime();
 
-	if(quiet)
-	{
-		printf("%-6s%s",
-			sigfig((double)(w*h)/1000000.*(double)iter/elapsed, 4, tempstr, 1024),
-			quiet==2? "\n":"  ");
-		if(doyuv)
-			printf("%s\n",
-				sigfig((double)(w*h)/1000000.*(double)iter/elapsedDecode, 4, tempstr,
-					1024));
-		else if(quiet!=2) printf("\n");
-	}
-	else
-	{
-		printf("%s --> Frame rate:         %f fps\n",
-			doyuv? "Decomp to YUV":"Decompress   ", (double)iter/elapsed);
-		printf("                  Throughput:         %f Megapixels/sec\n",
-			(double)(w*h)/1000000.*(double)iter/elapsed);
-		if(doyuv)
-		{
-			printf("YUV Decode    --> Frame rate:         %f fps\n",
-				(double)iter/elapsedDecode);
-			printf("                  Throughput:         %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)iter/elapsedDecode);
-		}
-	}
+    for (row = 0, dstPtr = dstBuf; row < ntilesh;
+         row++, dstPtr += pitch * tileh) {
+      for (col = 0, dstPtr2 = dstPtr; col < ntilesw;
+           col++, tile++, dstPtr2 += ps * tilew) {
+        int width = doTile ? min(tilew, w - col * tilew) : scaledw;
+        int height = doTile ? min(tileh, h - row * tileh) : scaledh;
 
-	if (!dowrite) goto bailout;
+        if (doYUV) {
+          double startDecode;
 
-	if(sf.num!=1 || sf.denom!=1)
-		snprintf(sizestr, 20, "%d_%d", sf.num, sf.denom);
-	else if(tilew!=w || tileh!=h)
-		snprintf(sizestr, 20, "%dx%d", tilew, tileh);
-	else snprintf(sizestr, 20, "full");
-	if(decomponly)
-		snprintf(tempstr, 1024, "%s_%s.%s", filename, sizestr, ext);
-	else
-		snprintf(tempstr, 1024, "%s_%s%s_%s.%s", filename, subName[subsamp],
-			qualstr, sizestr, ext);
+          if (tjDecompressToYUV2(handle, jpegBuf[tile], jpegSize[tile], yuvBuf,
+                                 width, yuvPad, height, flags) == -1)
+            _throwtj("executing tjDecompressToYUV2()");
+          startDecode = getTime();
+          if (tjDecodeYUV(handle, yuvBuf, yuvPad, subsamp, dstPtr2, width,
+                          pitch, height, pf, flags) == -1)
+            _throwtj("executing tjDecodeYUV()");
+          if (iter >= 0) elapsedDecode += getTime() - startDecode;
+        } else if (tjDecompress2(handle, jpegBuf[tile], jpegSize[tile],
+                                 dstPtr2, width, pitch, height, pf,
+                                 flags) == -1)
+          _throwtj("executing tjDecompress2()");
+      }
+    }
+    elapsed += getTime() - start;
+    if (iter >= 0) {
+      iter++;
+      if (elapsed >= benchTime) break;
+    } else if (elapsed >= warmup) {
+      iter = 0;
+      elapsed = elapsedDecode = 0.;
+    }
+  }
+  if (doYUV) elapsed -= elapsedDecode;
 
-	if(savebmp(tempstr, dstbuf, scaledw, scaledh, pf,
-		(flags&TJFLAG_BOTTOMUP)!=0)==-1)
-		_throwbmp("saving bitmap");
-	ptr=strrchr(tempstr, '.');
-	snprintf(ptr, 1024-(ptr-tempstr), "-err.%s", ext);
-	if(srcbuf && sf.num==1 && sf.denom==1)
-	{
-		if(!quiet) printf("Compression error written to %s.\n", tempstr);
-		if(subsamp==TJ_GRAYSCALE)
-		{
-			int index, index2;
-			for(row=0, index=0; row<h; row++, index+=pitch)
-			{
-				for(col=0, index2=index; col<w; col++, index2+=ps)
-				{
-					int rindex=index2+tjRedOffset[pf];
-					int gindex=index2+tjGreenOffset[pf];
-					int bindex=index2+tjBlueOffset[pf];
-					int y=(int)((double)srcbuf[rindex]*0.299
-						+ (double)srcbuf[gindex]*0.587
-						+ (double)srcbuf[bindex]*0.114 + 0.5);
-					if(y>255) y=255;
-					if(y<0) y=0;
-					dstbuf[rindex]=abs(dstbuf[rindex]-y);
-					dstbuf[gindex]=abs(dstbuf[gindex]-y);
-					dstbuf[bindex]=abs(dstbuf[bindex]-y);
-				}
-			}
-		}
-		else
-		{
-			for(row=0; row<h; row++)
-				for(col=0; col<w*ps; col++)
-					dstbuf[pitch*row+col]
-						=abs(dstbuf[pitch*row+col]-srcbuf[pitch*row+col]);
-		}
-		if(savebmp(tempstr, dstbuf, w, h, pf,
-			(flags&TJFLAG_BOTTOMUP)!=0)==-1)
-			_throwbmp("saving bitmap");
-	}
+  if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+  handle = NULL;
 
-	bailout:
-	if(file) fclose(file);
-	if(handle) tjDestroy(handle);
-	if(dstbuf && dstbufalloc) free(dstbuf);
-	if(yuvbuf) free(yuvbuf);
-	return retval;
+  if (quiet) {
+    printf("%-6s%s",
+           sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
+                  tempStr, 1024),
+           quiet == 2 ? "\n" : "  ");
+    if (doYUV)
+      printf("%s\n",
+             sigfig((double)(w * h) / 1000000. * (double)iter / elapsedDecode,
+                    4, tempStr, 1024));
+    else if (quiet != 2) printf("\n");
+  } else {
+    printf("%s --> Frame rate:         %f fps\n",
+           doYUV ? "Decomp to YUV" : "Decompress   ", (double)iter / elapsed);
+    printf("                  Throughput:         %f Megapixels/sec\n",
+           (double)(w * h) / 1000000. * (double)iter / elapsed);
+    if (doYUV) {
+      printf("YUV Decode    --> Frame rate:         %f fps\n",
+             (double)iter / elapsedDecode);
+      printf("                  Throughput:         %f Megapixels/sec\n",
+             (double)(w * h) / 1000000. * (double)iter / elapsedDecode);
+    }
+  }
+
+  if (!doWrite) goto bailout;
+
+  if (sf.num != 1 || sf.denom != 1)
+    snprintf(sizeStr, 20, "%d_%d", sf.num, sf.denom);
+  else if (tilew != w || tileh != h)
+    snprintf(sizeStr, 20, "%dx%d", tilew, tileh);
+  else snprintf(sizeStr, 20, "full");
+  if (decompOnly)
+    snprintf(tempStr, 1024, "%s_%s.%s", fileName, sizeStr, ext);
+  else
+    snprintf(tempStr, 1024, "%s_%s%s_%s.%s", fileName, subName[subsamp],
+             qualStr, sizeStr, ext);
+
+  if (tjSaveImage(tempStr, dstBuf, scaledw, 0, scaledh, pf, flags) == -1)
+    _throwtjg("saving bitmap");
+  ptr = strrchr(tempStr, '.');
+  snprintf(ptr, 1024 - (ptr - tempStr), "-err.%s", ext);
+  if (srcBuf && sf.num == 1 && sf.denom == 1) {
+    if (!quiet) printf("Compression error written to %s.\n", tempStr);
+    if (subsamp == TJ_GRAYSCALE) {
+      int index, index2;
+
+      for (row = 0, index = 0; row < h; row++, index += pitch) {
+        for (col = 0, index2 = index; col < w; col++, index2 += ps) {
+          int rindex = index2 + tjRedOffset[pf];
+          int gindex = index2 + tjGreenOffset[pf];
+          int bindex = index2 + tjBlueOffset[pf];
+          int y = (int)((double)srcBuf[rindex] * 0.299 +
+                        (double)srcBuf[gindex] * 0.587 +
+                        (double)srcBuf[bindex] * 0.114 + 0.5);
+
+          if (y > 255) y = 255;
+          if (y < 0) y = 0;
+          dstBuf[rindex] = abs(dstBuf[rindex] - y);
+          dstBuf[gindex] = abs(dstBuf[gindex] - y);
+          dstBuf[bindex] = abs(dstBuf[bindex] - y);
+        }
+      }
+    } else {
+      for (row = 0; row < h; row++)
+        for (col = 0; col < w * ps; col++)
+          dstBuf[pitch * row + col] =
+            abs(dstBuf[pitch * row + col] - srcBuf[pitch * row + col]);
+    }
+    if (tjSaveImage(tempStr, dstBuf, w, 0, h, pf, flags) == -1)
+      _throwtjg("saving bitmap");
+  }
+
+bailout:
+  if (file) fclose(file);
+  if (handle) tjDestroy(handle);
+  if (dstBuf && dstBufAlloc) free(dstBuf);
+  if (yuvBuf) free(yuvBuf);
+  return retval;
 }
 
 
-int fullTest(unsigned char *srcbuf, int w, int h, int subsamp, int jpegqual,
-	char *filename)
+int fullTest(unsigned char *srcBuf, int w, int h, int subsamp, int jpegQual,
+             char *fileName)
 {
-	char tempstr[1024], tempstr2[80];
-	FILE *file=NULL;  tjhandle handle=NULL;
-	unsigned char **jpegbuf=NULL, *yuvbuf=NULL, *tmpbuf=NULL, *srcptr, *srcptr2;
-	double start, elapsed, elapsedEncode;
-	int totaljpegsize=0, row, col, i, tilew=w, tileh=h, retval=0;
-	int iter, yuvsize=0;
-	unsigned long *jpegsize=NULL;
-	int ps=tjPixelSize[pf];
-	int ntilesw=1, ntilesh=1, pitch=w*ps;
-	const char *pfStr=pixFormatStr[pf];
+  char tempStr[1024], tempStr2[80];
+  FILE *file = NULL;
+  tjhandle handle = NULL;
+  unsigned char **jpegBuf = NULL, *yuvBuf = NULL, *tmpBuf = NULL, *srcPtr,
+    *srcPtr2;
+  double start, elapsed, elapsedEncode;
+  int totalJpegSize = 0, row, col, i, tilew = w, tileh = h, retval = 0;
+  int iter, yuvSize = 0;
+  unsigned long *jpegSize = NULL;
+  int ps = tjPixelSize[pf];
+  int ntilesw = 1, ntilesh = 1, pitch = w * ps;
+  const char *pfStr = pixFormatStr[pf];
 
-	if((tmpbuf=(unsigned char *)malloc(pitch*h)) == NULL)
-		_throwunix("allocating temporary image buffer");
+  if ((tmpBuf = (unsigned char *)malloc(pitch * h)) == NULL)
+    _throwunix("allocating temporary image buffer");
 
-	if(!quiet)
-		printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down", subNameLong[subsamp],
-			jpegqual);
+  if (!quiet)
+    printf(">>>>>  %s (%s) <--> JPEG %s Q%d  <<<<<\n", pfStr,
+           (flags & TJFLAG_BOTTOMUP) ? "Bottom-up" : "Top-down",
+           subNameLong[subsamp], jpegQual);
 
-	for(tilew=dotile? 8:w, tileh=dotile? 8:h; ; tilew*=2, tileh*=2)
-	{
-		if(tilew>w) tilew=w;
-		if(tileh>h) tileh=h;
-		ntilesw=(w+tilew-1)/tilew;  ntilesh=(h+tileh-1)/tileh;
+  for (tilew = doTile ? 8 : w, tileh = doTile ? 8 : h; ;
+       tilew *= 2, tileh *= 2) {
+    if (tilew > w) tilew = w;
+    if (tileh > h) tileh = h;
+    ntilesw = (w + tilew - 1) / tilew;
+    ntilesh = (h + tileh - 1) / tileh;
 
-		if((jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)
-			*ntilesw*ntilesh))==NULL)
-			_throwunix("allocating JPEG tile array");
-		memset(jpegbuf, 0, sizeof(unsigned char *)*ntilesw*ntilesh);
-		if((jpegsize=(unsigned long *)malloc(sizeof(unsigned long)
-			*ntilesw*ntilesh))==NULL)
-			_throwunix("allocating JPEG size array");
-		memset(jpegsize, 0, sizeof(unsigned long)*ntilesw*ntilesh);
+    if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
+                                            ntilesw * ntilesh)) == NULL)
+      _throwunix("allocating JPEG tile array");
+    memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
+    if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
+                                            ntilesw * ntilesh)) == NULL)
+      _throwunix("allocating JPEG size array");
+    memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
-		if((flags&TJFLAG_NOREALLOC)!=0)
-			for(i=0; i<ntilesw*ntilesh; i++)
-			{
-				if((jpegbuf[i]=(unsigned char *)tjAlloc(tjBufSize(tilew, tileh,
-					subsamp)))==NULL)
-					_throwunix("allocating JPEG tiles");
-			}
+    if ((flags & TJFLAG_NOREALLOC) != 0)
+      for (i = 0; i < ntilesw * ntilesh; i++) {
+        if ((jpegBuf[i] = (unsigned char *)
+                          tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
+          _throwunix("allocating JPEG tiles");
+      }
 
-		/* Compression test */
-		if(quiet==1)
-			printf("%-4s (%s)  %-5s    %-3d   ", pfStr,
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", subNameLong[subsamp], jpegqual);
-		for(i=0; i<h; i++)
-			memcpy(&tmpbuf[pitch*i], &srcbuf[w*ps*i], w*ps);
-		if((handle=tjInitCompress())==NULL)
-			_throwtj("executing tjInitCompress()");
+    /* Compression test */
+    if (quiet == 1)
+      printf("%-4s (%s)  %-5s    %-3d   ", pfStr,
+             (flags & TJFLAG_BOTTOMUP) ? "BU" : "TD", subNameLong[subsamp],
+             jpegQual);
+    for (i = 0; i < h; i++)
+      memcpy(&tmpBuf[pitch * i], &srcBuf[w * ps * i], w * ps);
+    if ((handle = tjInitCompress()) == NULL)
+      _throwtj("executing tjInitCompress()");
 
-		if(doyuv)
-		{
-			yuvsize=tjBufSizeYUV2(tilew, yuvpad, tileh, subsamp);
-			if((yuvbuf=(unsigned char *)malloc(yuvsize))==NULL)
-				_throwunix("allocating YUV buffer");
-			memset(yuvbuf, 127, yuvsize);
-		}
+    if (doYUV) {
+      yuvSize = tjBufSizeYUV2(tilew, yuvPad, tileh, subsamp);
+      if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+        _throwunix("allocating YUV buffer");
+      memset(yuvBuf, 127, yuvSize);
+    }
 
-		/* Benchmark */
-		iter=-1;
-		elapsed=elapsedEncode=0.;
-		while(1)
-		{
-			int tile=0;
-			totaljpegsize=0;
-			start=gettime();
-			for(row=0, srcptr=srcbuf; row<ntilesh; row++, srcptr+=pitch*tileh)
-			{
-				for(col=0, srcptr2=srcptr; col<ntilesw; col++, tile++,
-					srcptr2+=ps*tilew)
-				{
-					int width=min(tilew, w-col*tilew);
-					int height=min(tileh, h-row*tileh);
-					if(doyuv)
-					{
-						double startEncode=gettime();
-						if(tjEncodeYUV3(handle, srcptr2, width, pitch, height, pf, yuvbuf,
-							yuvpad, subsamp, flags)==-1)
-							_throwtj("executing tjEncodeYUV3()");
-						if(iter>=0) elapsedEncode+=gettime()-startEncode;
-						if(tjCompressFromYUV(handle, yuvbuf, width, yuvpad, height,
-							subsamp, &jpegbuf[tile], &jpegsize[tile], jpegqual, flags)==-1)
-							_throwtj("executing tjCompressFromYUV()");
-					}
-					else
-					{
-						if(tjCompress2(handle, srcptr2, width, pitch, height, pf,
-							&jpegbuf[tile], &jpegsize[tile], subsamp, jpegqual, flags)==-1)
-							_throwtj("executing tjCompress2()");
-					}
-					totaljpegsize+=jpegsize[tile];
-				}
-			}
-			elapsed+=gettime()-start;
-			if(iter>=0)
-			{
-				iter++;
-				if(elapsed>=benchtime) break;
-			}
-			else if(elapsed>=warmup)
-			{
-				iter=0;
-				elapsed=elapsedEncode=0.;
-			}
-		}
-		if(doyuv) elapsed-=elapsedEncode;
+    /* Benchmark */
+    iter = -1;
+    elapsed = elapsedEncode = 0.;
+    while (1) {
+      int tile = 0;
 
-		if(tjDestroy(handle)==-1) _throwtj("executing tjDestroy()");
-		handle=NULL;
+      totalJpegSize = 0;
+      start = getTime();
+      for (row = 0, srcPtr = srcBuf; row < ntilesh;
+           row++, srcPtr += pitch * tileh) {
+        for (col = 0, srcPtr2 = srcPtr; col < ntilesw;
+             col++, tile++, srcPtr2 += ps * tilew) {
+          int width = min(tilew, w - col * tilew);
+          int height = min(tileh, h - row * tileh);
 
-		if(quiet==1) printf("%-5d  %-5d   ", tilew, tileh);
-		if(quiet)
-		{
-			if(doyuv)
-				printf("%-6s%s",
-					sigfig((double)(w*h)/1000000.*(double)iter/elapsedEncode, 4, tempstr,
-						1024), quiet==2? "\n":"  ");
-			printf("%-6s%s",
-				sigfig((double)(w*h)/1000000.*(double)iter/elapsed, 4,	tempstr, 1024),
-				quiet==2? "\n":"  ");
-			printf("%-6s%s",
-				sigfig((double)(w*h*ps)/(double)totaljpegsize, 4, tempstr2, 80),
-				quiet==2? "\n":"  ");
-		}
-		else
-		{
-			printf("\n%s size: %d x %d\n", dotile? "Tile":"Image", tilew,
-				tileh);
-			if(doyuv)
-			{
-				printf("Encode YUV    --> Frame rate:         %f fps\n",
-					(double)iter/elapsedEncode);
-				printf("                  Output image size:  %d bytes\n", yuvsize);
-				printf("                  Compression ratio:  %f:1\n",
-					(double)(w*h*ps)/(double)yuvsize);
-				printf("                  Throughput:         %f Megapixels/sec\n",
-					(double)(w*h)/1000000.*(double)iter/elapsedEncode);
-				printf("                  Output bit stream:  %f Megabits/sec\n",
-					(double)yuvsize*8./1000000.*(double)iter/elapsedEncode);
-			}
-			printf("%s --> Frame rate:         %f fps\n",
-				doyuv? "Comp from YUV":"Compress     ", (double)iter/elapsed);
-			printf("                  Output image size:  %d bytes\n",
-				totaljpegsize);
-			printf("                  Compression ratio:  %f:1\n",
-				(double)(w*h*ps)/(double)totaljpegsize);
-			printf("                  Throughput:         %f Megapixels/sec\n",
-				(double)(w*h)/1000000.*(double)iter/elapsed);
-			printf("                  Output bit stream:  %f Megabits/sec\n",
-				(double)totaljpegsize*8./1000000.*(double)iter/elapsed);
-		}
-		if(tilew==w && tileh==h && dowrite)
-		{
-			snprintf(tempstr, 1024, "%s_%s_Q%d.jpg", filename, subName[subsamp],
-				jpegqual);
-			if((file=fopen(tempstr, "wb"))==NULL)
-				_throwunix("opening reference image");
-			if(fwrite(jpegbuf[0], jpegsize[0], 1, file)!=1)
-				_throwunix("writing reference image");
-			fclose(file);  file=NULL;
-			if(!quiet) printf("Reference image written to %s\n", tempstr);
-		}
+          if (doYUV) {
+            double startEncode = getTime();
 
-		/* Decompression test */
-		if(!componly)
-		{
-			if(decomp(srcbuf, jpegbuf, jpegsize, tmpbuf, w, h, subsamp, jpegqual,
-				filename, tilew, tileh)==-1)
-				goto bailout;
-		}
+            if (tjEncodeYUV3(handle, srcPtr2, width, pitch, height, pf, yuvBuf,
+                             yuvPad, subsamp, flags) == -1)
+              _throwtj("executing tjEncodeYUV3()");
+            if (iter >= 0) elapsedEncode += getTime() - startEncode;
+            if (tjCompressFromYUV(handle, yuvBuf, width, yuvPad, height,
+                                  subsamp, &jpegBuf[tile], &jpegSize[tile],
+                                  jpegQual, flags) == -1)
+              _throwtj("executing tjCompressFromYUV()");
+          } else {
+            if (tjCompress2(handle, srcPtr2, width, pitch, height, pf,
+                            &jpegBuf[tile], &jpegSize[tile], subsamp, jpegQual,
+                            flags) == -1)
+              _throwtj("executing tjCompress2()");
+          }
+          totalJpegSize += jpegSize[tile];
+        }
+      }
+      elapsed += getTime() - start;
+      if (iter >= 0) {
+        iter++;
+        if (elapsed >= benchTime) break;
+      } else if (elapsed >= warmup) {
+        iter = 0;
+        elapsed = elapsedEncode = 0.;
+      }
+    }
+    if (doYUV) elapsed -= elapsedEncode;
 
-		for(i=0; i<ntilesw*ntilesh; i++)
-		{
-			if(jpegbuf[i]) tjFree(jpegbuf[i]);
-			jpegbuf[i]=NULL;
-		}
-		free(jpegbuf);  jpegbuf=NULL;
-		free(jpegsize);  jpegsize=NULL;
-		if(doyuv)
-		{
-			free(yuvbuf);  yuvbuf=NULL;
-		}
+    if (tjDestroy(handle) == -1) _throwtj("executing tjDestroy()");
+    handle = NULL;
 
-		if(tilew==w && tileh==h) break;
-	}
+    if (quiet == 1) printf("%-5d  %-5d   ", tilew, tileh);
+    if (quiet) {
+      if (doYUV)
+        printf("%-6s%s",
+               sigfig((double)(w * h) / 1000000. *
+                      (double)iter / elapsedEncode, 4, tempStr, 1024),
+               quiet == 2 ? "\n" : "  ");
+      printf("%-6s%s",
+             sigfig((double)(w * h) / 1000000. * (double)iter / elapsed, 4,
+                    tempStr, 1024),
+             quiet == 2 ? "\n" : "  ");
+      printf("%-6s%s",
+             sigfig((double)(w * h * ps) / (double)totalJpegSize, 4, tempStr2,
+                    80),
+             quiet == 2 ? "\n" : "  ");
+    } else {
+      printf("\n%s size: %d x %d\n", doTile ? "Tile" : "Image", tilew, tileh);
+      if (doYUV) {
+        printf("Encode YUV    --> Frame rate:         %f fps\n",
+               (double)iter / elapsedEncode);
+        printf("                  Output image size:  %d bytes\n", yuvSize);
+        printf("                  Compression ratio:  %f:1\n",
+               (double)(w * h * ps) / (double)yuvSize);
+        printf("                  Throughput:         %f Megapixels/sec\n",
+               (double)(w * h) / 1000000. * (double)iter / elapsedEncode);
+        printf("                  Output bit stream:  %f Megabits/sec\n",
+               (double)yuvSize * 8. / 1000000. * (double)iter / elapsedEncode);
+      }
+      printf("%s --> Frame rate:         %f fps\n",
+             doYUV ? "Comp from YUV" : "Compress     ",
+             (double)iter / elapsed);
+      printf("                  Output image size:  %d bytes\n",
+             totalJpegSize);
+      printf("                  Compression ratio:  %f:1\n",
+             (double)(w * h * ps) / (double)totalJpegSize);
+      printf("                  Throughput:         %f Megapixels/sec\n",
+             (double)(w * h) / 1000000. * (double)iter / elapsed);
+      printf("                  Output bit stream:  %f Megabits/sec\n",
+             (double)totalJpegSize * 8. / 1000000. * (double)iter / elapsed);
+    }
+    if (tilew == w && tileh == h && doWrite) {
+      snprintf(tempStr, 1024, "%s_%s_Q%d.jpg", fileName, subName[subsamp],
+               jpegQual);
+      if ((file = fopen(tempStr, "wb")) == NULL)
+        _throwunix("opening reference image");
+      if (fwrite(jpegBuf[0], jpegSize[0], 1, file) != 1)
+        _throwunix("writing reference image");
+      fclose(file);  file = NULL;
+      if (!quiet) printf("Reference image written to %s\n", tempStr);
+    }
 
-	bailout:
-	if(file) {fclose(file);  file=NULL;}
-	if(jpegbuf)
-	{
-		for(i=0; i<ntilesw*ntilesh; i++)
-		{
-			if(jpegbuf[i]) tjFree(jpegbuf[i]);
-			jpegbuf[i]=NULL;
-		}
-		free(jpegbuf);  jpegbuf=NULL;
-	}
-	if(yuvbuf) {free(yuvbuf);  yuvbuf=NULL;}
-	if(jpegsize) {free(jpegsize);  jpegsize=NULL;}
-	if(tmpbuf) {free(tmpbuf);  tmpbuf=NULL;}
-	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return retval;
+    /* Decompression test */
+    if (!compOnly) {
+      if (decomp(srcBuf, jpegBuf, jpegSize, tmpBuf, w, h, subsamp, jpegQual,
+                 fileName, tilew, tileh) == -1)
+        goto bailout;
+    }
+
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      if (jpegBuf[i]) tjFree(jpegBuf[i]);
+      jpegBuf[i] = NULL;
+    }
+    free(jpegBuf);  jpegBuf = NULL;
+    free(jpegSize);  jpegSize = NULL;
+    if (doYUV) {
+      free(yuvBuf);  yuvBuf = NULL;
+    }
+
+    if (tilew == w && tileh == h) break;
+  }
+
+bailout:
+  if (file) { fclose(file);  file = NULL; }
+  if (jpegBuf) {
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      if (jpegBuf[i]) tjFree(jpegBuf[i]);
+      jpegBuf[i] = NULL;
+    }
+    free(jpegBuf);  jpegBuf = NULL;
+  }
+  if (yuvBuf) { free(yuvBuf);  yuvBuf = NULL; }
+  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
+  if (tmpBuf) { free(tmpBuf);  tmpBuf = NULL; }
+  if (handle) { tjDestroy(handle);  handle = NULL; }
+  return retval;
 }
 
 
-int decompTest(char *filename)
+int decompTest(char *fileName)
 {
-	FILE *file=NULL;  tjhandle handle=NULL;
-	unsigned char **jpegbuf=NULL, *srcbuf=NULL;
-	unsigned long *jpegsize=NULL, srcsize, totaljpegsize;
-	tjtransform *t=NULL;
-	int w=0, h=0, subsamp=-1, cs=-1, _w, _h, _tilew, _tileh,
-		_ntilesw, _ntilesh, _subsamp;
-	char *temp=NULL, tempstr[80], tempstr2[80];
-	int row, col, i, iter, tilew, tileh, ntilesw=1, ntilesh=1, retval=0;
-	double start, elapsed;
-	int ps=tjPixelSize[pf], tile, decompsrc=0;
+  FILE *file = NULL;
+  tjhandle handle = NULL;
+  unsigned char **jpegBuf = NULL, *srcBuf = NULL;
+  unsigned long *jpegSize = NULL, srcSize, totalJpegSize;
+  tjtransform *t = NULL;
+  int w = 0, h = 0, subsamp = -1, cs = -1, _w, _h, _tilew, _tileh, _ntilesw,
+    _ntilesh, _subsamp;
+  char *temp = NULL, tempStr[80], tempStr2[80];
+  int row, col, i, iter, tilew, tileh, ntilesw = 1, ntilesh = 1, retval = 0;
+  double start, elapsed;
+  int ps = tjPixelSize[pf], tile, decompsrc = 0;
 
-	if((file=fopen(filename, "rb"))==NULL)
-		_throwunix("opening file");
-	if(fseek(file, 0, SEEK_END)<0 || (srcsize=ftell(file))==(unsigned long)-1)
-		_throwunix("determining file size");
-	if((srcbuf=(unsigned char *)malloc(srcsize))==NULL)
-		_throwunix("allocating memory");
-	if(fseek(file, 0, SEEK_SET)<0)
-		_throwunix("setting file position");
-	if(fread(srcbuf, srcsize, 1, file)<1)
-		_throwunix("reading JPEG data");
-	fclose(file);  file=NULL;
+  if ((file = fopen(fileName, "rb")) == NULL)
+    _throwunix("opening file");
+  if (fseek(file, 0, SEEK_END) < 0 ||
+      (srcSize = ftell(file)) == (unsigned long)-1)
+    _throwunix("determining file size");
+  if ((srcBuf = (unsigned char *)malloc(srcSize)) == NULL)
+    _throwunix("allocating memory");
+  if (fseek(file, 0, SEEK_SET) < 0)
+    _throwunix("setting file position");
+  if (fread(srcBuf, srcSize, 1, file) < 1)
+    _throwunix("reading JPEG data");
+  fclose(file);  file = NULL;
 
-	temp=strrchr(filename, '.');
-	if(temp!=NULL) *temp='\0';
+  temp = strrchr(fileName, '.');
+  if (temp != NULL) *temp = '\0';
 
-	if((handle=tjInitTransform())==NULL)
-		_throwtj("executing tjInitTransform()");
-	if(tjDecompressHeader3(handle, srcbuf, srcsize, &w, &h, &subsamp, &cs)==-1)
-		_throwtj("executing tjDecompressHeader3()");
-	if(cs==TJCS_YCCK || cs==TJCS_CMYK)
-	{
-		pf=TJPF_CMYK;  ps=tjPixelSize[pf];
-	}
+  if ((handle = tjInitTransform()) == NULL)
+    _throwtj("executing tjInitTransform()");
+  if (tjDecompressHeader3(handle, srcBuf, srcSize, &w, &h, &subsamp,
+                          &cs) == -1)
+    _throwtj("executing tjDecompressHeader3()");
+  if (cs == TJCS_YCCK || cs == TJCS_CMYK) {
+    pf = TJPF_CMYK;  ps = tjPixelSize[pf];
+  }
 
-	if(quiet==1)
-	{
-		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap     JPEG   JPEG     %s  %s   Xform   Comp    Decomp  ",
-			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		if(doyuv) printf("Decode");
-		printf("\n");
-		printf("Format     CS     Subsamp  Width  Height  Perf    Ratio   Perf    ");
-		if(doyuv) printf("Perf");
-		printf("\n\n");
-	}
-	else if(!quiet)
-		printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
-			formatName(subsamp, cs, tempstr), pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-up":"Top-down");
+  if (quiet == 1) {
+    printf("All performance values in Mpixels/sec\n\n");
+    printf("Bitmap     JPEG   JPEG     %s  %s   Xform   Comp    Decomp  ",
+           doTile ? "Tile " : "Image", doTile ? "Tile " : "Image");
+    if (doYUV) printf("Decode");
+    printf("\n");
+    printf("Format     CS     Subsamp  Width  Height  Perf    Ratio   Perf    ");
+    if (doYUV) printf("Perf");
+    printf("\n\n");
+  } else if (!quiet)
+    printf(">>>>>  JPEG %s --> %s (%s)  <<<<<\n",
+           formatName(subsamp, cs, tempStr), pixFormatStr[pf],
+           (flags & TJFLAG_BOTTOMUP) ? "Bottom-up" : "Top-down");
 
-	for(tilew=dotile? 16:w, tileh=dotile? 16:h; ; tilew*=2, tileh*=2)
-	{
-		if(tilew>w) tilew=w;
-		if(tileh>h) tileh=h;
-		ntilesw=(w+tilew-1)/tilew;  ntilesh=(h+tileh-1)/tileh;
+  for (tilew = doTile ? 16 : w, tileh = doTile ? 16 : h; ;
+       tilew *= 2, tileh *= 2) {
+    if (tilew > w) tilew = w;
+    if (tileh > h) tileh = h;
+    ntilesw = (w + tilew - 1) / tilew;
+    ntilesh = (h + tileh - 1) / tileh;
 
-		if((jpegbuf=(unsigned char **)malloc(sizeof(unsigned char *)
-			*ntilesw*ntilesh))==NULL)
-			_throwunix("allocating JPEG tile array");
-		memset(jpegbuf, 0, sizeof(unsigned char *)*ntilesw*ntilesh);
-		if((jpegsize=(unsigned long *)malloc(sizeof(unsigned long)
-			*ntilesw*ntilesh))==NULL)
-			_throwunix("allocating JPEG size array");
-		memset(jpegsize, 0, sizeof(unsigned long)*ntilesw*ntilesh);
+    if ((jpegBuf = (unsigned char **)malloc(sizeof(unsigned char *) *
+                                            ntilesw * ntilesh)) == NULL)
+      _throwunix("allocating JPEG tile array");
+    memset(jpegBuf, 0, sizeof(unsigned char *) * ntilesw * ntilesh);
+    if ((jpegSize = (unsigned long *)malloc(sizeof(unsigned long) *
+                                            ntilesw * ntilesh)) == NULL)
+      _throwunix("allocating JPEG size array");
+    memset(jpegSize, 0, sizeof(unsigned long) * ntilesw * ntilesh);
 
-		if((flags&TJFLAG_NOREALLOC)!=0 || !dotile)
-			for(i=0; i<ntilesw*ntilesh; i++)
-			{
-				if((jpegbuf[i]=(unsigned char *)tjAlloc(tjBufSize(tilew, tileh,
-					subsamp)))==NULL)
-					_throwunix("allocating JPEG tiles");
-			}
+    if ((flags & TJFLAG_NOREALLOC) != 0 || !doTile)
+      for (i = 0; i < ntilesw * ntilesh; i++) {
+        if ((jpegBuf[i] = (unsigned char *)
+                          tjAlloc(tjBufSize(tilew, tileh, subsamp))) == NULL)
+          _throwunix("allocating JPEG tiles");
+      }
 
-		_w=w;  _h=h;  _tilew=tilew;  _tileh=tileh;
-		if(!quiet)
-		{
-			printf("\n%s size: %d x %d", dotile? "Tile":"Image", _tilew,
-				_tileh);
-			if(sf.num!=1 || sf.denom!=1)
-				printf(" --> %d x %d", TJSCALED(_w, sf), TJSCALED(_h, sf));
-			printf("\n");
-		}
-		else if(quiet==1)
-		{
-			printf("%-4s (%s)  %-5s  %-5s    ", pixFormatStr[pf],
-				(flags&TJFLAG_BOTTOMUP)? "BU":"TD", csName[cs], subNameLong[subsamp]);
-			printf("%-5d  %-5d   ", tilew, tileh);
-		}
+    _w = w;  _h = h;  _tilew = tilew;  _tileh = tileh;
+    if (!quiet) {
+      printf("\n%s size: %d x %d", doTile ? "Tile" : "Image", _tilew, _tileh);
+      if (sf.num != 1 || sf.denom != 1)
+        printf(" --> %d x %d", TJSCALED(_w, sf), TJSCALED(_h, sf));
+      printf("\n");
+    } else if (quiet == 1) {
+      printf("%-4s (%s)  %-5s  %-5s    ", pixFormatStr[pf],
+             (flags & TJFLAG_BOTTOMUP) ? "BU" : "TD", csName[cs],
+             subNameLong[subsamp]);
+      printf("%-5d  %-5d   ", tilew, tileh);
+    }
 
-		_subsamp=subsamp;
-		if(dotile || xformop!=TJXOP_NONE || xformopt!=0 || customFilter)
-		{
-			if((t=(tjtransform *)malloc(sizeof(tjtransform)*ntilesw*ntilesh))
-				==NULL)
-				_throwunix("allocating image transform array");
+    _subsamp = subsamp;
+    if (doTile || xformOp != TJXOP_NONE || xformOpt != 0 || customFilter) {
+      if ((t = (tjtransform *)malloc(sizeof(tjtransform) * ntilesw *
+                                     ntilesh)) == NULL)
+        _throwunix("allocating image transform array");
 
-			if(xformop==TJXOP_TRANSPOSE || xformop==TJXOP_TRANSVERSE
-				|| xformop==TJXOP_ROT90 || xformop==TJXOP_ROT270)
-			{
-				_w=h;  _h=w;  _tilew=tileh;  _tileh=tilew;
-			}
+      if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
+          xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
+        _w = h;  _h = w;  _tilew = tileh;  _tileh = tilew;
+      }
 
-			if(xformopt&TJXOPT_GRAY) _subsamp=TJ_GRAYSCALE;
-			if(xformop==TJXOP_HFLIP || xformop==TJXOP_ROT180)
-				_w=_w-(_w%tjMCUWidth[_subsamp]);
-			if(xformop==TJXOP_VFLIP || xformop==TJXOP_ROT180)
-				_h=_h-(_h%tjMCUHeight[_subsamp]);
-			if(xformop==TJXOP_TRANSVERSE || xformop==TJXOP_ROT90)
-				_w=_w-(_w%tjMCUHeight[_subsamp]);
-			if(xformop==TJXOP_TRANSVERSE || xformop==TJXOP_ROT270)
-				_h=_h-(_h%tjMCUWidth[_subsamp]);
-			_ntilesw=(_w+_tilew-1)/_tilew;
-			_ntilesh=(_h+_tileh-1)/_tileh;
+      if (xformOpt & TJXOPT_GRAY) _subsamp = TJ_GRAYSCALE;
+      if (xformOp == TJXOP_HFLIP || xformOp == TJXOP_ROT180)
+        _w = _w - (_w % tjMCUWidth[_subsamp]);
+      if (xformOp == TJXOP_VFLIP || xformOp == TJXOP_ROT180)
+        _h = _h - (_h % tjMCUHeight[_subsamp]);
+      if (xformOp == TJXOP_TRANSVERSE || xformOp == TJXOP_ROT90)
+        _w = _w - (_w % tjMCUHeight[_subsamp]);
+      if (xformOp == TJXOP_TRANSVERSE || xformOp == TJXOP_ROT270)
+        _h = _h - (_h % tjMCUWidth[_subsamp]);
+      _ntilesw = (_w + _tilew - 1) / _tilew;
+      _ntilesh = (_h + _tileh - 1) / _tileh;
 
-			if(xformop==TJXOP_TRANSPOSE || xformop==TJXOP_TRANSVERSE
-				|| xformop==TJXOP_ROT90 || xformop==TJXOP_ROT270)
-			{
-				if(_subsamp==TJSAMP_422) _subsamp=TJSAMP_440;
-				else if(_subsamp==TJSAMP_440) _subsamp=TJSAMP_422;
-			}
+      if (xformOp == TJXOP_TRANSPOSE || xformOp == TJXOP_TRANSVERSE ||
+          xformOp == TJXOP_ROT90 || xformOp == TJXOP_ROT270) {
+        if (_subsamp == TJSAMP_422) _subsamp = TJSAMP_440;
+        else if (_subsamp == TJSAMP_440) _subsamp = TJSAMP_422;
+      }
 
-			for(row=0, tile=0; row<_ntilesh; row++)
-			{
-				for(col=0; col<_ntilesw; col++, tile++)
-				{
-					t[tile].r.w=min(_tilew, _w-col*_tilew);
-					t[tile].r.h=min(_tileh, _h-row*_tileh);
-					t[tile].r.x=col*_tilew;
-					t[tile].r.y=row*_tileh;
-					t[tile].op=xformop;
-					t[tile].options=xformopt|TJXOPT_TRIM;
-					t[tile].customFilter=customFilter;
-					if(t[tile].options&TJXOPT_NOOUTPUT && jpegbuf[tile])
-					{
-						tjFree(jpegbuf[tile]);  jpegbuf[tile]=NULL;
-					}
-				}
-			}
+      for (row = 0, tile = 0; row < _ntilesh; row++) {
+        for (col = 0; col < _ntilesw; col++, tile++) {
+          t[tile].r.w = min(_tilew, _w - col * _tilew);
+          t[tile].r.h = min(_tileh, _h - row * _tileh);
+          t[tile].r.x = col * _tilew;
+          t[tile].r.y = row * _tileh;
+          t[tile].op = xformOp;
+          t[tile].options = xformOpt | TJXOPT_TRIM;
+          t[tile].customFilter = customFilter;
+          if (t[tile].options & TJXOPT_NOOUTPUT && jpegBuf[tile]) {
+            tjFree(jpegBuf[tile]);  jpegBuf[tile] = NULL;
+          }
+        }
+      }
 
-			iter=-1;
-			elapsed=0.;
-			while(1)
-			{
-				start=gettime();
-				if(tjTransform(handle, srcbuf, srcsize, _ntilesw*_ntilesh, jpegbuf,
-					jpegsize, t, flags)==-1)
-					_throwtj("executing tjTransform()");
-				elapsed+=gettime()-start;
-				if(iter>=0)
-				{
-					iter++;
-					if(elapsed>=benchtime) break;
-				}
-				else if(elapsed>=warmup)
-				{
-					iter=0;
-					elapsed=0.;
-				}
-			}
+      iter = -1;
+      elapsed = 0.;
+      while (1) {
+        start = getTime();
+        if (tjTransform(handle, srcBuf, srcSize, _ntilesw * _ntilesh, jpegBuf,
+                        jpegSize, t, flags) == -1)
+          _throwtj("executing tjTransform()");
+        elapsed += getTime() - start;
+        if (iter >= 0) {
+          iter++;
+          if (elapsed >= benchTime) break;
+        } else if (elapsed >= warmup) {
+          iter = 0;
+          elapsed = 0.;
+        }
+      }
 
-			free(t);  t=NULL;
+      free(t);  t = NULL;
 
-			for(tile=0, totaljpegsize=0; tile<_ntilesw*_ntilesh; tile++)
-				totaljpegsize+=jpegsize[tile];
+      for (tile = 0, totalJpegSize = 0; tile < _ntilesw * _ntilesh; tile++)
+        totalJpegSize += jpegSize[tile];
 
-			if(quiet)
-			{
-				printf("%-6s%s%-6s%s",
-					sigfig((double)(w*h)/1000000./elapsed, 4, tempstr, 80),
-					quiet==2? "\n":"  ",
-					sigfig((double)(w*h*ps)/(double)totaljpegsize, 4, tempstr2, 80),
-					quiet==2? "\n":"  ");
-			}
-			else if(!quiet)
-			{
-				printf("Transform     --> Frame rate:         %f fps\n", 1.0/elapsed);
-				printf("                  Output image size:  %lu bytes\n", totaljpegsize);
-				printf("                  Compression ratio:  %f:1\n",
-					(double)(w*h*ps)/(double)totaljpegsize);
-				printf("                  Throughput:         %f Megapixels/sec\n",
-					(double)(w*h)/1000000./elapsed);
-				printf("                  Output bit stream:  %f Megabits/sec\n",
-					(double)totaljpegsize*8./1000000./elapsed);
-			}
-		}
-		else
-		{
-			if(quiet==1) printf("N/A     N/A     ");
-			tjFree(jpegbuf[0]);
-			jpegbuf[0]=NULL;
-			decompsrc=1;
-		}
+      if (quiet) {
+        printf("%-6s%s%-6s%s",
+               sigfig((double)(w * h) / 1000000. / elapsed, 4, tempStr, 80),
+               quiet == 2 ? "\n" : "  ",
+               sigfig((double)(w * h * ps) / (double)totalJpegSize, 4,
+                      tempStr2, 80),
+               quiet == 2 ? "\n" : "  ");
+      } else if (!quiet) {
+        printf("Transform     --> Frame rate:         %f fps\n",
+               1.0 / elapsed);
+        printf("                  Output image size:  %lu bytes\n",
+               totalJpegSize);
+        printf("                  Compression ratio:  %f:1\n",
+               (double)(w * h * ps) / (double)totalJpegSize);
+        printf("                  Throughput:         %f Megapixels/sec\n",
+               (double)(w * h) / 1000000. / elapsed);
+        printf("                  Output bit stream:  %f Megabits/sec\n",
+               (double)totalJpegSize * 8. / 1000000. / elapsed);
+      }
+    } else {
+      if (quiet == 1) printf("N/A     N/A     ");
+      tjFree(jpegBuf[0]);
+      jpegBuf[0] = NULL;
+      decompsrc = 1;
+    }
 
-		if(w==tilew) _tilew=_w;
-		if(h==tileh) _tileh=_h;
-		if(!(xformopt&TJXOPT_NOOUTPUT))
-		{
-			if(decomp(NULL, decompsrc? &srcbuf:jpegbuf, decompsrc? &srcsize:jpegsize,
-					NULL, _w, _h, _subsamp, 0, filename, _tilew, _tileh)==-1)
-				goto bailout;
-		}
-		else if(quiet==1) printf("N/A\n");
+    if (w == tilew) _tilew = _w;
+    if (h == tileh) _tileh = _h;
+    if (!(xformOpt & TJXOPT_NOOUTPUT)) {
+      if (decomp(NULL, decompsrc ? &srcBuf : jpegBuf,
+                 decompsrc ? &srcSize : jpegSize, NULL, _w, _h, _subsamp, 0,
+                 fileName, _tilew, _tileh) == -1)
+        goto bailout;
+    } else if (quiet == 1) printf("N/A\n");
 
-		for(i=0; i<ntilesw*ntilesh; i++)
-		{
-			tjFree(jpegbuf[i]);  jpegbuf[i]=NULL;
-		}
-		free(jpegbuf);  jpegbuf=NULL;
-		if(jpegsize) {free(jpegsize);  jpegsize=NULL;}
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      tjFree(jpegBuf[i]);  jpegBuf[i] = NULL;
+    }
+    free(jpegBuf);  jpegBuf = NULL;
+    if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
 
-		if(tilew==w && tileh==h) break;
-	}
+    if (tilew == w && tileh == h) break;
+  }
 
-	bailout:
-	if(file) {fclose(file);  file=NULL;}
-	if(jpegbuf)
-	{
-		for(i=0; i<ntilesw*ntilesh; i++)
-		{
-			if(jpegbuf[i]) tjFree(jpegbuf[i]);
-			jpegbuf[i]=NULL;
-		}
-		free(jpegbuf);  jpegbuf=NULL;
-	}
-	if(jpegsize) {free(jpegsize);  jpegsize=NULL;}
-	if(srcbuf) {free(srcbuf);  srcbuf=NULL;}
-	if(t) {free(t);  t=NULL;}
-	if(handle) {tjDestroy(handle);  handle=NULL;}
-	return retval;
+bailout:
+  if (file) { fclose(file);  file = NULL; }
+  if (jpegBuf) {
+    for (i = 0; i < ntilesw * ntilesh; i++) {
+      if (jpegBuf[i]) tjFree(jpegBuf[i]);
+      jpegBuf[i] = NULL;
+    }
+    free(jpegBuf);  jpegBuf = NULL;
+  }
+  if (jpegSize) { free(jpegSize);  jpegSize = NULL; }
+  if (srcBuf) { free(srcBuf);  srcBuf = NULL; }
+  if (t) { free(t);  t = NULL; }
+  if (handle) { tjDestroy(handle);  handle = NULL; }
+  return retval;
 }
 
 
-void usage(char *progname)
+void usage(char *progName)
 {
-	int i;
-	printf("USAGE: %s\n", progname);
-	printf("       <Inputfile (BMP|PPM)> <Quality> [options]\n\n");
-	printf("       %s\n", progname);
-	printf("       <Inputfile (JPG)> [options]\n\n");
-	printf("Options:\n\n");
-	printf("-alloc = Dynamically allocate JPEG image buffers\n");
-	printf("-bmp = Generate output images in Windows Bitmap format (default = PPM)\n");
-	printf("-bottomup = Test bottom-up compression/decompression\n");
-	printf("-tile = Test performance of the codec when the image is encoded as separate\n");
-	printf("     tiles of varying sizes.\n");
-	printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
-	printf("     Test the specified color conversion path in the codec (default = BGR)\n");
-	printf("-cmyk = Indirectly test YCCK JPEG compression/decompression (the source\n");
-	printf("     and destination bitmaps are still RGB.  The conversion is done\n");
-	printf("     internally prior to compression or after decompression.)\n");
-	printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
-	printf("     the underlying codec\n");
-	printf("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying\n");
-	printf("     codec\n");
-	printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
-	printf("     underlying codec\n");
-	printf("-subsamp <s> = When testing JPEG compression, this option specifies the level\n");
-	printf("     of chrominance subsampling to use (<s> = 444, 422, 440, 420, 411, or\n");
-	printf("     GRAY).  The default is to test Grayscale, 4:2:0, 4:2:2, and 4:4:4 in\n");
-	printf("     sequence.\n");
-	printf("-quiet = Output results in tabular rather than verbose format\n");
-	printf("-yuv = Test YUV encoding/decoding functions\n");
-	printf("-yuvpad <p> = If testing YUV encoding/decoding, this specifies the number of\n");
-	printf("     bytes to which each row of each plane in the intermediate YUV image is\n");
-	printf("     padded (default = 1)\n");
-	printf("-scale M/N = Scale down the width/height of the decompressed JPEG image by a\n");
-	printf("     factor of M/N (M/N = ");
-	for(i=0; i<nsf; i++)
-	{
-		printf("%d/%d", scalingfactors[i].num, scalingfactors[i].denom);
-		if(nsf==2 && i!=nsf-1) printf(" or ");
-		else if(nsf>2)
-		{
-			if(i!=nsf-1) printf(", ");
-			if(i==nsf-2) printf("or ");
-		}
-		if(i%8==0 && i!=0) printf("\n     ");
-	}
-	printf(")\n");
-	printf("-hflip, -vflip, -transpose, -transverse, -rot90, -rot180, -rot270 =\n");
-	printf("     Perform the corresponding lossless transform prior to\n");
-	printf("     decompression (these options are mutually exclusive)\n");
-	printf("-grayscale = Perform lossless grayscale conversion prior to decompression\n");
-	printf("     test (can be combined with the other transforms above)\n");
-	printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
-	printf("-warmup <t> = Run each benchmark for <t> seconds (default = 1.0) prior to\n");
-	printf("     starting the timer, in order to prime the caches and thus improve the\n");
-	printf("     consistency of the results.\n");
-	printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
-	printf("-nowrite = Do not write reference or output images (improves consistency of\n");
-	printf("     performance measurements.)\n\n");
-	printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
-	printf("test will be performed for all quality values in the range.\n\n");
-	exit(1);
+  int i;
+
+  printf("USAGE: %s\n", progName);
+  printf("       <Inputfile (BMP|PPM)> <Quality> [options]\n\n");
+  printf("       %s\n", progName);
+  printf("       <Inputfile (JPG)> [options]\n\n");
+  printf("Options:\n\n");
+  printf("-alloc = Dynamically allocate JPEG image buffers\n");
+  printf("-bmp = Generate output images in Windows Bitmap format (default = PPM)\n");
+  printf("-bottomup = Test bottom-up compression/decompression\n");
+  printf("-tile = Test performance of the codec when the image is encoded as separate\n");
+  printf("     tiles of varying sizes.\n");
+  printf("-rgb, -bgr, -rgbx, -bgrx, -xbgr, -xrgb =\n");
+  printf("     Test the specified color conversion path in the codec (default = BGR)\n");
+  printf("-cmyk = Indirectly test YCCK JPEG compression/decompression (the source\n");
+  printf("     and destination bitmaps are still RGB.  The conversion is done\n");
+  printf("     internally prior to compression or after decompression.)\n");
+  printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
+  printf("     the underlying codec\n");
+  printf("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying\n");
+  printf("     codec\n");
+  printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
+  printf("     underlying codec\n");
+  printf("-progressive = Use progressive entropy coding in JPEG images generated by\n");
+  printf("     compression and transform operations.\n");
+  printf("-subsamp <s> = When testing JPEG compression, this option specifies the level\n");
+  printf("     of chrominance subsampling to use (<s> = 444, 422, 440, 420, 411, or\n");
+  printf("     GRAY).  The default is to test Grayscale, 4:2:0, 4:2:2, and 4:4:4 in\n");
+  printf("     sequence.\n");
+  printf("-quiet = Output results in tabular rather than verbose format\n");
+  printf("-yuv = Test YUV encoding/decoding functions\n");
+  printf("-yuvpad <p> = If testing YUV encoding/decoding, this specifies the number of\n");
+  printf("     bytes to which each row of each plane in the intermediate YUV image is\n");
+  printf("     padded (default = 1)\n");
+  printf("-scale M/N = Scale down the width/height of the decompressed JPEG image by a\n");
+  printf("     factor of M/N (M/N = ");
+  for (i = 0; i < nsf; i++) {
+    printf("%d/%d", scalingFactors[i].num, scalingFactors[i].denom);
+    if (nsf == 2 && i != nsf - 1) printf(" or ");
+    else if (nsf > 2) {
+      if (i != nsf - 1) printf(", ");
+      if (i == nsf - 2) printf("or ");
+    }
+    if (i % 8 == 0 && i != 0) printf("\n     ");
+  }
+  printf(")\n");
+  printf("-hflip, -vflip, -transpose, -transverse, -rot90, -rot180, -rot270 =\n");
+  printf("     Perform the corresponding lossless transform prior to\n");
+  printf("     decompression (these options are mutually exclusive)\n");
+  printf("-grayscale = Perform lossless grayscale conversion prior to decompression\n");
+  printf("     test (can be combined with the other transforms above)\n");
+  printf("-copynone = Do not copy any extra markers (including EXIF and ICC profile data)\n");
+  printf("     when transforming the image.\n");
+  printf("-benchtime <t> = Run each benchmark for at least <t> seconds (default = 5.0)\n");
+  printf("-warmup <t> = Run each benchmark for <t> seconds (default = 1.0) prior to\n");
+  printf("     starting the timer, in order to prime the caches and thus improve the\n");
+  printf("     consistency of the results.\n");
+  printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
+  printf("-nowrite = Do not write reference or output images (improves consistency of\n");
+  printf("     performance measurements.)\n");
+  printf("-stoponwarning = Immediately discontinue the current\n");
+  printf("     compression/decompression/transform operation if the underlying codec\n");
+  printf("     throws a warning (non-fatal error)\n\n");
+  printf("NOTE:  If the quality is specified as a range (e.g. 90-100), a separate\n");
+  printf("test will be performed for all quality values in the range.\n\n");
+  exit(1);
 }
 
 
 int main(int argc, char *argv[])
 {
-	unsigned char *srcbuf=NULL;  int w=0, h=0, i, j;
-	int minqual=-1, maxqual=-1;  char *temp;
-	int minarg=2, retval=0, subsamp=-1;
+  unsigned char *srcBuf = NULL;
+  int w = 0, h = 0, i, j, minQual = -1, maxQual = -1;
+  char *temp;
+  int minArg = 2, retval = 0, subsamp = -1;
 
-	if((scalingfactors=tjGetScalingFactors(&nsf))==NULL || nsf==0)
-		_throwtj("executing tjGetScalingFactors()");
+  if ((scalingFactors = tjGetScalingFactors(&nsf)) == NULL || nsf == 0)
+    _throw("executing tjGetScalingFactors()", tjGetErrorStr());
 
-	if(argc<minarg) usage(argv[0]);
+  if (argc < minArg) usage(argv[0]);
 
-	temp=strrchr(argv[1], '.');
-	if(temp!=NULL)
-	{
-		if(!strcasecmp(temp, ".bmp")) ext="bmp";
-		if(!strcasecmp(temp, ".jpg") || !strcasecmp(temp, ".jpeg")) decomponly=1;
-	}
+  temp = strrchr(argv[1], '.');
+  if (temp != NULL) {
+    if (!strcasecmp(temp, ".bmp")) ext = "bmp";
+    if (!strcasecmp(temp, ".jpg") || !strcasecmp(temp, ".jpeg"))
+      decompOnly = 1;
+  }
 
-	printf("\n");
+  printf("\n");
 
-	if(!decomponly)
-	{
-		minarg=3;
-		if(argc<minarg) usage(argv[0]);
-		if((minqual=atoi(argv[2]))<1 || minqual>100)
-		{
-			puts("ERROR: Quality must be between 1 and 100.");
-			exit(1);
-		}
-		if((temp=strchr(argv[2], '-'))!=NULL && strlen(temp)>1
-			&& sscanf(&temp[1], "%d", &maxqual)==1 && maxqual>minqual && maxqual>=1
-			&& maxqual<=100) {}
-		else maxqual=minqual;
-	}
+  if (!decompOnly) {
+    minArg = 3;
+    if (argc < minArg) usage(argv[0]);
+    if ((minQual = atoi(argv[2])) < 1 || minQual > 100) {
+      puts("ERROR: Quality must be between 1 and 100.");
+      exit(1);
+    }
+    if ((temp = strchr(argv[2], '-')) != NULL && strlen(temp) > 1 &&
+        sscanf(&temp[1], "%d", &maxQual) == 1 && maxQual > minQual &&
+        maxQual >= 1 && maxQual <= 100) {}
+    else maxQual = minQual;
+  }
 
-	if(argc>minarg)
-	{
-		for(i=minarg; i<argc; i++)
-		{
-			if(!strcasecmp(argv[i], "-tile"))
-			{
-				dotile=1;  xformopt|=TJXOPT_CROP;
-			}
-			else if(!strcasecmp(argv[i], "-fastupsample"))
-			{
-				printf("Using fast upsampling code\n\n");
-				flags|=TJFLAG_FASTUPSAMPLE;
-			}
-			else if(!strcasecmp(argv[i], "-fastdct"))
-			{
-				printf("Using fastest DCT/IDCT algorithm\n\n");
-				flags|=TJFLAG_FASTDCT;
-			}
-			else if(!strcasecmp(argv[i], "-accuratedct"))
-			{
-				printf("Using most accurate DCT/IDCT algorithm\n\n");
-				flags|=TJFLAG_ACCURATEDCT;
-			}
-			else if(!strcasecmp(argv[i], "-rgb")) pf=TJPF_RGB;
-			else if(!strcasecmp(argv[i], "-rgbx")) pf=TJPF_RGBX;
-			else if(!strcasecmp(argv[i], "-bgr")) pf=TJPF_BGR;
-			else if(!strcasecmp(argv[i], "-bgrx")) pf=TJPF_BGRX;
-			else if(!strcasecmp(argv[i], "-xbgr")) pf=TJPF_XBGR;
-			else if(!strcasecmp(argv[i], "-xrgb")) pf=TJPF_XRGB;
-			else if(!strcasecmp(argv[i], "-cmyk")) pf=TJPF_CMYK;
-			else if(!strcasecmp(argv[i], "-bottomup")) flags|=TJFLAG_BOTTOMUP;
-			else if(!strcasecmp(argv[i], "-quiet")) quiet=1;
-			else if(!strcasecmp(argv[i], "-qq")) quiet=2;
-			else if(!strcasecmp(argv[i], "-scale") && i<argc-1)
-			{
-				int temp1=0, temp2=0, match=0;
-				if(sscanf(argv[++i], "%d/%d", &temp1, &temp2)==2)
-				{
-					for(j=0; j<nsf; j++)
-					{
-						if((double)temp1/(double)temp2
-							== (double)scalingfactors[j].num/(double)scalingfactors[j].denom)
-						{
-							sf=scalingfactors[j];
-							match=1;  break;
-						}
-					}
-					if(!match) usage(argv[0]);
-				}
-				else usage(argv[0]);
-			}
-			else if(!strcasecmp(argv[i], "-hflip")) xformop=TJXOP_HFLIP;
-			else if(!strcasecmp(argv[i], "-vflip")) xformop=TJXOP_VFLIP;
-			else if(!strcasecmp(argv[i], "-transpose")) xformop=TJXOP_TRANSPOSE;
-			else if(!strcasecmp(argv[i], "-transverse")) xformop=TJXOP_TRANSVERSE;
-			else if(!strcasecmp(argv[i], "-rot90")) xformop=TJXOP_ROT90;
-			else if(!strcasecmp(argv[i], "-rot180")) xformop=TJXOP_ROT180;
-			else if(!strcasecmp(argv[i], "-rot270")) xformop=TJXOP_ROT270;
-			else if(!strcasecmp(argv[i], "-grayscale")) xformopt|=TJXOPT_GRAY;
-			else if(!strcasecmp(argv[i], "-custom")) customFilter=dummyDCTFilter;
-			else if(!strcasecmp(argv[i], "-nooutput")) xformopt|=TJXOPT_NOOUTPUT;
-			else if(!strcasecmp(argv[i], "-benchtime") && i<argc-1)
-			{
-				double temp=atof(argv[++i]);
-				if(temp>0.0) benchtime=temp;
-				else usage(argv[0]);
-			}
-			else if(!strcasecmp(argv[i], "-warmup") && i<argc-1)
-			{
-				double temp=atof(argv[++i]);
-				if(temp>=0.0) warmup=temp;
-				else usage(argv[0]);
-				printf("Warmup time = %.1f seconds\n\n", warmup);
-			}
-			else if(!strcasecmp(argv[i], "-alloc")) flags&=(~TJFLAG_NOREALLOC);
-			else if(!strcasecmp(argv[i], "-bmp")) ext="bmp";
-			else if(!strcasecmp(argv[i], "-yuv"))
-			{
-				printf("Testing YUV planar encoding/decoding\n\n");
-				doyuv=1;
-			}
-			else if(!strcasecmp(argv[i], "-yuvpad") && i<argc-1)
-			{
-				int temp=atoi(argv[++i]);
-				if(temp>=1) yuvpad=temp;
-			}
-			else if(!strcasecmp(argv[i], "-subsamp") && i<argc-1)
-			{
-				i++;
-				if(toupper(argv[i][0])=='G') subsamp=TJSAMP_GRAY;
-				else
-				{
-					int temp=atoi(argv[i]);
-					switch(temp)
-					{
-						case 444:  subsamp=TJSAMP_444;  break;
-						case 422:  subsamp=TJSAMP_422;  break;
-						case 440:  subsamp=TJSAMP_440;  break;
-						case 420:  subsamp=TJSAMP_420;  break;
-						case 411:  subsamp=TJSAMP_411;  break;
-					}
-				}
-			}
-			else if(!strcasecmp(argv[i], "-componly")) componly=1;
-			else if(!strcasecmp(argv[i], "-nowrite")) dowrite=0;
-			else usage(argv[0]);
-		}
-	}
+  if (argc > minArg) {
+    for (i = minArg; i < argc; i++) {
+      if (!strcasecmp(argv[i], "-tile")) {
+        doTile = 1;  xformOpt |= TJXOPT_CROP;
+      } else if (!strcasecmp(argv[i], "-fastupsample")) {
+        printf("Using fast upsampling code\n\n");
+        flags |= TJFLAG_FASTUPSAMPLE;
+      } else if (!strcasecmp(argv[i], "-fastdct")) {
+        printf("Using fastest DCT/IDCT algorithm\n\n");
+        flags |= TJFLAG_FASTDCT;
+      } else if (!strcasecmp(argv[i], "-accuratedct")) {
+        printf("Using most accurate DCT/IDCT algorithm\n\n");
+        flags |= TJFLAG_ACCURATEDCT;
+      } else if (!strcasecmp(argv[i], "-progressive")) {
+        printf("Using progressive entropy coding\n\n");
+        flags |= TJFLAG_PROGRESSIVE;
+      } else if (!strcasecmp(argv[i], "-rgb"))
+        pf = TJPF_RGB;
+      else if (!strcasecmp(argv[i], "-rgbx"))
+        pf = TJPF_RGBX;
+      else if (!strcasecmp(argv[i], "-bgr"))
+        pf = TJPF_BGR;
+      else if (!strcasecmp(argv[i], "-bgrx"))
+        pf = TJPF_BGRX;
+      else if (!strcasecmp(argv[i], "-xbgr"))
+        pf = TJPF_XBGR;
+      else if (!strcasecmp(argv[i], "-xrgb"))
+        pf = TJPF_XRGB;
+      else if (!strcasecmp(argv[i], "-cmyk"))
+        pf = TJPF_CMYK;
+      else if (!strcasecmp(argv[i], "-bottomup"))
+        flags |= TJFLAG_BOTTOMUP;
+      else if (!strcasecmp(argv[i], "-quiet"))
+        quiet = 1;
+      else if (!strcasecmp(argv[i], "-qq"))
+        quiet = 2;
+      else if (!strcasecmp(argv[i], "-scale") && i < argc - 1) {
+        int temp1 = 0, temp2 = 0, match = 0;
 
-	if((sf.num!=1 || sf.denom!=1) && dotile)
-	{
-		printf("Disabling tiled compression/decompression tests, because those tests do not\n");
-		printf("work when scaled decompression is enabled.\n");
-		dotile=0;
-	}
+        if (sscanf(argv[++i], "%d/%d", &temp1, &temp2) == 2) {
+          for (j = 0; j < nsf; j++) {
+            if ((double)temp1 / (double)temp2 ==
+                (double)scalingFactors[j].num /
+                (double)scalingFactors[j].denom) {
+              sf = scalingFactors[j];
+              match = 1;  break;
+            }
+          }
+          if (!match) usage(argv[0]);
+        } else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-hflip"))
+        xformOp = TJXOP_HFLIP;
+      else if (!strcasecmp(argv[i], "-vflip"))
+        xformOp = TJXOP_VFLIP;
+      else if (!strcasecmp(argv[i], "-transpose"))
+        xformOp = TJXOP_TRANSPOSE;
+      else if (!strcasecmp(argv[i], "-transverse"))
+        xformOp = TJXOP_TRANSVERSE;
+      else if (!strcasecmp(argv[i], "-rot90"))
+        xformOp = TJXOP_ROT90;
+      else if (!strcasecmp(argv[i], "-rot180"))
+        xformOp = TJXOP_ROT180;
+      else if (!strcasecmp(argv[i], "-rot270"))
+        xformOp = TJXOP_ROT270;
+      else if (!strcasecmp(argv[i], "-grayscale"))
+        xformOpt |= TJXOPT_GRAY;
+      else if (!strcasecmp(argv[i], "-custom"))
+        customFilter = dummyDCTFilter;
+      else if (!strcasecmp(argv[i], "-nooutput"))
+        xformOpt |= TJXOPT_NOOUTPUT;
+      else if (!strcasecmp(argv[i], "-copynone"))
+        xformOpt |= TJXOPT_COPYNONE;
+      else if (!strcasecmp(argv[i], "-benchtime") && i < argc - 1) {
+        double temp = atof(argv[++i]);
 
-	if((flags&TJFLAG_NOREALLOC)==0 && dotile)
-	{
-		printf("Disabling tiled compression/decompression tests, because those tests do not\n");
-		printf("work when dynamic JPEG buffer allocation is enabled.\n\n");
-		dotile=0;
-	}
+        if (temp > 0.0) benchTime = temp;
+        else usage(argv[0]);
+      } else if (!strcasecmp(argv[i], "-warmup") && i < argc - 1) {
+        double temp = atof(argv[++i]);
 
-	if(!decomponly)
-	{
-		if(loadbmp(argv[1], &srcbuf, &w, &h, pf, (flags&TJFLAG_BOTTOMUP)!=0)==-1)
-			_throwbmp("loading bitmap");
-		temp=strrchr(argv[1], '.');
-		if(temp!=NULL) *temp='\0';
-	}
+        if (temp >= 0.0) warmup = temp;
+        else usage(argv[0]);
+        printf("Warmup time = %.1f seconds\n\n", warmup);
+      } else if (!strcasecmp(argv[i], "-alloc"))
+        flags &= (~TJFLAG_NOREALLOC);
+      else if (!strcasecmp(argv[i], "-bmp"))
+        ext = "bmp";
+      else if (!strcasecmp(argv[i], "-yuv")) {
+        printf("Testing YUV planar encoding/decoding\n\n");
+        doYUV = 1;
+      } else if (!strcasecmp(argv[i], "-yuvpad") && i < argc - 1) {
+        int temp = atoi(argv[++i]);
 
-	if(quiet==1 && !decomponly)
-	{
-		printf("All performance values in Mpixels/sec\n\n");
-		printf("Bitmap     JPEG     JPEG  %s  %s   ",
-			dotile? "Tile ":"Image", dotile? "Tile ":"Image");
-		if(doyuv) printf("Encode  ");
-		printf("Comp    Comp    Decomp  ");
-		if(doyuv) printf("Decode");
-		printf("\n");
-		printf("Format     Subsamp  Qual  Width  Height  ");
-		if(doyuv) printf("Perf    ");
-		printf("Perf    Ratio   Perf    ");
-		if(doyuv) printf("Perf");
-		printf("\n\n");
-	}
+        if (temp >= 1) yuvPad = temp;
+      } else if (!strcasecmp(argv[i], "-subsamp") && i < argc - 1) {
+        i++;
+        if (toupper(argv[i][0]) == 'G') subsamp = TJSAMP_GRAY;
+        else {
+          int temp = atoi(argv[i]);
 
-	if(decomponly)
-	{
-		decompTest(argv[1]);
-		printf("\n");
-		goto bailout;
-	}
-	if(subsamp>=0 && subsamp<TJ_NUMSAMP)
-	{
-		for(i=maxqual; i>=minqual; i--)
-			fullTest(srcbuf, w, h, subsamp, i, argv[1]);
-		printf("\n");
-	}
-	else
-	{
-		if(pf!=TJPF_CMYK)
-		{
-			for(i=maxqual; i>=minqual; i--)
-				fullTest(srcbuf, w, h, TJSAMP_GRAY, i, argv[1]);
-			printf("\n");
-		}
-		for(i=maxqual; i>=minqual; i--)
-			fullTest(srcbuf, w, h, TJSAMP_420, i, argv[1]);
-		printf("\n");
-		for(i=maxqual; i>=minqual; i--)
-			fullTest(srcbuf, w, h, TJSAMP_422, i, argv[1]);
-		printf("\n");
-		for(i=maxqual; i>=minqual; i--)
-			fullTest(srcbuf, w, h, TJSAMP_444, i, argv[1]);
-		printf("\n");
-	}
+          switch (temp) {
+          case 444:  subsamp = TJSAMP_444;  break;
+          case 422:  subsamp = TJSAMP_422;  break;
+          case 440:  subsamp = TJSAMP_440;  break;
+          case 420:  subsamp = TJSAMP_420;  break;
+          case 411:  subsamp = TJSAMP_411;  break;
+          }
+        }
+      } else if (!strcasecmp(argv[i], "-componly"))
+        compOnly = 1;
+      else if (!strcasecmp(argv[i], "-nowrite"))
+        doWrite = 0;
+      else if (!strcasecmp(argv[i], "-stoponwarning"))
+        flags |= TJFLAG_STOPONWARNING;
+      else usage(argv[0]);
+    }
+  }
 
-	bailout:
-	if(srcbuf) free(srcbuf);
-	return retval;
+  if ((sf.num != 1 || sf.denom != 1) && doTile) {
+    printf("Disabling tiled compression/decompression tests, because those tests do not\n");
+    printf("work when scaled decompression is enabled.\n");
+    doTile = 0;
+  }
+
+  if ((flags & TJFLAG_NOREALLOC) == 0 && doTile) {
+    printf("Disabling tiled compression/decompression tests, because those tests do not\n");
+    printf("work when dynamic JPEG buffer allocation is enabled.\n\n");
+    doTile = 0;
+  }
+
+  if (!decompOnly) {
+    if ((srcBuf = tjLoadImage(argv[1], &w, 1, &h, &pf, flags)) == NULL)
+      _throwtjg("loading bitmap");
+    temp = strrchr(argv[1], '.');
+    if (temp != NULL) *temp = '\0';
+  }
+
+  if (quiet == 1 && !decompOnly) {
+    printf("All performance values in Mpixels/sec\n\n");
+    printf("Bitmap     JPEG     JPEG  %s  %s   ",
+           doTile ? "Tile " : "Image", doTile ? "Tile " : "Image");
+    if (doYUV) printf("Encode  ");
+    printf("Comp    Comp    Decomp  ");
+    if (doYUV) printf("Decode");
+    printf("\n");
+    printf("Format     Subsamp  Qual  Width  Height  ");
+    if (doYUV) printf("Perf    ");
+    printf("Perf    Ratio   Perf    ");
+    if (doYUV) printf("Perf");
+    printf("\n\n");
+  }
+
+  if (decompOnly) {
+    decompTest(argv[1]);
+    printf("\n");
+    goto bailout;
+  }
+  if (subsamp >= 0 && subsamp < TJ_NUMSAMP) {
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(srcBuf, w, h, subsamp, i, argv[1]);
+    printf("\n");
+  } else {
+    if (pf != TJPF_CMYK) {
+      for (i = maxQual; i >= minQual; i--)
+        fullTest(srcBuf, w, h, TJSAMP_GRAY, i, argv[1]);
+      printf("\n");
+    }
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(srcBuf, w, h, TJSAMP_420, i, argv[1]);
+    printf("\n");
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(srcBuf, w, h, TJSAMP_422, i, argv[1]);
+    printf("\n");
+    for (i = maxQual; i >= minQual; i--)
+      fullTest(srcBuf, w, h, TJSAMP_444, i, argv[1]);
+    printf("\n");
+  }
+
+bailout:
+  if (srcBuf) tjFree(srcBuf);
+  return retval;
 }
diff --git a/tjbenchtest.in b/tjbenchtest.in
index 22e15db..1c08b37 100755
--- a/tjbenchtest.in
+++ b/tjbenchtest.in
@@ -21,14 +21,15 @@
 
 EXT=bmp
 IMAGES="vgl_5674_0098.${EXT} vgl_6434_0018a.${EXT} vgl_6548_0026a.${EXT} nightshot_iso_100.${EXT}"
-IMGDIR=@srcdir@/testimages
+IMGDIR=@CMAKE_CURRENT_SOURCE_DIR@/testimages
 OUTDIR=`mktemp -d /tmp/__tjbenchtest_output.XXXXXX`
-EXEDIR=.
+EXEDIR=@CMAKE_CURRENT_BINARY_DIR@
 BMPARG=
 NSARG=
 YUVARG=
 ALLOC=0
 ALLOCARG=
+PROGARG=
 if [ "$EXT" = "bmp" ]; then BMPARG=-bmp; fi
 
 if [ -d $OUTDIR ]; then
@@ -64,25 +65,28 @@
 		ALLOCARG=-alloc
 		ALLOC=1
 		;;
+	-progressive)
+		PROGARG=-progressive
+		;;
 	esac
 	shift
 done
 
-exec >$EXEDIR/tjbenchtest$YUVARG$ALLOCARG.log
+exec >$EXEDIR/tjbenchtest$YUVARG$ALLOCARG$PROGARG.log
 
 # Standard tests
 for image in $IMAGES; do
 
 	cp $IMGDIR/$image $OUTDIR
 	basename=`basename $image .${EXT}`
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct int -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.${EXT}
 	for samp in GRAY 420 422 444; do
 		runme $EXEDIR/djpeg -rgb $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_default_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
 		runme $EXEDIR/djpeg -dct fast -rgb $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_fast_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
@@ -96,7 +100,7 @@
 
 	# Compression
 	for dct in accurate fast; do
-		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG $ALLOCARG
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG $ALLOCARG $PROGARG
 		for samp in GRAY 420 422 444; do
 			runme cmp $OUTDIR/${basename}_${samp}_Q95.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
 		done
@@ -109,7 +113,7 @@
 		fi
 
 		# Tiled compression & decompression
-		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG $PROGARG
 		for samp in GRAY 444; do
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
@@ -122,7 +126,7 @@
 				done
 			fi
 		done
-		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG
+		runme $EXEDIR/tjbench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG $PROGARG
 		for samp in 420 422; do
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
@@ -138,7 +142,7 @@
 
 		# Tiled decompression
 		for samp in GRAY 444; do
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $ALLOCARG $PROGARG
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_djpeg.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
@@ -151,7 +155,7 @@
 			fi
 		done
 		for samp in 420 422; do
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $ALLOCARG $PROGARG
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
@@ -167,10 +171,10 @@
 
 	# Scaled decompression
 	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-		scalearg=`echo $scale | sed s@_@/@g`
+		scalearg=`echo $scale | sed 's/\_/\//g'`
 		for samp in GRAY 420 422 444; do
 			runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.${EXT} $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG $PROGARG
 			runme cmp $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT} $OUTDIR/${basename}_${samp}_${scale}_djpeg.${EXT}
 			rm $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT}
 		done
@@ -189,7 +193,7 @@
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444; do
 			runme $EXEDIR/djpeg -rgb $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG $PROGARG
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
@@ -203,7 +207,7 @@
 		done
 		for samp in 420 422; do
 			runme $EXEDIR/djpeg -nosmooth -rgb $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG $ALLOCARG $PROGARG
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
@@ -220,7 +224,7 @@
 	# Grayscale transform
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
-			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG $ALLOCARG
+			runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG $ALLOCARG $PROGARG
 			if [ $ALLOC = 1 ]; then
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_full.${EXT} $OUTDIR/${basename}_GRAY_${xform}_jpegtran.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_full.${EXT}
@@ -238,9 +242,9 @@
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
 			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-				scalearg=`echo $scale | sed s@_@/@g`
+				scalearg=`echo $scale | sed 's/\_/\//g'`
 				runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG $BMPARG -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.${EXT} $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-				runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG
+				runme $EXEDIR/tjbench $OUTDIR/${basename}_${samp}_Q95.jpg $BMPARG -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $ALLOCARG $PROGARG
 				runme cmp $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT} $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.${EXT}
 				rm $OUTDIR/${basename}_${samp}_Q95_${scale}.${EXT}
 			done
diff --git a/tjbenchtest.java.in b/tjbenchtest.java.in
index 0fd2896..689561d 100755
--- a/tjbenchtest.java.in
+++ b/tjbenchtest.java.in
@@ -16,25 +16,28 @@
 runme()
 {
 	echo \*\*\* $*
-	$*
+	"$@"
 }
 
 IMAGES="vgl_5674_0098.bmp vgl_6434_0018a.bmp vgl_6548_0026a.bmp nightshot_iso_100.bmp"
-IMGDIR=@srcdir@/testimages
+IMGDIR=@CMAKE_CURRENT_SOURCE_DIR@/testimages
 OUTDIR=`mktemp -d /tmp/__tjbenchtest_java_output.XXXXXX`
-EXEDIR=.
-JAVA="@JAVA@ -cp java/turbojpeg.jar -Djava.library.path=.libs"
+EXEDIR=@CMAKE_CURRENT_BINARY_DIR@
+JAVA="@Java_JAVA_EXECUTABLE@"
+JAVAARGS="-cp $EXEDIR/java/turbojpeg.jar -Djava.library.path=$EXEDIR"
 BMPARG=
 NSARG=
 YUVARG=
+PROGARG=
 
 if [ -d $OUTDIR ]; then
 	rm -rf $OUTDIR
 fi
 mkdir -p $OUTDIR
 
-if [ $# -gt 0 ]; then
-	if [ "$1" = "-yuv" ]; then
+while [ $# -gt 0 ]; do
+	case "$1" in
+	-yuv)
 		NSARG=-nosmooth
 		YUVARG=-yuv
 
@@ -55,24 +58,29 @@
 # phenomenon is not yet fully understood but is also believed to be some sort
 # of round-off error.)
 		IMAGES="vgl_6548_0026a.bmp"
-	fi
-fi
+		;;
+	-progressive)
+		PROGARG=-progressive
+		;;
+	esac
+	shift
+done
 
-exec >$EXEDIR/tjbenchtest-java$YUVARG.log
+exec >$EXEDIR/tjbenchtest-java$YUVARG$PROGARG.log
 
 # Standard tests
 for image in $IMAGES; do
 
 	cp $IMGDIR/$image $OUTDIR
 	basename=`basename $image .bmp`
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct int -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
-	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast $PROGARG -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int $PROGARG -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
 	for samp in GRAY 420 422 444; do
 		runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_default_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
 		runme $EXEDIR/djpeg -dct fast -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_fast_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
@@ -86,7 +94,7 @@
 
 	# Compression
 	for dct in accurate fast; do
-		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG
+		runme "$JAVA" $JAVAARGS TJBench $OUTDIR/$image 95 -rgb -quiet -benchtime 0.01 -warmup 0 -${dct}dct $YUVARG $PROGARG
 		for samp in GRAY 420 422 444; do
 			runme cmp $OUTDIR/${basename}_${samp}_Q95.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
 		done
@@ -99,7 +107,7 @@
 		fi
 
 		# Tiled compression & decompression
-		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG
+		runme "$JAVA" $JAVAARGS TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $PROGARG
 		for samp in GRAY 444; do
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
@@ -107,7 +115,7 @@
 				rm $i
 			done
 		done
-		runme $JAVA TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG
+		runme "$JAVA" $JAVAARGS TJBench $OUTDIR/$image 95 -rgb -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $PROGARG
 		for samp in 420 422; do
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
@@ -118,7 +126,7 @@
 
 		# Tiled decompression
 		for samp in GRAY 444; do
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 ${dctarg} $YUVARG $PROGARG
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
 				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
@@ -126,7 +134,7 @@
 			done
 		done
 		for samp in 420 422; do
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample ${dctarg} $YUVARG $PROGARG
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
 				runme cmp $i -i 54:54 $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
@@ -137,10 +145,10 @@
 
 	# Scaled decompression
 	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-		scalearg=`echo $scale | sed s@_@/@g`
+		scalearg=`echo $scale | sed 's/\_/\//g'`
 		for samp in GRAY 420 422 444; do
 			runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG -bmp -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $PROGARG
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
 			rm $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp
 		done
@@ -159,7 +167,7 @@
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444; do
 			runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 $YUVARG $PROGARG
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
 				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
@@ -168,7 +176,7 @@
 		done
 		for samp in 420 422; do
 			runme $EXEDIR/djpeg -nosmooth -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -fastupsample $YUVARG $PROGARG
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
 				runme cmp -i 54:54 $i $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
@@ -180,7 +188,7 @@
 	# Grayscale transform
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
-			runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG
+			runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -tile -quiet -benchtime 0.01 -warmup 0 -grayscale $YUVARG $PROGARG
 			for i in $OUTDIR/${basename}_${samp}_Q95_[0-9]*[0-9]x[0-9]*[0-9].bmp \
 				$OUTDIR/${basename}_${samp}_Q95_full.bmp; do
 				runme cmp -i 54:54 $i $OUTDIR/${basename}_GRAY_${xform}_jpegtran.bmp
@@ -193,9 +201,9 @@
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
 			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-				scalearg=`echo $scale | sed s@_@/@g`
+				scalearg=`echo $scale | sed 's/\_/\//g'`
 				runme $EXEDIR/djpeg -rgb -scale ${scalearg} $NSARG -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-				runme $JAVA TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG
+				runme "$JAVA" $JAVAARGS TJBench $OUTDIR/${basename}_${samp}_Q95.jpg -$xform -scale ${scalearg} -quiet -benchtime 0.01 -warmup 0 $YUVARG $PROGARG
 				runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
 				rm $OUTDIR/${basename}_${samp}_Q95_${scale}.bmp
 			done
diff --git a/tjexample.c b/tjexample.c
new file mode 100644
index 0000000..6bdd792
--- /dev/null
+++ b/tjexample.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C)2011-2012, 2014-2015, 2017 D. R. Commander.
+ *                                         All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program demonstrates how to compress, decompress, and transform JPEG
+ * images using the TurboJPEG C API
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <turbojpeg.h>
+
+
+#ifdef _WIN32
+#define strcasecmp stricmp
+#define strncasecmp strnicmp
+#endif
+
+/* Print the failing line and message, set retval = -1, then goto bailout. */
+#define _throw(action, message) { \
+  printf("ERROR in line %d while %s:\n%s\n", __LINE__, action, message); \
+  retval = -1;  goto bailout; \
+}
+/* Same, with the message from tjGetErrorStr2() or strerror(errno). */
+#define _throwtj(action) _throw(action, tjGetErrorStr2(tjInstance))
+#define _throwunix(action) _throw(action, strerror(errno))
+
+#define DEFAULT_SUBSAMP TJSAMP_444
+#define DEFAULT_QUALITY 95
+
+/* Display names; usage() indexes subsampName by a TJSAMP_* value. */
+const char *subsampName[TJ_NUMSAMP] = {
+  "4:4:4", "4:2:2", "4:2:0", "Grayscale", "4:4:0", "4:1:1"
+};
+
+const char *colorspaceName[TJ_NUMCS] = {
+  "RGB", "YCbCr", "GRAY", "CMYK", "YCCK"
+};
+
+tjscalingfactor *scalingFactors = NULL;
+int numScalingFactors = 0;
+
+
+/* DCT filter example.  Negating every coefficient yields an image negative. */
+
+int customFilter(short *coeffs, tjregion arrayRegion, tjregion planeRegion,
+                 int componentIndex, int transformIndex,
+                 tjtransform *transform)
+{
+  int numCoeffs = arrayRegion.w * arrayRegion.h, k;
+
+  for (k = 0; k < numCoeffs; k++)
+    coeffs[k] = -coeffs[k];
+
+  return 0;
+}
+
+
+/* Print the command-line help text to stdout, then exit with status 1. */
+void usage(char *programName)
+{
+  int i;
+
+  printf("\nUSAGE: %s <Input image> <Output image> [options]\n\n", programName);
+
+  printf("Input and output images can be in Windows BMP or PBMPLUS (PPM/PGM) format.  If\n");
+  printf("either filename ends in a .jpg extension, then the TurboJPEG API will be used\n");
+  printf("to compress or decompress the image.\n\n");
+
+  printf("Compression Options (used if the output image is a JPEG image)\n");
+  printf("--------------------------------------------------------------\n\n");
+
+  printf("-subsamp <444|422|420|gray> = Apply this level of chrominance subsampling when\n");
+  printf("     compressing the output image.  The default is to use the same level of\n");
+  printf("     subsampling as in the input image, if the input image is also a JPEG\n");
+  printf("     image, or to use grayscale if the input image is a grayscale non-JPEG\n");
+  printf("     image, or to use %s subsampling otherwise.\n\n",
+         subsampName[DEFAULT_SUBSAMP]);
+
+  printf("-q <1-100> = Compress the output image with this JPEG quality level\n");
+  printf("     (default = %d).\n\n", DEFAULT_QUALITY);
+
+  printf("Decompression Options (used if the input image is a JPEG image)\n");
+  printf("---------------------------------------------------------------\n\n");
+
+  printf("-scale M/N = Scale the input image by a factor of M/N when decompressing it.\n");
+  printf("(M/N = ");
+  for (i = 0; i < numScalingFactors; i++) {  /* "a or b" / "a, b, or c" */
+    printf("%d/%d", scalingFactors[i].num, scalingFactors[i].denom);
+    if (numScalingFactors == 2 && i != numScalingFactors - 1)
+      printf(" or ");
+    else if (numScalingFactors > 2) {
+      if (i != numScalingFactors - 1)
+        printf(", ");
+      if (i == numScalingFactors - 2)
+        printf("or ");
+    }
+  }
+  printf(")\n\n");
+
+  printf("-hflip, -vflip, -transpose, -transverse, -rot90, -rot180, -rot270 =\n");
+  printf("     Perform one of these lossless transform operations on the input image\n");
+  printf("     prior to decompressing it (these options are mutually exclusive.)\n\n");
+
+  printf("-grayscale = Perform lossless grayscale conversion on the input image prior\n");
+  printf("     to decompressing it (can be combined with the other transform operations\n");
+  printf("     above.)\n\n");
+
+  printf("-crop WxH+X+Y = Perform lossless cropping on the input image prior to\n");
+  printf("     decompressing it.  X and Y specify the upper left corner of the cropping\n");
+  printf("     region, and W and H specify the width and height of the cropping region.\n");
+  printf("     X and Y must be evenly divisible by the MCU block size (8x8 if the input\n");
+  printf("     image was compressed using no subsampling or grayscale, 16x8 if it was\n");
+  printf("     compressed using 4:2:2 subsampling, or 16x16 if it was compressed using\n");
+  printf("     4:2:0 subsampling.)\n\n");
+
+  printf("General Options\n");
+  printf("---------------\n\n");
+
+  printf("-fastupsample = Use the fastest chrominance upsampling algorithm available in\n");
+  printf("     the underlying codec.\n\n");
+
+  printf("-fastdct = Use the fastest DCT/IDCT algorithms available in the underlying\n");
+  printf("     codec.\n\n");
+
+  printf("-accuratedct = Use the most accurate DCT/IDCT algorithms available in the\n");
+  printf("     underlying codec.\n\n");
+
+  exit(1);
+}
+
+
+/*
+ * Example driver: converts the image named by argv[1] into the file named by
+ * argv[2].  Each file's extension selects the JPEG or non-JPEG code path, and
+ * the options after the two file names select transform, scaling, and
+ * compression settings.  All resources are released at the bailout label.
+ */
+int main(int argc, char **argv)
+{
+  tjscalingfactor scalingFactor = { 1, 1 };
+  int outSubsamp = -1, outQual = -1;
+  tjtransform xform;
+  int flags = 0;
+  int width, height;
+  char *inFormat, *outFormat;
+  FILE *jpegFile = NULL;
+  unsigned char *imgBuf = NULL, *jpegBuf = NULL;
+  int retval = 0, i, pixelFormat = TJPF_UNKNOWN;
+  tjhandle tjInstance = NULL;
+
+  if ((scalingFactors = tjGetScalingFactors(&numScalingFactors)) == NULL)
+    _throwtj("getting scaling factors");
+  memset(&xform, 0, sizeof(tjtransform));
+
+  if (argc < 3) usage(argv[0]);
+
+  /* Parse the options that follow the input and output file names. */
+  for (i = 3; i < argc; i++) {
+    if (!strncasecmp(argv[i], "-sc", 3) && i < argc - 1) {
+      int match = 0, temp1 = 0, temp2 = 0, j;
+
+      if (sscanf(argv[++i], "%d/%d", &temp1, &temp2) < 2 || temp2 == 0)
+        usage(argv[0]);
+      for (j = 0; j < numScalingFactors; j++) {
+        if ((double)temp1 / (double)temp2 == (double)scalingFactors[j].num /
+                                             (double)scalingFactors[j].denom) {
+          scalingFactor = scalingFactors[j];
+          match = 1;
+          break;
+        }
+      }
+      if (match != 1) usage(argv[0]);
+    } else if (!strncasecmp(argv[i], "-su", 3) && i < argc - 1) {
+      i++;
+      if (!strncasecmp(argv[i], "g", 1))
+        outSubsamp = TJSAMP_GRAY;
+      else if (!strcasecmp(argv[i], "444"))
+        outSubsamp = TJSAMP_444;
+      else if (!strcasecmp(argv[i], "422"))
+        outSubsamp = TJSAMP_422;
+      else if (!strcasecmp(argv[i], "420"))
+        outSubsamp = TJSAMP_420;
+      else
+        usage(argv[0]);
+    } else if (!strncasecmp(argv[i], "-q", 2) && i < argc - 1) {
+      outQual = atoi(argv[++i]);
+      if (outQual < 1 || outQual > 100) usage(argv[0]);
+    } else if (!strncasecmp(argv[i], "-g", 2))
+      xform.options |= TJXOPT_GRAY;
+    else if (!strcasecmp(argv[i], "-hflip"))
+      xform.op = TJXOP_HFLIP;
+    else if (!strcasecmp(argv[i], "-vflip"))
+      xform.op = TJXOP_VFLIP;
+    else if (!strcasecmp(argv[i], "-transpose"))
+      xform.op = TJXOP_TRANSPOSE;
+    else if (!strcasecmp(argv[i], "-transverse"))
+      xform.op = TJXOP_TRANSVERSE;
+    else if (!strcasecmp(argv[i], "-rot90"))
+      xform.op = TJXOP_ROT90;
+    else if (!strcasecmp(argv[i], "-rot180"))
+      xform.op = TJXOP_ROT180;
+    else if (!strcasecmp(argv[i], "-rot270"))
+      xform.op = TJXOP_ROT270;
+    else if (!strcasecmp(argv[i], "-custom"))
+      xform.customFilter = customFilter;
+    else if (!strncasecmp(argv[i], "-c", 2) && i < argc - 1) {
+      if (sscanf(argv[++i], "%dx%d+%d+%d", &xform.r.w, &xform.r.h, &xform.r.x,
+                 &xform.r.y) < 4 ||
+          xform.r.x < 0 || xform.r.y < 0 || xform.r.w < 1 || xform.r.h < 1)
+        usage(argv[0]);
+      xform.options |= TJXOPT_CROP;
+    } else if (!strcasecmp(argv[i], "-fastupsample")) {
+      printf("Using fast upsampling code\n");
+      flags |= TJFLAG_FASTUPSAMPLE;
+    } else if (!strcasecmp(argv[i], "-fastdct")) {
+      printf("Using fastest DCT/IDCT algorithm\n");
+      flags |= TJFLAG_FASTDCT;
+    } else if (!strcasecmp(argv[i], "-accuratedct")) {
+      printf("Using most accurate DCT/IDCT algorithm\n");
+      flags |= TJFLAG_ACCURATEDCT;
+    } else usage(argv[0]);
+  }
+
+  /* Determine input and output image formats based on file extensions. */
+  inFormat = strrchr(argv[1], '.');
+  outFormat = strrchr(argv[2], '.');
+  if (inFormat == NULL || outFormat == NULL || strlen(inFormat) < 2 ||
+      strlen(outFormat) < 2)
+    usage(argv[0]);
+  inFormat = &inFormat[1];
+  outFormat = &outFormat[1];
+
+  if (!strcasecmp(inFormat, "jpg")) {
+    /* Input image is a JPEG image.  Decompress and/or transform it. */
+    long size;
+    int inSubsamp, inColorspace;
+    int doTransform = (xform.op != TJXOP_NONE || xform.options != 0 ||
+                       xform.customFilter != NULL);
+    unsigned long jpegSize;
+
+    /* Read the JPEG file into memory. */
+    if ((jpegFile = fopen(argv[1], "rb")) == NULL)
+      _throwunix("opening input file");
+    if (fseek(jpegFile, 0, SEEK_END) < 0 || ((size = ftell(jpegFile)) < 0) ||
+        fseek(jpegFile, 0, SEEK_SET) < 0)
+      _throwunix("determining input file size");
+    if (size == 0)
+      _throw("determining input file size", "Input file contains no data");
+    jpegSize = (unsigned long)size;
+    if ((jpegBuf = (unsigned char *)tjAlloc(jpegSize)) == NULL)
+      _throwunix("allocating JPEG buffer");
+    if (fread(jpegBuf, jpegSize, 1, jpegFile) < 1)
+      _throwunix("reading input file");
+    fclose(jpegFile);  jpegFile = NULL;
+
+    if (doTransform) {
+      /* Transform it. */
+      unsigned char *dstBuf = NULL;  /* Dynamically allocate the JPEG buffer */
+      unsigned long dstSize = 0;
+
+      if ((tjInstance = tjInitTransform()) == NULL)
+        _throwtj("initializing transformer");
+      xform.options |= TJXOPT_TRIM;
+      if (tjTransform(tjInstance, jpegBuf, jpegSize, 1, &dstBuf, &dstSize,
+                      &xform, flags) < 0)
+        _throwtj("transforming input image");
+      tjFree(jpegBuf);
+      jpegBuf = dstBuf;
+      jpegSize = dstSize;
+    } else {
+      if ((tjInstance = tjInitDecompress()) == NULL)
+        _throwtj("initializing decompressor");
+    }
+
+    if (tjDecompressHeader3(tjInstance, jpegBuf, jpegSize, &width, &height,
+                            &inSubsamp, &inColorspace) < 0)
+      _throwtj("reading JPEG header");
+
+    printf("%s Image:  %d x %d pixels, %s subsampling, %s colorspace\n",
+           (doTransform ? "Transformed" : "Input"), width, height,
+           subsampName[inSubsamp], colorspaceName[inColorspace]);
+
+    if (!strcasecmp(outFormat, "jpg") && doTransform &&
+        scalingFactor.num == 1 && scalingFactor.denom == 1 && outSubsamp < 0 &&
+        outQual < 0) {
+      /* Input image has been transformed, and no re-compression options
+         have been selected.  Write the transformed image to disk and exit. */
+      if ((jpegFile = fopen(argv[2], "wb")) == NULL)
+        _throwunix("opening output file");
+      if (fwrite(jpegBuf, jpegSize, 1, jpegFile) < 1)
+        _throwunix("writing output file");
+      fclose(jpegFile);  jpegFile = NULL;
+      goto bailout;
+    }
+
+    /* Scaling and/or a non-JPEG output image format and/or compression options
+       have been selected, so we need to decompress the input/transformed
+       image. */
+    width = TJSCALED(width, scalingFactor);
+    height = TJSCALED(height, scalingFactor);
+    if (outSubsamp < 0) outSubsamp = inSubsamp;
+
+    pixelFormat = TJPF_BGRX;
+    /* NOTE(review): width * height * pixel size is computed in int and can
+       overflow for very large images -- TODO add a range check before use. */
+    if ((imgBuf = (unsigned char *)tjAlloc(width * height *
+                                           tjPixelSize[pixelFormat])) == NULL)
+      _throwunix("allocating uncompressed image buffer");
+
+    if (tjDecompress2(tjInstance, jpegBuf, jpegSize, imgBuf, width, 0, height,
+                      pixelFormat, flags) < 0)
+      _throwtj("decompressing JPEG image");
+    tjFree(jpegBuf);  jpegBuf = NULL;
+    tjDestroy(tjInstance);  tjInstance = NULL;
+  } else {
+    /* Input image is not a JPEG image.  Load it into memory. */
+    if ((imgBuf = tjLoadImage(argv[1], &width, 1, &height, &pixelFormat,
+                              0)) == NULL)
+      _throwtj("loading input image");
+    if (outSubsamp < 0) {
+      outSubsamp = (pixelFormat == TJPF_GRAY) ? TJSAMP_GRAY : TJSAMP_444;
+    }
+    printf("Input Image:  %d x %d pixels\n", width, height);
+  }
+
+  printf("Output Image (%s):  %d x %d pixels", outFormat, width, height);
+
+  if (!strcasecmp(outFormat, "jpg")) {
+    /* Output image format is JPEG.  Compress the uncompressed image. */
+    /* Reuse the function-scope jpegBuf (NULL here) so bailout can free it. */
+    unsigned long jpegSize = 0;
+
+    if (outQual < 0)
+      outQual = DEFAULT_QUALITY;
+    printf(", %s subsampling, quality = %d\n", subsampName[outSubsamp],
+           outQual);
+
+    if ((tjInstance = tjInitCompress()) == NULL)
+      _throwtj("initializing compressor");
+    if (tjCompress2(tjInstance, imgBuf, width, 0, height, pixelFormat,
+                    &jpegBuf, &jpegSize, outSubsamp, outQual, flags) < 0)
+      _throwtj("compressing image");
+    tjDestroy(tjInstance);  tjInstance = NULL;
+
+    /* Write the JPEG image to disk. */
+    if ((jpegFile = fopen(argv[2], "wb")) == NULL)
+      _throwunix("opening output file");
+    if (fwrite(jpegBuf, jpegSize, 1, jpegFile) < 1)
+      _throwunix("writing output file");
+    fclose(jpegFile);  jpegFile = NULL;
+    tjFree(jpegBuf);  jpegBuf = NULL;
+  } else {
+    /* Output image format is not JPEG.  Save the uncompressed image
+       directly to disk. */
+    printf("\n");
+    if (tjSaveImage(argv[2], imgBuf, width, 0, height, pixelFormat, 0) < 0)
+      _throwtj("saving output image");
+  }
+
+bailout:
+  if (imgBuf) tjFree(imgBuf);
+  if (tjInstance) tjDestroy(tjInstance);
+  if (jpegBuf) tjFree(jpegBuf);
+  if (jpegFile) fclose(jpegFile);
+  return retval;
+}
diff --git a/tjexampletest.in b/tjexampletest.in
index 4cb9e9d..0d3047e 100755
--- a/tjexampletest.in
+++ b/tjexampletest.in
@@ -20,10 +20,9 @@
 }
 
 IMAGES="vgl_5674_0098.bmp vgl_6434_0018a.bmp vgl_6548_0026a.bmp nightshot_iso_100.bmp"
-IMGDIR=@srcdir@/testimages
+IMGDIR=@CMAKE_CURRENT_SOURCE_DIR@/testimages
 OUTDIR=`mktemp -d /tmp/__tjexampletest_output.XXXXXX`
-EXEDIR=.
-JAVA="@JAVA@ -cp java/turbojpeg.jar -Djava.library.path=.libs"
+EXEDIR=@CMAKE_CURRENT_BINARY_DIR@
 
 if [ -d $OUTDIR ]; then
 	rm -rf $OUTDIR
@@ -58,7 +57,7 @@
 	# Compression
 	for dct in fast accurate; do
 		for samp in GRAY 420 422 444; do
-			runme $JAVA TJExample $OUTDIR/$image $OUTDIR/${basename}_${samp}_${dct}.jpg -q 95 -samp ${samp} -${dct}dct
+			runme $EXEDIR/tjexample $OUTDIR/$image $OUTDIR/${basename}_${samp}_${dct}.jpg -q 95 -subsamp ${samp} -${dct}dct
 			runme cmp $OUTDIR/${basename}_${samp}_${dct}.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
 		done
 	done
@@ -72,12 +71,12 @@
 			dctarg=
 		fi
 		for samp in GRAY 420 422 444; do
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}.bmp ${dctarg}
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}.bmp ${dctarg}
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}.bmp $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
 			rm $OUTDIR/${basename}_${samp}_${dct}.bmp
 		done
 		for samp in 420 422; do
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp -fastupsample ${dctarg}
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp -fastupsample ${dctarg}
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
 			rm $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp
 		done
@@ -85,10 +84,10 @@
 
 	# Scaled decompression
 	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-		scalearg=`echo $scale | sed s@_@/@g`
+		scalearg=`echo $scale | sed 's/\_/\//g'`
 		for samp in GRAY 420 422 444; do
 			runme $EXEDIR/djpeg -rgb -bmp -scale ${scalearg} -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${scale}.bmp -scale ${scalearg}
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${scale}.bmp -scale ${scalearg}
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${scale}.bmp $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
 			rm $OUTDIR/${basename}_${samp}_${scale}.bmp
 		done
@@ -106,16 +105,16 @@
 	done
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 420 422 444; do
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -crop 16,16,70x60
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -crop 70x60+16+16
 			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
 			runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 16,16,70x60
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 70x60+16+16
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
 			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
 		done
 		for samp in 420 422; do
 			runme $EXEDIR/djpeg -nosmooth -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 16,16,70x60 -fastupsample
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 70x60+16+16 -fastupsample
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
 			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
 		done
@@ -124,9 +123,9 @@
 	# Grayscale transform
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -grayscale -crop 16,16,70x60
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -grayscale -crop 70x60+16+16
 			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_GRAY_${xform}_jpegtran.jpg
-			runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -grayscale -crop 16,16,70x60
+			runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -grayscale -crop 70x60+16+16
 			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_GRAY_${xform}_jpegtran.bmp
 			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
 		done
@@ -136,9 +135,9 @@
 	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
 		for samp in GRAY 444 422 420; do
 			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
-				scalearg=`echo $scale | sed s@_@/@g`
+				scalearg=`echo $scale | sed 's/\_/\//g'`
 				runme $EXEDIR/djpeg -rgb -bmp -scale ${scalearg} -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
-				runme $JAVA TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp -$xform -scale ${scalearg} -crop 16,16,70x60
+				runme $EXEDIR/tjexample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp -$xform -scale ${scalearg} -crop 70x60+16+16
 				runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
 				rm $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp
 			done
diff --git a/tjexampletest.java.in b/tjexampletest.java.in
new file mode 100755
index 0000000..d4b63bc
--- /dev/null
+++ b/tjexampletest.java.in
@@ -0,0 +1,151 @@
+#!/bin/bash
+# Runs the TJExample Java program and compares its output with cjpeg/djpeg/jpegtran.
+set -u
+set -e
+trap onexit INT
+trap onexit TERM
+trap onexit EXIT
+# Trap handler: removes the output directory; safe to run before OUTDIR is set.
+onexit()
+{
+	if [ -d "${OUTDIR:-}" ]; then
+		rm -rf "$OUTDIR"
+	fi
+}
+# Echo the command line prefixed with "***", then run it with its arguments intact.
+runme()
+{
+	echo '***' $*
+	"$@"
+}
+# Test images, tool locations, and scratch directory (@...@ values are presumably substituted at configure time).
+IMAGES="vgl_5674_0098.bmp vgl_6434_0018a.bmp vgl_6548_0026a.bmp nightshot_iso_100.bmp"
+IMGDIR=@CMAKE_CURRENT_SOURCE_DIR@/testimages
+OUTDIR=`mktemp -d /tmp/__tjexampletest_java_output.XXXXXX`
+EXEDIR=@CMAKE_CURRENT_BINARY_DIR@
+JAVA="@Java_JAVA_EXECUTABLE@"
+JAVAARGS="-cp $EXEDIR/java/turbojpeg.jar -Djava.library.path=$EXEDIR"
+# Start from an empty output directory.
+if [ -d $OUTDIR ]; then
+	rm -rf $OUTDIR
+fi
+mkdir -p $OUTDIR
+# From here on, standard output is redirected to the log file in $EXEDIR.
+exec >$EXEDIR/tjexampletest-java.log
+# For each test image, compare TJExample output with cjpeg/djpeg/jpegtran output.
+for image in $IMAGES; do
+	# Reference files: JPEGs from cjpeg (fast and int DCT), then BMPs from djpeg.
+	cp $IMGDIR/$image $OUTDIR
+	basename=`basename $image .bmp`
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -grayscale -outfile $OUTDIR/${basename}_GRAY_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x2 -outfile $OUTDIR/${basename}_420_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 2x1 -outfile $OUTDIR/${basename}_422_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct fast -sample 1x1 -outfile $OUTDIR/${basename}_444_fast_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -grayscale -outfile $OUTDIR/${basename}_GRAY_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x2 -outfile $OUTDIR/${basename}_420_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 2x1 -outfile $OUTDIR/${basename}_422_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	runme $EXEDIR/cjpeg -quality 95 -dct int -sample 1x1 -outfile $OUTDIR/${basename}_444_accurate_cjpeg.jpg $IMGDIR/${basename}.bmp
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_default_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_fast_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_accurate_djpeg.bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+	for samp in 420 422; do
+		runme $EXEDIR/djpeg -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_default_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct fast -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_fast_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+		runme $EXEDIR/djpeg -dct int -nosmooth -bmp -outfile $OUTDIR/${basename}_${samp}_accurate_nosmooth_djpeg.bmp $OUTDIR/${basename}_${samp}_accurate_cjpeg.jpg
+	done
+
+	# Compression: TJExample's JPEG must be byte-identical (cmp) to the cjpeg reference.
+	for dct in fast accurate; do
+		for samp in GRAY 420 422 444; do
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/$image $OUTDIR/${basename}_${samp}_${dct}.jpg -q 95 -subsamp ${samp} -${dct}dct
+			runme cmp $OUTDIR/${basename}_${samp}_${dct}.jpg $OUTDIR/${basename}_${samp}_${dct}_cjpeg.jpg
+		done
+	done
+
+	# Decompression: BMPs are compared with djpeg's; cmp -i 54:54 skips the first 54 bytes of each file (presumably the BMP header).
+	for dct in fast accurate default; do
+		srcdct=${dct}
+		dctarg=-${dct}dct
+		if [ "${dct}" = "default" ]; then
+			srcdct=fast
+			dctarg=
+		fi
+		for samp in GRAY 420 422 444; do
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}.bmp ${dctarg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}.bmp $OUTDIR/${basename}_${samp}_${dct}_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${dct}.bmp
+		done
+		for samp in 420 422; do
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_${srcdct}.jpg $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp -fastupsample ${dctarg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp $OUTDIR/${basename}_${samp}_${dct}_nosmooth_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${dct}_nosmooth.bmp
+		done
+	done
+
+	# Scaled decompression: the same BMP comparison for each scaling factor (N_D is rewritten to N/D for -scale).
+	for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+		scalearg=`echo $scale | sed 's/\_/\//g'`
+		for samp in GRAY 420 422 444; do
+			runme $EXEDIR/djpeg -rgb -bmp -scale ${scalearg} -outfile $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp $OUTDIR/${basename}_${samp}_fast_cjpeg.jpg
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${scale}.bmp -scale ${scalearg}
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${scale}.bmp $OUTDIR/${basename}_${samp}_${scale}_djpeg.bmp
+			rm $OUTDIR/${basename}_${samp}_${scale}.bmp
+		done
+	done
+
+	# Transforms: jpegtran builds the cropped and transformed references, then TJExample output is compared with them.
+	for samp in GRAY 420 422 444; do
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -flip horizontal -trim -outfile $OUTDIR/${basename}_${samp}_hflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -flip vertical -trim -outfile $OUTDIR/${basename}_${samp}_vflip_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -transpose -trim -outfile $OUTDIR/${basename}_${samp}_transpose_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -transverse -trim -outfile $OUTDIR/${basename}_${samp}_transverse_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -rotate 90 -trim -outfile $OUTDIR/${basename}_${samp}_rot90_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -rotate 180 -trim -outfile $OUTDIR/${basename}_${samp}_rot180_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+		runme $EXEDIR/jpegtran -crop 70x60+16+16 -rotate 270 -trim -outfile $OUTDIR/${basename}_${samp}_rot270_jpegtran.jpg $OUTDIR/${basename}_${samp}_fast.jpg
+	done
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 420 422 444; do
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -crop 70x60+16+16
+			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme $EXEDIR/djpeg -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 70x60+16+16
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+		for samp in 420 422; do
+			runme $EXEDIR/djpeg -nosmooth -rgb -bmp -outfile $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -crop 70x60+16+16 -fastupsample
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+	done
+
+	# Grayscale transform: output for every subsampling is compared with the GRAY jpegtran reference.
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.jpg -$xform -grayscale -crop 70x60+16+16
+			runme cmp $OUTDIR/${basename}_${samp}_${xform}.jpg $OUTDIR/${basename}_GRAY_${xform}_jpegtran.jpg
+			runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}.bmp -$xform -grayscale -crop 70x60+16+16
+			runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}.bmp $OUTDIR/${basename}_GRAY_${xform}_jpegtran.bmp
+			rm $OUTDIR/${basename}_${samp}_${xform}.bmp
+		done
+	done
+
+	# Transforms with scaling: djpeg decodes each jpegtran reference at every scale for comparison.
+	for xform in hflip vflip transpose transverse rot90 rot180 rot270; do
+		for samp in GRAY 444 422 420; do
+			for scale in 2_1 15_8 7_4 13_8 3_2 11_8 5_4 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8; do
+				scalearg=`echo $scale | sed 's/\_/\//g'`
+				runme $EXEDIR/djpeg -rgb -bmp -scale ${scalearg} -outfile $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp $OUTDIR/${basename}_${samp}_${xform}_jpegtran.jpg
+				runme "$JAVA" $JAVAARGS TJExample $OUTDIR/${basename}_${samp}_fast.jpg $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp -$xform -scale ${scalearg} -crop 70x60+16+16
+				runme cmp -i 54:54 $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp $OUTDIR/${basename}_${samp}_${xform}_${scale}_jpegtran.bmp
+				rm $OUTDIR/${basename}_${samp}_${xform}_${scale}.bmp
+			done
+		done
+	done
+
+done
+
+echo SUCCESS!
diff --git a/tjunittest.c b/tjunittest.c
index f793796..4f8d3d3 100644
--- a/tjunittest.c
+++ b/tjunittest.c
@@ -34,701 +34,859 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
-#include "./tjutil.h"
-#include "./turbojpeg.h"
+#include "tjutil.h"
+#include "turbojpeg.h"
+#include "md5/md5.h"
+#include "cmyk.h"
 #ifdef _WIN32
- #include <time.h>
- #define random() rand()
+#include <time.h>
+#define random() rand()
+#else
+#include <unistd.h>
 #endif
 
 
 void usage(char *progName)
 {
-	printf("\nUSAGE: %s [options]\n\n", progName);
-	printf("Options:\n");
-	printf("-yuv = test YUV encoding/decoding support\n");
-	printf("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
-	printf("            4-byte boundary\n");
-	printf("-alloc = test automatic buffer allocation\n\n");
-	exit(1);
+  printf("\nUSAGE: %s [options]\n\n", progName);
+  printf("Options:\n");
+  printf("-yuv = test YUV encoding/decoding support\n");
+  printf("-noyuvpad = do not pad each line of each Y, U, and V plane to the nearest\n");
+  printf("            4-byte boundary\n");
+  printf("-alloc = test automatic buffer allocation\n");
+  printf("-bmp = tjLoadImage()/tjSaveImage() unit test\n\n");
+  exit(1);
 }
 
 
-#define _throwtj() {printf("TurboJPEG ERROR:\n%s\n", tjGetErrorStr());  \
-	bailout();}
-#define _tj(f) {if((f)==-1) _throwtj();}
-#define _throw(m) {printf("ERROR: %s\n", m);  bailout();}
+#define _throwtj() { \
+  printf("TurboJPEG ERROR:\n%s\n", tjGetErrorStr()); \
+  bailout(); \
+}
+#define _tj(f) { if ((f) == -1) _throwtj(); }
+#define _throw(m) { printf("ERROR: %s\n", m);  bailout(); }
+#define _throwmd5(filename, md5sum, ref) { \
+  printf("\n%s has an MD5 sum of %s.\n   Should be %s.\n", filename, md5sum, \
+         ref); \
+  bailout(); \
+}
 
-const char *subNameLong[TJ_NUMSAMP]=
-{
-	"4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
+const char *subNameLong[TJ_NUMSAMP] = {
+  "4:4:4", "4:2:2", "4:2:0", "GRAY", "4:4:0", "4:1:1"
 };
-const char *subName[TJ_NUMSAMP]={"444", "422", "420", "GRAY", "440", "411"};
-
-const char *pixFormatStr[TJ_NUMPF]=
-{
-	"RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
-	"RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
+const char *subName[TJ_NUMSAMP] = {
+  "444", "422", "420", "GRAY", "440", "411"
 };
 
-const int alphaOffset[TJ_NUMPF] = {-1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1};
+const char *pixFormatStr[TJ_NUMPF] = {
+  "RGB", "BGR", "RGBX", "BGRX", "XBGR", "XRGB", "Grayscale",
+  "RGBA", "BGRA", "ABGR", "ARGB", "CMYK"
+};
 
-const int _3byteFormats[]={TJPF_RGB, TJPF_BGR};
-const int _4byteFormats[]={TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB,
-	TJPF_CMYK};
-const int _onlyGray[]={TJPF_GRAY};
-const int _onlyRGB[]={TJPF_RGB};
+const int _3byteFormats[] = { TJPF_RGB, TJPF_BGR };
+const int _4byteFormats[] = {
+  TJPF_RGBX, TJPF_BGRX, TJPF_XBGR, TJPF_XRGB, TJPF_CMYK
+};
+const int _onlyGray[] = { TJPF_GRAY };
+const int _onlyRGB[] = { TJPF_RGB };
 
-int doyuv=0, alloc=0, pad=4;
+int doYUV = 0, alloc = 0, pad = 4;
 
-int exitStatus=0;
-#define bailout() {exitStatus=-1;  goto bailout;}
+int exitStatus = 0;
+#define bailout() { exitStatus = -1;  goto bailout; }
 
 
 void initBuf(unsigned char *buf, int w, int h, int pf, int flags)
 {
-	int roffset=tjRedOffset[pf];
-	int goffset=tjGreenOffset[pf];
-	int boffset=tjBlueOffset[pf];
-	int ps=tjPixelSize[pf];
-	int index, row, col, halfway=16;
+  int roffset = tjRedOffset[pf];
+  int goffset = tjGreenOffset[pf];
+  int boffset = tjBlueOffset[pf];
+  int ps = tjPixelSize[pf];
+  int index, row, col, halfway = 16;
 
-	if(pf==TJPF_GRAY)
-	{
-		memset(buf, 0, w*h*ps);
-		for(row=0; row<h; row++)
-		{
-			for(col=0; col<w; col++)
-			{
-				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
-				else index=row*w+col;
-				if(((row/8)+(col/8))%2==0) buf[index]=(row<halfway)? 255:0;
-				else buf[index]=(row<halfway)? 76:226;
-			}
-		}
-	}
-	else if(pf==TJPF_CMYK)
-	{
-		memset(buf, 255, w*h*ps);
-		for(row=0; row<h; row++)
-		{
-			for(col=0; col<w; col++)
-			{
-				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
-				else index=row*w+col;
-				if(((row/8)+(col/8))%2==0)
-				{
-					if(row>=halfway) buf[index*ps+3]=0;
-				}
-				else
-				{
-					buf[index*ps+2]=0;
-					if(row<halfway) buf[index*ps+1]=0;
-				}
-			}
-		}
-	}
-	else
-	{
-		memset(buf, 0, w*h*ps);
-		for(row=0; row<h; row++)
-		{
-			for(col=0; col<w; col++)
-			{
-				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
-				else index=row*w+col;
-				if(((row/8)+(col/8))%2==0)
-				{
-					if(row<halfway)
-					{
-						buf[index*ps+roffset]=255;
-						buf[index*ps+goffset]=255;
-						buf[index*ps+boffset]=255;
-					}
-				}
-				else
-				{
-					buf[index*ps+roffset]=255;
-					if(row>=halfway) buf[index*ps+goffset]=255;
-				}
-			}
-		}
-	}
+  if (pf == TJPF_GRAY) {
+    memset(buf, 0, w * h * ps);
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        if (flags & TJFLAG_BOTTOMUP) index = (h - row - 1) * w + col;
+        else index = row * w + col;
+        if (((row / 8) + (col / 8)) % 2 == 0)
+          buf[index] = (row < halfway) ? 255 : 0;
+        else buf[index] = (row < halfway) ? 76 : 226;
+      }
+    }
+  } else if (pf == TJPF_CMYK) {
+    memset(buf, 255, w * h * ps);
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        if (flags & TJFLAG_BOTTOMUP) index = (h - row - 1) * w + col;
+        else index = row * w + col;
+        if (((row / 8) + (col / 8)) % 2 == 0) {
+          if (row >= halfway) buf[index * ps + 3] = 0;
+        } else {
+          buf[index * ps + 2] = 0;
+          if (row < halfway) buf[index * ps + 1] = 0;
+        }
+      }
+    }
+  } else {
+    memset(buf, 0, w * h * ps);
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        if (flags & TJFLAG_BOTTOMUP) index = (h - row - 1) * w + col;
+        else index = row * w + col;
+        if (((row / 8) + (col / 8)) % 2 == 0) {
+          if (row < halfway) {
+            buf[index * ps + roffset] = 255;
+            buf[index * ps + goffset] = 255;
+            buf[index * ps + boffset] = 255;
+          }
+        } else {
+          buf[index * ps + roffset] = 255;
+          if (row >= halfway) buf[index * ps + goffset] = 255;
+        }
+      }
+    }
+  }
 }
 
 
 #define checkval(v, cv) { \
-	if(v<cv-1 || v>cv+1) { \
-		printf("\nComp. %s at %d,%d should be %d, not %d\n",  \
-			#v, row, col, cv, v); \
-		retval=0;  exitStatus=-1;  goto bailout; \
-	}}
+  if (v < cv - 1 || v > cv + 1) { \
+    printf("\nComp. %s at %d,%d should be %d, not %d\n", #v, row, col, cv, \
+           v); \
+    retval = 0;  exitStatus = -1;  goto bailout; \
+  } \
+}
 
 #define checkval0(v) { \
-	if(v>1) { \
-		printf("\nComp. %s at %d,%d should be 0, not %d\n", #v, row, col, v); \
-		retval=0;  exitStatus=-1;  goto bailout; \
-	}}
+  if (v > 1) { \
+    printf("\nComp. %s at %d,%d should be 0, not %d\n", #v, row, col, v); \
+    retval = 0;  exitStatus = -1;  goto bailout; \
+  } \
+}
 
 #define checkval255(v) { \
-	if(v<254) { \
-		printf("\nComp. %s at %d,%d should be 255, not %d\n", #v, row, col, v); \
-		retval=0;  exitStatus=-1;  goto bailout; \
-	}}
-
-
-int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
-	tjscalingfactor sf, int flags)
-{
-	int roffset=tjRedOffset[pf];
-	int goffset=tjGreenOffset[pf];
-	int boffset=tjBlueOffset[pf];
-	int aoffset=alphaOffset[pf];
-	int ps=tjPixelSize[pf];
-	int index, row, col, retval=1;
-	int halfway=16*sf.num/sf.denom;
-	int blocksize=8*sf.num/sf.denom;
-
-	if(pf==TJPF_CMYK)
-	{
-		for(row=0; row<h; row++)
-		{
-			for(col=0; col<w; col++)
-			{
-				unsigned char c, m, y, k;
-				if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
-				else index=row*w+col;
-				c=buf[index*ps];
-				m=buf[index*ps+1];
-				y=buf[index*ps+2];
-				k=buf[index*ps+3];
-				if(((row/blocksize)+(col/blocksize))%2==0)
-				{
-					checkval255(c);  checkval255(m);  checkval255(y);
-					if(row<halfway) checkval255(k)
-					else checkval0(k)
-				}
-				else
-				{
-					checkval255(c);  checkval0(y);  checkval255(k);
-					if(row<halfway) checkval0(m)
-					else checkval255(m)
-				}
-			}
-		}
-		return 1;
-	}
-
-	for(row=0; row<h; row++)
-	{
-		for(col=0; col<w; col++)
-		{
-			unsigned char r, g, b, a;
-			if(flags&TJFLAG_BOTTOMUP) index=(h-row-1)*w+col;
-			else index=row*w+col;
-			r=buf[index*ps+roffset];
-			g=buf[index*ps+goffset];
-			b=buf[index*ps+boffset];
-			a=aoffset>=0? buf[index*ps+aoffset]:0xFF;
-			if(((row/blocksize)+(col/blocksize))%2==0)
-			{
-				if(row<halfway)
-				{
-					checkval255(r);  checkval255(g);  checkval255(b);
-				}
-				else
-				{
-					checkval0(r);  checkval0(g);  checkval0(b);
-				}
-			}
-			else
-			{
-				if(subsamp==TJSAMP_GRAY)
-				{
-					if(row<halfway)
-					{
-						checkval(r, 76);  checkval(g, 76);  checkval(b, 76);
-					}
-					else
-					{
-						checkval(r, 226);  checkval(g, 226);  checkval(b, 226);
-					}
-				}
-				else
-				{
-					if(row<halfway)
-					{
-						checkval255(r);  checkval0(g);  checkval0(b);
-					}
-					else
-					{
-						checkval255(r);  checkval255(g);  checkval0(b);
-					}
-				}
-			}
-			checkval255(a);
-		}
-	}
-
-	bailout:
-	if(retval==0)
-	{
-		for(row=0; row<h; row++)
-		{
-			for(col=0; col<w; col++)
-			{
-				if(pf==TJPF_CMYK)
-					printf("%.3d/%.3d/%.3d/%.3d ", buf[(row*w+col)*ps],
-						buf[(row*w+col)*ps+1], buf[(row*w+col)*ps+2],
-						buf[(row*w+col)*ps+3]);
-				else
-					printf("%.3d/%.3d/%.3d ", buf[(row*w+col)*ps+roffset],
-						buf[(row*w+col)*ps+goffset], buf[(row*w+col)*ps+boffset]);
-			}
-			printf("\n");
-		}
-	}
-	return retval;
+  if (v < 254) { \
+    printf("\nComp. %s at %d,%d should be 255, not %d\n", #v, row, col, v); \
+    retval = 0;  exitStatus = -1;  goto bailout; \
+  } \
 }
 
 
-#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+int checkBuf(unsigned char *buf, int w, int h, int pf, int subsamp,
+             tjscalingfactor sf, int flags)
+{
+  int roffset = tjRedOffset[pf];
+  int goffset = tjGreenOffset[pf];
+  int boffset = tjBlueOffset[pf];
+  int aoffset = tjAlphaOffset[pf];
+  int ps = tjPixelSize[pf];
+  int index, row, col, retval = 1;
+  int halfway = 16 * sf.num / sf.denom;
+  int blocksize = 8 * sf.num / sf.denom;
+
+  if (pf == TJPF_GRAY) roffset = goffset = boffset = 0;
+
+  if (pf == TJPF_CMYK) {
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        unsigned char c, m, y, k;
+
+        if (flags & TJFLAG_BOTTOMUP) index = (h - row - 1) * w + col;
+        else index = row * w + col;
+        c = buf[index * ps];
+        m = buf[index * ps + 1];
+        y = buf[index * ps + 2];
+        k = buf[index * ps + 3];
+        if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
+          checkval255(c);  checkval255(m);  checkval255(y);
+          if (row < halfway) checkval255(k)
+          else checkval0(k)
+        } else {
+          checkval255(c);  checkval0(y);  checkval255(k);
+          if (row < halfway) checkval0(m)
+          else checkval255(m)
+        }
+      }
+    }
+    return 1;
+  }
+
+  for (row = 0; row < h; row++) {
+    for (col = 0; col < w; col++) {
+      unsigned char r, g, b, a;
+
+      if (flags & TJFLAG_BOTTOMUP) index = (h - row - 1) * w + col;
+      else index = row * w + col;
+      r = buf[index * ps + roffset];
+      g = buf[index * ps + goffset];
+      b = buf[index * ps + boffset];
+      a = aoffset >= 0 ? buf[index * ps + aoffset] : 0xFF;
+      if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
+        if (row < halfway) {
+          checkval255(r);  checkval255(g);  checkval255(b);
+        } else {
+          checkval0(r);  checkval0(g);  checkval0(b);
+        }
+      } else {
+        if (subsamp == TJSAMP_GRAY) {
+          if (row < halfway) {
+            checkval(r, 76);  checkval(g, 76);  checkval(b, 76);
+          } else {
+            checkval(r, 226);  checkval(g, 226);  checkval(b, 226);
+          }
+        } else {
+          if (row < halfway) {
+            checkval255(r);  checkval0(g);  checkval0(b);
+          } else {
+            checkval255(r);  checkval255(g);  checkval0(b);
+          }
+        }
+      }
+      checkval255(a);
+    }
+  }
+
+bailout:
+  if (retval == 0) {
+    for (row = 0; row < h; row++) {
+      for (col = 0; col < w; col++) {
+        if (pf == TJPF_CMYK)
+          printf("%.3d/%.3d/%.3d/%.3d ", buf[(row * w + col) * ps],
+                 buf[(row * w + col) * ps + 1], buf[(row * w + col) * ps + 2],
+                 buf[(row * w + col) * ps + 3]);
+        else
+          printf("%.3d/%.3d/%.3d ", buf[(row * w + col) * ps + roffset],
+                 buf[(row * w + col) * ps + goffset],
+                 buf[(row * w + col) * ps + boffset]);
+      }
+      printf("\n");
+    }
+  }
+  return retval;
+}
+
+
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
 
 int checkBufYUV(unsigned char *buf, int w, int h, int subsamp,
-	tjscalingfactor sf)
+                tjscalingfactor sf)
 {
-	int row, col;
-	int hsf=tjMCUWidth[subsamp]/8, vsf=tjMCUHeight[subsamp]/8;
-	int pw=PAD(w, hsf), ph=PAD(h, vsf);
-	int cw=pw/hsf, ch=ph/vsf;
-	int ypitch=PAD(pw, pad), uvpitch=PAD(cw, pad);
-	int retval=1;
-	int halfway=16*sf.num/sf.denom;
-	int blocksize=8*sf.num/sf.denom;
+  int row, col;
+  int hsf = tjMCUWidth[subsamp] / 8, vsf = tjMCUHeight[subsamp] / 8;
+  int pw = PAD(w, hsf), ph = PAD(h, vsf);
+  int cw = pw / hsf, ch = ph / vsf;
+  int ypitch = PAD(pw, pad), uvpitch = PAD(cw, pad);
+  int retval = 1;
+  int halfway = 16 * sf.num / sf.denom;
+  int blocksize = 8 * sf.num / sf.denom;
 
-	for(row=0; row<ph; row++)
-	{
-		for(col=0; col<pw; col++)
-		{
-			unsigned char y=buf[ypitch*row+col];
-			if(((row/blocksize)+(col/blocksize))%2==0)
-			{
-				if(row<halfway) checkval255(y)  else checkval0(y);
-			}
-			else
-			{
-				if(row<halfway) checkval(y, 76)  else checkval(y, 226);
-			}
-		}
-	}
-	if(subsamp!=TJSAMP_GRAY)
-	{
-		int halfway=16/vsf*sf.num/sf.denom;
-		for(row=0; row<ch; row++)
-		{
-			for(col=0; col<cw; col++)
-			{
-				unsigned char u=buf[ypitch*ph + (uvpitch*row+col)],
-					v=buf[ypitch*ph + uvpitch*ch + (uvpitch*row+col)];
-				if(((row*vsf/blocksize)+(col*hsf/blocksize))%2==0)
-				{
-					checkval(u, 128);  checkval(v, 128);
-				}
-				else
-				{
-					if(row<halfway)
-					{
-						checkval(u, 85);  checkval255(v);
-					}
-					else
-					{
-						checkval0(u);  checkval(v, 149);
-					}
-				}
-			}
-		}
-	}
+  for (row = 0; row < ph; row++) {
+    for (col = 0; col < pw; col++) {
+      unsigned char y = buf[ypitch * row + col];
 
-	bailout:
-	if(retval==0)
-	{
-		for(row=0; row<ph; row++)
-		{
-			for(col=0; col<pw; col++)
-				printf("%.3d ", buf[ypitch*row+col]);
-			printf("\n");
-		}
-		printf("\n");
-		for(row=0; row<ch; row++)
-		{
-			for(col=0; col<cw; col++)
-				printf("%.3d ", buf[ypitch*ph + (uvpitch*row+col)]);
-			printf("\n");
-		}
-		printf("\n");
-		for(row=0; row<ch; row++)
-		{
-			for(col=0; col<cw; col++)
-				printf("%.3d ", buf[ypitch*ph + uvpitch*ch + (uvpitch*row+col)]);
-			printf("\n");
-		}
-	}
+      if (((row / blocksize) + (col / blocksize)) % 2 == 0) {
+        if (row < halfway) checkval255(y)
+        else checkval0(y);
+      } else {
+        if (row < halfway) checkval(y, 76)
+        else checkval(y, 226);
+      }
+    }
+  }
+  if (subsamp != TJSAMP_GRAY) {
+    int halfway = 16 / vsf * sf.num / sf.denom;
 
-	return retval;
+    for (row = 0; row < ch; row++) {
+      for (col = 0; col < cw; col++) {
+        unsigned char u = buf[ypitch * ph + (uvpitch * row + col)],
+          v = buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)];
+
+        if (((row * vsf / blocksize) + (col * hsf / blocksize)) % 2 == 0) {
+          checkval(u, 128);  checkval(v, 128);
+        } else {
+          if (row < halfway) {
+            checkval(u, 85);  checkval255(v);
+          } else {
+            checkval0(u);  checkval(v, 149);
+          }
+        }
+      }
+    }
+  }
+
+bailout:
+  if (retval == 0) {
+    for (row = 0; row < ph; row++) {
+      for (col = 0; col < pw; col++)
+        printf("%.3d ", buf[ypitch * row + col]);
+      printf("\n");
+    }
+    printf("\n");
+    for (row = 0; row < ch; row++) {
+      for (col = 0; col < cw; col++)
+        printf("%.3d ", buf[ypitch * ph + (uvpitch * row + col)]);
+      printf("\n");
+    }
+    printf("\n");
+    for (row = 0; row < ch; row++) {
+      for (col = 0; col < cw; col++)
+        printf("%.3d ",
+               buf[ypitch * ph + uvpitch * ch + (uvpitch * row + col)]);
+      printf("\n");
+    }
+  }
+
+  return retval;
 }
 
 
 void writeJPEG(unsigned char *jpegBuf, unsigned long jpegSize, char *filename)
 {
-	FILE *file=fopen(filename, "wb");
-	if(!file || fwrite(jpegBuf, jpegSize, 1, file)!=1)
-	{
-		printf("ERROR: Could not write to %s.\n%s\n", filename, strerror(errno));
-		bailout();
-	}
+  FILE *file = fopen(filename, "wb");
 
-	bailout:
-	if(file) fclose(file);
+  if (!file || fwrite(jpegBuf, jpegSize, 1, file) != 1) {
+    printf("ERROR: Could not write to %s.\n%s\n", filename, strerror(errno));
+    bailout();
+  }
+
+bailout:
+  if (file) fclose(file);
 }
 
 
-void compTest(tjhandle handle, unsigned char **dstBuf,
-	unsigned long *dstSize, int w, int h, int pf, char *basename,
-	int subsamp, int jpegQual, int flags)
+void compTest(tjhandle handle, unsigned char **dstBuf, unsigned long *dstSize,
+              int w, int h, int pf, char *basename, int subsamp, int jpegQual,
+              int flags)
 {
-	char tempStr[1024];  unsigned char *srcBuf=NULL, *yuvBuf=NULL;
-	const char *pfStr=pixFormatStr[pf];
-	const char *buStrLong=(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ";
-	const char *buStr=(flags&TJFLAG_BOTTOMUP)? "BU":"TD";
+  char tempStr[1024];
+  unsigned char *srcBuf = NULL, *yuvBuf = NULL;
+  const char *pfStr = pixFormatStr[pf];
+  const char *buStrLong =
+    (flags & TJFLAG_BOTTOMUP) ? "Bottom-Up" : "Top-Down ";
+  const char *buStr = (flags & TJFLAG_BOTTOMUP) ? "BU" : "TD";
 
-	if((srcBuf=(unsigned char *)malloc(w*h*tjPixelSize[pf]))==NULL)
-		_throw("Memory allocation failure");
-	initBuf(srcBuf, w, h, pf, flags);
+  if ((srcBuf = (unsigned char *)malloc(w * h * tjPixelSize[pf])) == NULL)
+    _throw("Memory allocation failure");
+  initBuf(srcBuf, w, h, pf, flags);
 
-	if(*dstBuf && *dstSize>0) memset(*dstBuf, 0, *dstSize);
+  if (*dstBuf && *dstSize > 0) memset(*dstBuf, 0, *dstSize);
 
+  if (!alloc) flags |= TJFLAG_NOREALLOC;
+  if (doYUV) {
+    unsigned long yuvSize = tjBufSizeYUV2(w, pad, h, subsamp);
+    tjscalingfactor sf = { 1, 1 };
+    tjhandle handle2 = tjInitCompress();
 
-	if(!alloc) flags|=TJFLAG_NOREALLOC;
-	if(doyuv)
-	{
-		unsigned long yuvSize=tjBufSizeYUV2(w, pad, h, subsamp);
-		tjscalingfactor sf={1, 1};
-		tjhandle handle2=tjInitCompress();
-		if(!handle2) _throwtj();
+    if (!handle2) _throwtj();
 
-		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
-			_throw("Memory allocation failure");
-		memset(yuvBuf, 0, yuvSize);
+    if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+      _throw("Memory allocation failure");
+    memset(yuvBuf, 0, yuvSize);
 
-		printf("%s %s -> YUV %s ... ", pfStr, buStrLong, subNameLong[subsamp]);
-		_tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
-			flags));
-		tjDestroy(handle2);
-		if(checkBufYUV(yuvBuf, w, h, subsamp, sf)) printf("Passed.\n");
-		else printf("FAILED!\n");
+    printf("%s %s -> YUV %s ... ", pfStr, buStrLong, subNameLong[subsamp]);
+    _tj(tjEncodeYUV3(handle2, srcBuf, w, 0, h, pf, yuvBuf, pad, subsamp,
+                     flags));
+    tjDestroy(handle2);
+    if (checkBufYUV(yuvBuf, w, h, subsamp, sf)) printf("Passed.\n");
+    else printf("FAILED!\n");
 
-		printf("YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
-			jpegQual);
-		_tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf,
-			dstSize, jpegQual, flags));
-	}
-	else
-	{
-		printf("%s %s -> %s Q%d ... ", pfStr, buStrLong, subNameLong[subsamp],
-			jpegQual);
-		_tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
-			jpegQual, flags));
-	}
+    printf("YUV %s %s -> JPEG Q%d ... ", subNameLong[subsamp], buStrLong,
+           jpegQual);
+    _tj(tjCompressFromYUV(handle, yuvBuf, w, pad, h, subsamp, dstBuf, dstSize,
+                          jpegQual, flags));
+  } else {
+    printf("%s %s -> %s Q%d ... ", pfStr, buStrLong, subNameLong[subsamp],
+           jpegQual);
+    _tj(tjCompress2(handle, srcBuf, w, 0, h, pf, dstBuf, dstSize, subsamp,
+                    jpegQual, flags));
+  }
 
-	snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
-		subName[subsamp], jpegQual);
-	writeJPEG(*dstBuf, *dstSize, tempStr);
-	printf("Done.\n  Result in %s\n", tempStr);
+  snprintf(tempStr, 1024, "%s_enc_%s_%s_%s_Q%d.jpg", basename, pfStr, buStr,
+           subName[subsamp], jpegQual);
+  writeJPEG(*dstBuf, *dstSize, tempStr);
+  printf("Done.\n  Result in %s\n", tempStr);
 
-	bailout:
-	if(yuvBuf) free(yuvBuf);
-	if(srcBuf) free(srcBuf);
+bailout:
+  if (yuvBuf) free(yuvBuf);
+  if (srcBuf) free(srcBuf);
 }
 
 
 void _decompTest(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, int w, int h, int pf, char *basename, int subsamp,
-	int flags, tjscalingfactor sf)
+                 unsigned long jpegSize, int w, int h, int pf, char *basename,
+                 int subsamp, int flags, tjscalingfactor sf)
 {
-	unsigned char *dstBuf=NULL, *yuvBuf=NULL;
-	int _hdrw=0, _hdrh=0, _hdrsubsamp=-1;
-	int scaledWidth=TJSCALED(w, sf);
-	int scaledHeight=TJSCALED(h, sf);
-	unsigned long dstSize=0;
+  unsigned char *dstBuf = NULL, *yuvBuf = NULL;
+  int _hdrw = 0, _hdrh = 0, _hdrsubsamp = -1;
+  int scaledWidth = TJSCALED(w, sf);
+  int scaledHeight = TJSCALED(h, sf);
+  unsigned long dstSize = 0;
 
-	_tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
-		&_hdrsubsamp));
-	if(_hdrw!=w || _hdrh!=h || _hdrsubsamp!=subsamp)
-		_throw("Incorrect JPEG header");
+  _tj(tjDecompressHeader2(handle, jpegBuf, jpegSize, &_hdrw, &_hdrh,
+                          &_hdrsubsamp));
+  if (_hdrw != w || _hdrh != h || _hdrsubsamp != subsamp)
+    _throw("Incorrect JPEG header");
 
-	dstSize=scaledWidth*scaledHeight*tjPixelSize[pf];
-	if((dstBuf=(unsigned char *)malloc(dstSize))==NULL)
-		_throw("Memory allocation failure");
-	memset(dstBuf, 0, dstSize);
+  dstSize = scaledWidth * scaledHeight * tjPixelSize[pf];
+  if ((dstBuf = (unsigned char *)malloc(dstSize)) == NULL)
+    _throw("Memory allocation failure");
+  memset(dstBuf, 0, dstSize);
 
-	if(doyuv)
-	{
-		unsigned long yuvSize=tjBufSizeYUV2(scaledWidth, pad, scaledHeight,
-			subsamp);
-		tjhandle handle2=tjInitDecompress();
-		if(!handle2) _throwtj();
+  if (doYUV) {
+    unsigned long yuvSize = tjBufSizeYUV2(scaledWidth, pad, scaledHeight,
+                                          subsamp);
+    tjhandle handle2 = tjInitDecompress();
 
-		if((yuvBuf=(unsigned char *)malloc(yuvSize))==NULL)
-			_throw("Memory allocation failure");
-		memset(yuvBuf, 0, yuvSize);
+    if (!handle2) _throwtj();
 
-		printf("JPEG -> YUV %s ", subNameLong[subsamp]);
-		if(sf.num!=1 || sf.denom!=1)
-			printf("%d/%d ... ", sf.num, sf.denom);
-		else printf("... ");
-		_tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth,
-			pad, scaledHeight, flags));
-		if(checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
-			printf("Passed.\n");
-		else printf("FAILED!\n");
+    if ((yuvBuf = (unsigned char *)malloc(yuvSize)) == NULL)
+      _throw("Memory allocation failure");
+    memset(yuvBuf, 0, yuvSize);
 
-		printf("YUV %s -> %s %s ... ", subNameLong[subsamp], pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ");
-		_tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
-			scaledHeight, pf, flags));
-		tjDestroy(handle2);
-	}
-	else
-	{
-		printf("JPEG -> %s %s ", pixFormatStr[pf],
-			(flags&TJFLAG_BOTTOMUP)? "Bottom-Up":"Top-Down ");
-		if(sf.num!=1 || sf.denom!=1)
-			printf("%d/%d ... ", sf.num, sf.denom);
-		else printf("... ");
-		_tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
-			scaledHeight, pf, flags));
-	}
+    printf("JPEG -> YUV %s ", subNameLong[subsamp]);
+    if (sf.num != 1 || sf.denom != 1)
+      printf("%d/%d ... ", sf.num, sf.denom);
+    else printf("... ");
+    _tj(tjDecompressToYUV2(handle, jpegBuf, jpegSize, yuvBuf, scaledWidth, pad,
+                           scaledHeight, flags));
+    if (checkBufYUV(yuvBuf, scaledWidth, scaledHeight, subsamp, sf))
+      printf("Passed.\n");
+    else printf("FAILED!\n");
 
-	if(checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
-		printf("Passed.");
-	else printf("FAILED!");
-	printf("\n");
+    printf("YUV %s -> %s %s ... ", subNameLong[subsamp], pixFormatStr[pf],
+           (flags & TJFLAG_BOTTOMUP) ? "Bottom-Up" : "Top-Down ");
+    _tj(tjDecodeYUV(handle2, yuvBuf, pad, subsamp, dstBuf, scaledWidth, 0,
+                    scaledHeight, pf, flags));
+    tjDestroy(handle2);
+  } else {
+    printf("JPEG -> %s %s ", pixFormatStr[pf],
+           (flags & TJFLAG_BOTTOMUP) ? "Bottom-Up" : "Top-Down ");
+    if (sf.num != 1 || sf.denom != 1)
+      printf("%d/%d ... ", sf.num, sf.denom);
+    else printf("... ");
+    _tj(tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, scaledWidth, 0,
+                      scaledHeight, pf, flags));
+  }
 
-	bailout:
-	if(yuvBuf) free(yuvBuf);
-	if(dstBuf) free(dstBuf);
+  if (checkBuf(dstBuf, scaledWidth, scaledHeight, pf, subsamp, sf, flags))
+    printf("Passed.");
+  else printf("FAILED!");
+  printf("\n");
+
+bailout:
+  if (yuvBuf) free(yuvBuf);
+  if (dstBuf) free(dstBuf);
 }
 
 
 void decompTest(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, int w, int h, int pf, char *basename, int subsamp,
-	int flags)
+                unsigned long jpegSize, int w, int h, int pf, char *basename,
+                int subsamp, int flags)
 {
-	int i, n=0;
-	tjscalingfactor *sf=tjGetScalingFactors(&n);
-	if(!sf || !n) _throwtj();
+  int i, n = 0;
+  tjscalingfactor *sf = tjGetScalingFactors(&n);
 
-	for(i=0; i<n; i++)
-	{
-		if(subsamp==TJSAMP_444 || subsamp==TJSAMP_GRAY ||
-			(subsamp==TJSAMP_411 && sf[i].num==1 &&
-				(sf[i].denom==2 || sf[i].denom==1)) ||
-			(subsamp!=TJSAMP_411 && sf[i].num==1 &&
-				(sf[i].denom==4 || sf[i].denom==2 || sf[i].denom==1)))
-			_decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp,
-				flags, sf[i]);
-	}
+  if (!sf || !n) _throwtj();
 
-	bailout:
-	return;
+  for (i = 0; i < n; i++) {
+    if (subsamp == TJSAMP_444 || subsamp == TJSAMP_GRAY ||
+        (subsamp == TJSAMP_411 && sf[i].num == 1 &&
+         (sf[i].denom == 2 || sf[i].denom == 1)) ||
+        (subsamp != TJSAMP_411 && sf[i].num == 1 &&
+         (sf[i].denom == 4 || sf[i].denom == 2 || sf[i].denom == 1)))
+      _decompTest(handle, jpegBuf, jpegSize, w, h, pf, basename, subsamp,
+                  flags, sf[i]);
+  }
+
+bailout:
+  return;
 }
 
 
 void doTest(int w, int h, const int *formats, int nformats, int subsamp,
-	char *basename)
+            char *basename)
 {
-	tjhandle chandle=NULL, dhandle=NULL;
-	unsigned char *dstBuf=NULL;
-	unsigned long size=0;  int pfi, pf, i;
+  tjhandle chandle = NULL, dhandle = NULL;
+  unsigned char *dstBuf = NULL;
+  unsigned long size = 0;
+  int pfi, pf, i;
 
-	if(!alloc)
-		size=tjBufSize(w, h, subsamp);
-	if(size!=0)
-		if((dstBuf=(unsigned char *)tjAlloc(size))==NULL)
-			_throw("Memory allocation failure.");
+  if (!alloc)
+    size = tjBufSize(w, h, subsamp);
+  if (size != 0)
+    if ((dstBuf = (unsigned char *)tjAlloc(size)) == NULL)
+      _throw("Memory allocation failure.");
 
-	if((chandle=tjInitCompress())==NULL || (dhandle=tjInitDecompress())==NULL)
-		_throwtj();
+  if ((chandle = tjInitCompress()) == NULL ||
+      (dhandle = tjInitDecompress()) == NULL)
+    _throwtj();
 
-	for(pfi=0; pfi<nformats; pfi++)
-	{
-		for(i=0; i<2; i++)
-		{
-			int flags=0;
-			if(subsamp==TJSAMP_422 || subsamp==TJSAMP_420 || subsamp==TJSAMP_440 ||
-				subsamp==TJSAMP_411)
-				flags|=TJFLAG_FASTUPSAMPLE;
-			if(i==1) flags|=TJFLAG_BOTTOMUP;
-			pf=formats[pfi];
-			compTest(chandle, &dstBuf, &size, w, h, pf, basename, subsamp, 100,
-				flags);
-			decompTest(dhandle, dstBuf, size, w, h, pf, basename, subsamp,
-				flags);
-			if(pf>=TJPF_RGBX && pf<=TJPF_XRGB)
-			{
-				printf("\n");
-				decompTest(dhandle, dstBuf, size, w, h, pf+(TJPF_RGBA-TJPF_RGBX),
-					basename, subsamp, flags);
-			}
-			printf("\n");
-		}
-	}
-	printf("--------------------\n\n");
+  for (pfi = 0; pfi < nformats; pfi++) {
+    for (i = 0; i < 2; i++) {
+      int flags = 0;
 
-	bailout:
-	if(chandle) tjDestroy(chandle);
-	if(dhandle) tjDestroy(dhandle);
+      if (subsamp == TJSAMP_422 || subsamp == TJSAMP_420 ||
+          subsamp == TJSAMP_440 || subsamp == TJSAMP_411)
+        flags |= TJFLAG_FASTUPSAMPLE;
+      if (i == 1) flags |= TJFLAG_BOTTOMUP;
+      pf = formats[pfi];
+      compTest(chandle, &dstBuf, &size, w, h, pf, basename, subsamp, 100,
+               flags);
+      decompTest(dhandle, dstBuf, size, w, h, pf, basename, subsamp, flags);
+      if (pf >= TJPF_RGBX && pf <= TJPF_XRGB) {
+        printf("\n");
+        decompTest(dhandle, dstBuf, size, w, h, pf + (TJPF_RGBA - TJPF_RGBX),
+                   basename, subsamp, flags);
+      }
+      printf("\n");
+    }
+  }
+  printf("--------------------\n\n");
 
-	if(dstBuf) tjFree(dstBuf);
+bailout:
+  if (chandle) tjDestroy(chandle);
+  if (dhandle) tjDestroy(dhandle);
+  if (dstBuf) tjFree(dstBuf);
 }
 
 
 void bufSizeTest(void)
 {
-	int w, h, i, subsamp;
-	unsigned char *srcBuf=NULL, *dstBuf=NULL;
-	tjhandle handle=NULL;
-	unsigned long dstSize=0;
+  int w, h, i, subsamp;
+  unsigned char *srcBuf = NULL, *dstBuf = NULL;
+  tjhandle handle = NULL;
+  unsigned long dstSize = 0;
 
-	if((handle=tjInitCompress())==NULL) _throwtj();
+  if ((handle = tjInitCompress()) == NULL) _throwtj();
 
-	printf("Buffer size regression test\n");
-	for(subsamp=0; subsamp<TJ_NUMSAMP; subsamp++)
-	{
-		for(w=1; w<48; w++)
-		{
-			int maxh=(w==1)? 2048:48;
-			for(h=1; h<maxh; h++)
-			{
-				if(h%100==0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
-				if((srcBuf=(unsigned char *)malloc(w*h*4))==NULL)
-					_throw("Memory allocation failure");
-				if(!alloc || doyuv)
-				{
-					if(doyuv) dstSize=tjBufSizeYUV2(w, pad, h, subsamp);
-					else dstSize=tjBufSize(w, h, subsamp);
-					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
-						_throw("Memory allocation failure");
-				}
+  printf("Buffer size regression test\n");
+  for (subsamp = 0; subsamp < TJ_NUMSAMP; subsamp++) {
+    for (w = 1; w < 48; w++) {
+      int maxh = (w == 1) ? 2048 : 48;
 
-				for(i=0; i<w*h*4; i++)
-				{
-					if(random()<RAND_MAX/2) srcBuf[i]=0;
-					else srcBuf[i]=255;
-				}
+      for (h = 1; h < maxh; h++) {
+        if (h % 100 == 0) printf("%.4d x %.4d\b\b\b\b\b\b\b\b\b\b\b", w, h);
+        if ((srcBuf = (unsigned char *)malloc(w * h * 4)) == NULL)
+          _throw("Memory allocation failure");
+        if (!alloc || doYUV) {
+          if (doYUV) dstSize = tjBufSizeYUV2(w, pad, h, subsamp);
+          else dstSize = tjBufSize(w, h, subsamp);
+          if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
+            _throw("Memory allocation failure");
+        }
 
-				if(doyuv)
-				{
-					_tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
-						subsamp, 0));
-				}
-				else
-				{
-					_tj(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
-						&dstSize, subsamp, 100, alloc? 0:TJFLAG_NOREALLOC));
-				}
-				free(srcBuf);  srcBuf=NULL;
-				if(!alloc || doyuv)
-				{
-					tjFree(dstBuf);  dstBuf=NULL;
-				}
+        for (i = 0; i < w * h * 4; i++) {
+          if (random() < RAND_MAX / 2) srcBuf[i] = 0;
+          else srcBuf[i] = 255;
+        }
 
-				if((srcBuf=(unsigned char *)malloc(h*w*4))==NULL)
-					_throw("Memory allocation failure");
-				if(!alloc || doyuv)
-				{
-					if(doyuv) dstSize=tjBufSizeYUV2(h, pad, w, subsamp);
-					else dstSize=tjBufSize(h, w, subsamp);
-					if((dstBuf=(unsigned char *)tjAlloc(dstSize))==NULL)
-						_throw("Memory allocation failure");
-				}
+        if (doYUV) {
+          _tj(tjEncodeYUV3(handle, srcBuf, w, 0, h, TJPF_BGRX, dstBuf, pad,
+                           subsamp, 0));
+        } else {
+          _tj(tjCompress2(handle, srcBuf, w, 0, h, TJPF_BGRX, &dstBuf,
+                          &dstSize, subsamp, 100,
+                          alloc ? 0 : TJFLAG_NOREALLOC));
+        }
+        free(srcBuf);  srcBuf = NULL;
+        if (!alloc || doYUV) {
+          tjFree(dstBuf);  dstBuf = NULL;
+        }
 
-				for(i=0; i<h*w*4; i++)
-				{
-					if(random()<RAND_MAX/2) srcBuf[i]=0;
-					else srcBuf[i]=255;
-				}
+        if ((srcBuf = (unsigned char *)malloc(h * w * 4)) == NULL)
+          _throw("Memory allocation failure");
+        if (!alloc || doYUV) {
+          if (doYUV) dstSize = tjBufSizeYUV2(h, pad, w, subsamp);
+          else dstSize = tjBufSize(h, w, subsamp);
+          if ((dstBuf = (unsigned char *)tjAlloc(dstSize)) == NULL)
+            _throw("Memory allocation failure");
+        }
 
-				if(doyuv)
-				{
-					_tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
-						subsamp, 0));
-				}
-				else
-				{
-					_tj(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
-						&dstSize, subsamp, 100, alloc? 0:TJFLAG_NOREALLOC));
-				}
-				free(srcBuf);  srcBuf=NULL;
-				if(!alloc || doyuv)
-				{
-					tjFree(dstBuf);  dstBuf=NULL;
-				}
-			}
-		}
-	}
-	printf("Done.      \n");
+        for (i = 0; i < h * w * 4; i++) {
+          if (random() < RAND_MAX / 2) srcBuf[i] = 0;
+          else srcBuf[i] = 255;
+        }
 
-	bailout:
-	if(srcBuf) free(srcBuf);
-	if(dstBuf) tjFree(dstBuf);
-	if(handle) tjDestroy(handle);
+        if (doYUV) {
+          _tj(tjEncodeYUV3(handle, srcBuf, h, 0, w, TJPF_BGRX, dstBuf, pad,
+                           subsamp, 0));
+        } else {
+          _tj(tjCompress2(handle, srcBuf, h, 0, w, TJPF_BGRX, &dstBuf,
+                          &dstSize, subsamp, 100,
+                          alloc ? 0 : TJFLAG_NOREALLOC));
+        }
+        free(srcBuf);  srcBuf = NULL;
+        if (!alloc || doYUV) {
+          tjFree(dstBuf);  dstBuf = NULL;
+        }
+      }
+    }
+  }
+  printf("Done.      \n");
+
+bailout:
+  if (srcBuf) free(srcBuf);
+  if (dstBuf) tjFree(dstBuf);
+  if (handle) tjDestroy(handle);
+}
+
+
+void initBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
+                int flags)
+{
+  int roffset = tjRedOffset[pf];
+  int goffset = tjGreenOffset[pf];
+  int boffset = tjBlueOffset[pf];
+  int ps = tjPixelSize[pf];
+  int i, j;
+
+  for (j = 0; j < height; j++) {
+    int row = (flags & TJFLAG_BOTTOMUP) ? height - j - 1 : j;
+
+    for (i = 0; i < width; i++) {
+      unsigned char r = (i * 256 / width) % 256;
+      unsigned char g = (j * 256 / height) % 256;
+      unsigned char b = (j * 256 / height + i * 256 / width) % 256;
+
+      memset(&buf[row * pitch + i * ps], 0, ps);
+      if (pf == TJPF_GRAY) buf[row * pitch + i * ps] = b;
+      else if (pf == TJPF_CMYK)
+        rgb_to_cmyk(r, g, b, &buf[row * pitch + i * ps + 0],
+                    &buf[row * pitch + i * ps + 1],
+                    &buf[row * pitch + i * ps + 2],
+                    &buf[row * pitch + i * ps + 3]);
+      else {
+        buf[row * pitch + i * ps + roffset] = r;
+        buf[row * pitch + i * ps + goffset] = g;
+        buf[row * pitch + i * ps + boffset] = b;
+      }
+    }
+  }
+}
+
+
+int cmpBitmap(unsigned char *buf, int width, int pitch, int height, int pf,
+              int flags, int gray2rgb)
+{
+  int roffset = tjRedOffset[pf];
+  int goffset = tjGreenOffset[pf];
+  int boffset = tjBlueOffset[pf];
+  int aoffset = tjAlphaOffset[pf];
+  int ps = tjPixelSize[pf];
+  int i, j;
+
+  for (j = 0; j < height; j++) {
+    int row = (flags & TJFLAG_BOTTOMUP) ? height - j - 1 : j;
+
+    for (i = 0; i < width; i++) {
+      unsigned char r = (i * 256 / width) % 256;
+      unsigned char g = (j * 256 / height) % 256;
+      unsigned char b = (j * 256 / height + i * 256 / width) % 256;
+
+      if (pf == TJPF_GRAY) {
+        if (buf[row * pitch + i * ps] != b)
+          return 0;
+      } else if (pf == TJPF_CMYK) {
+        unsigned char rf, gf, bf;
+
+        cmyk_to_rgb(buf[row * pitch + i * ps + 0],
+                    buf[row * pitch + i * ps + 1],
+                    buf[row * pitch + i * ps + 2],
+                    buf[row * pitch + i * ps + 3], &rf, &gf, &bf);
+        if (gray2rgb) {
+          if (rf != b || gf != b || bf != b)
+            return 0;
+        } else if (rf != r || gf != g || bf != b) return 0;
+      } else {
+        if (gray2rgb) {
+          if (buf[row * pitch + i * ps + roffset] != b ||
+              buf[row * pitch + i * ps + goffset] != b ||
+              buf[row * pitch + i * ps + boffset] != b)
+            return 0;
+        } else if (buf[row * pitch + i * ps + roffset] != r ||
+                   buf[row * pitch + i * ps + goffset] != g ||
+                   buf[row * pitch + i * ps + boffset] != b)
+          return 0;
+        if (aoffset >= 0 && buf[row * pitch + i * ps + aoffset] != 0xFF)
+          return 0;
+      }
+    }
+  }
+  return 1;
+}
+
+
+int doBmpTest(const char *ext, int width, int align, int height, int pf,
+              int flags)
+{
+  char filename[80], *md5sum, md5buf[65];
+  int ps = tjPixelSize[pf], pitch = PAD(width * ps, align), loadWidth = 0,
+    loadHeight = 0, retval = 0, pixelFormat = pf;
+  unsigned char *buf = NULL;
+  char *md5ref;
+
+  if (pf == TJPF_GRAY) {
+    md5ref = !strcasecmp(ext, "ppm") ? "112c682e82ce5de1cca089e20d60000b" :
+                                       "51976530acf75f02beddf5d21149101d";
+  } else {
+    md5ref = !strcasecmp(ext, "ppm") ? "c0c9f772b464d1896326883a5c79c545" :
+                                       "6d659071b9bfcdee2def22cb58ddadca";
+  }
+
+  if ((buf = (unsigned char *)tjAlloc(pitch * height)) == NULL)
+    _throw("Could not allocate memory");
+  initBitmap(buf, width, pitch, height, pf, flags);
+
+  snprintf(filename, 80, "test_bmp_%s_%d_%s.%s", pixFormatStr[pf], align,
+           (flags & TJFLAG_BOTTOMUP) ? "bu" : "td", ext);
+  _tj(tjSaveImage(filename, buf, width, pitch, height, pf, flags));
+  md5sum = MD5File(filename, md5buf);
+  if (strcasecmp(md5sum, md5ref))
+    _throwmd5(filename, md5sum, md5ref);
+
+  tjFree(buf);  buf = NULL;
+  if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
+                         flags)) == NULL)
+    _throwtj();
+  if (width != loadWidth || height != loadHeight) {
+    printf("\n   Image dimensions of %s are bogus\n", filename);
+    retval = -1;  goto bailout;
+  }
+  if (!cmpBitmap(buf, width, pitch, height, pf, flags, 0)) {
+    printf("\n   Pixel data in %s is bogus\n", filename);
+    retval = -1;  goto bailout;
+  }
+  if (pf == TJPF_GRAY) {
+    tjFree(buf);  buf = NULL;
+    pf = TJPF_XBGR;
+    if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
+                           flags)) == NULL)
+      _throwtj();
+    pitch = PAD(width * tjPixelSize[pf], align);
+    if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
+      printf("\n   Converting %s to RGB failed\n", filename);
+      retval = -1;  goto bailout;
+    }
+
+    tjFree(buf);  buf = NULL;
+    pf = TJPF_CMYK;
+    if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight, &pf,
+                           flags)) == NULL)
+      _throwtj();
+    pitch = PAD(width * tjPixelSize[pf], align);
+    if (!cmpBitmap(buf, width, pitch, height, pf, flags, 1)) {
+      printf("\n   Converting %s to CMYK failed\n", filename);
+      retval = -1;  goto bailout;
+    }
+  }
+  /* Verify that tjLoadImage() returns the proper "preferred" pixel format for
+     the file type. */
+  tjFree(buf);  buf = NULL;
+  pf = pixelFormat;
+  pixelFormat = TJPF_UNKNOWN;
+  if ((buf = tjLoadImage(filename, &loadWidth, align, &loadHeight,
+                         &pixelFormat, flags)) == NULL)
+    _throwtj();
+  if ((pf == TJPF_GRAY && pixelFormat != TJPF_GRAY) ||
+      (pf != TJPF_GRAY && !strcasecmp(ext, "bmp") &&
+       pixelFormat != TJPF_BGR) ||
+      (pf != TJPF_GRAY && !strcasecmp(ext, "ppm") &&
+       pixelFormat != TJPF_RGB)) {
+    printf("\n   tjLoadImage() returned unexpected pixel format: %s\n",
+           pixFormatStr[pixelFormat]);
+    retval = -1;
+  }
+  unlink(filename);
+
+bailout:
+  if (buf) tjFree(buf);
+  if (exitStatus < 0) return exitStatus;
+  return retval;
+}
+
+
+int bmpTest(void)
+{
+  int align, width = 35, height = 39, format;
+  /* Returns 0 if every doBmpTest() case passes, -1 on the first failure. */
+  for (align = 1; align <= 8; align *= 2) {
+    for (format = 0; format < TJ_NUMPF; format++) {
+      printf("%s Top-Down BMP (row alignment = %d bytes)  ...  ",
+             pixFormatStr[format], align);
+      if (doBmpTest("bmp", width, align, height, format, 0) == -1)
+        return -1;
+      printf("OK.\n");
+
+      printf("%s Top-Down PPM (row alignment = %d bytes)  ...  ",
+             pixFormatStr[format], align);
+      if (doBmpTest("ppm", width, align, height, format, 0) == -1)
+        return -1;
+      printf("OK.\n");
+
+      printf("%s Bottom-Up BMP (row alignment = %d bytes)  ...  ",
+             pixFormatStr[format], align);
+      if (doBmpTest("bmp", width, align, height, format,
+                    TJFLAG_BOTTOMUP) == -1)
+        return -1;
+      printf("OK.\n");
+
+      printf("%s Bottom-Up PPM (row alignment = %d bytes)  ...  ",
+             pixFormatStr[format], align);
+      if (doBmpTest("ppm", width, align, height, format,
+                    TJFLAG_BOTTOMUP) == -1)
+        return -1;
+      printf("OK.\n");
+    }
+  }
+
+  return 0;
 }
 
 
 int main(int argc, char *argv[])
 {
-	int i, num4bf=5;
-	#ifdef _WIN32
-	srand((unsigned int)time(NULL));
-	#endif
-	if(argc>1)
-	{
-		for(i=1; i<argc; i++)
-		{
-			if(!strcasecmp(argv[i], "-yuv")) doyuv=1;
-			else if(!strcasecmp(argv[i], "-noyuvpad")) pad=1;
-			else if(!strcasecmp(argv[i], "-alloc")) alloc=1;
-			else usage(argv[0]);
-		}
-	}
-	if(alloc) printf("Testing automatic buffer allocation\n");
-	if(doyuv) num4bf=4;
-	doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
-	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
-	doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
-	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_422, "test");
-	doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
-	doTest(41, 35, _4byteFormats, num4bf, TJSAMP_420, "test");
-	doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
-	doTest(39, 41, _4byteFormats, num4bf, TJSAMP_440, "test");
-	doTest(41, 35, _3byteFormats, 2, TJSAMP_411, "test");
-	doTest(35, 39, _4byteFormats, num4bf, TJSAMP_411, "test");
-	doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test");
-	doTest(41, 35, _3byteFormats, 2, TJSAMP_GRAY, "test");
-	doTest(35, 39, _4byteFormats, 4, TJSAMP_GRAY, "test");
-	bufSizeTest();
-	if(doyuv)
-	{
-		printf("\n--------------------\n\n");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_444, "test_yuv0");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_422, "test_yuv0");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_420, "test_yuv0");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_440, "test_yuv0");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_411, "test_yuv0");
-		doTest(48, 48, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv0");
-		doTest(48, 48, _onlyGray, 1, TJSAMP_GRAY, "test_yuv0");
-	}
+  int i, num4bf = 5;
 
-	return exitStatus;
+#ifdef _WIN32
+  srand((unsigned int)time(NULL));
+#endif
+  if (argc > 1) {
+    for (i = 1; i < argc; i++) {
+      if (!strcasecmp(argv[i], "-yuv")) doYUV = 1;
+      else if (!strcasecmp(argv[i], "-noyuvpad")) pad = 1;
+      else if (!strcasecmp(argv[i], "-alloc")) alloc = 1;
+      else if (!strcasecmp(argv[i], "-bmp")) return bmpTest();
+      else usage(argv[0]);
+    }
+  }
+  if (alloc) printf("Testing automatic buffer allocation\n");
+  if (doYUV) num4bf = 4;
+  doTest(35, 39, _3byteFormats, 2, TJSAMP_444, "test");
+  doTest(39, 41, _4byteFormats, num4bf, TJSAMP_444, "test");
+  doTest(41, 35, _3byteFormats, 2, TJSAMP_422, "test");
+  doTest(35, 39, _4byteFormats, num4bf, TJSAMP_422, "test");
+  doTest(39, 41, _3byteFormats, 2, TJSAMP_420, "test");
+  doTest(41, 35, _4byteFormats, num4bf, TJSAMP_420, "test");
+  doTest(35, 39, _3byteFormats, 2, TJSAMP_440, "test");
+  doTest(39, 41, _4byteFormats, num4bf, TJSAMP_440, "test");
+  doTest(41, 35, _3byteFormats, 2, TJSAMP_411, "test");
+  doTest(35, 39, _4byteFormats, num4bf, TJSAMP_411, "test");
+  doTest(39, 41, _onlyGray, 1, TJSAMP_GRAY, "test");
+  doTest(41, 35, _3byteFormats, 2, TJSAMP_GRAY, "test");
+  doTest(35, 39, _4byteFormats, 4, TJSAMP_GRAY, "test");
+  bufSizeTest();
+  if (doYUV) {
+    printf("\n--------------------\n\n");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_444, "test_yuv0");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_422, "test_yuv0");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_420, "test_yuv0");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_440, "test_yuv0");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_411, "test_yuv0");
+    doTest(48, 48, _onlyRGB, 1, TJSAMP_GRAY, "test_yuv0");
+    doTest(48, 48, _onlyGray, 1, TJSAMP_GRAY, "test_yuv0");
+  }
+
+  return exitStatus;
 }
diff --git a/tjutil.c b/tjutil.c
index 6618d15..b44086d 100644
--- a/tjutil.c
+++ b/tjutil.c
@@ -30,25 +30,26 @@
 
 #include <windows.h>
 
-static double getfreq(void)
+static double getFreq(void)
 {
-	LARGE_INTEGER freq;
-	if(!QueryPerformanceFrequency(&freq)) return 0.0;
-	return (double)freq.QuadPart;
+  LARGE_INTEGER freq;
+
+  if (!QueryPerformanceFrequency(&freq)) return 0.0;
+  return (double)freq.QuadPart;
 }
 
-static double f=-1.0;
+static double f = -1.0;
 
-double gettime(void)
+double getTime(void)
 {
-	LARGE_INTEGER t;
-	if(f<0.0) f=getfreq();
-	if(f==0.0) return (double)GetTickCount()/1000.;
-	else
-	{
-		QueryPerformanceCounter(&t);
-		return (double)t.QuadPart/f;
-	}
+  LARGE_INTEGER t;
+
+  if (f < 0.0) f = getFreq();
+  if (f == 0.0) return (double)GetTickCount() / 1000.;
+  else {
+    QueryPerformanceCounter(&t);
+    return (double)t.QuadPart / f;
+  }
 }
 
 #else
@@ -56,11 +57,12 @@
 #include <stdlib.h>
 #include <sys/time.h>
 
-double gettime(void)
+double getTime(void)
 {
-	struct timeval tv;
-	if(gettimeofday(&tv, NULL)<0) return 0.0;
-	else return (double)tv.tv_sec+((double)tv.tv_usec/1000000.);
+  struct timeval tv;
+
+  if (gettimeofday(&tv, NULL) < 0) return 0.0;
+  else return (double)tv.tv_sec + ((double)tv.tv_usec / 1000000.);
 }
 
 #endif
diff --git a/tjutil.h b/tjutil.h
index bdad348..4f7738c 100644
--- a/tjutil.h
+++ b/tjutil.h
@@ -27,21 +27,21 @@
  */
 
 #ifdef _WIN32
-	#ifndef __MINGW32__
-		#include <stdio.h>
-		#define snprintf(str, n, format, ...)  \
-			_snprintf_s(str, n, _TRUNCATE, format, __VA_ARGS__)
-	#endif
-	#define strcasecmp stricmp
-	#define strncasecmp strnicmp
+#ifndef __MINGW32__
+#include <stdio.h>
+#define snprintf(str, n, format, ...) \
+  _snprintf_s(str, n, _TRUNCATE, format, __VA_ARGS__)
+#endif
+#define strcasecmp stricmp
+#define strncasecmp strnicmp
 #endif
 
 #ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
 #endif
 
 #ifndef max
- #define max(a,b) ((a)>(b)?(a):(b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
 #endif
 
-extern double gettime(void);
+extern double getTime(void);
diff --git a/transupp.c b/transupp.c
index b51ef39..d86e95c 100644
--- a/transupp.c
+++ b/transupp.c
@@ -89,10 +89,10 @@
 
 
 LOCAL(void)
-do_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-         jvirt_barray_ptr *src_coef_arrays,
-         jvirt_barray_ptr *dst_coef_arrays)
+do_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        jvirt_barray_ptr *dst_coef_arrays)
 /* Crop.  This is only used when no rotate/flip is requested with the crop. */
 {
   JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
@@ -110,12 +110,12 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       src_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci],
          dst_blk_y + y_crop_blocks,
-         (JDIMENSION) compptr->v_samp_factor, FALSE);
+         (JDIMENSION)compptr->v_samp_factor, FALSE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
                         dst_buffer[offset_y],
@@ -127,9 +127,8 @@
 
 
 LOCAL(void)
-do_flip_h_no_crop (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-                   JDIMENSION x_crop_offset,
-                   jvirt_barray_ptr *src_coef_arrays)
+do_flip_h_no_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                  JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays)
 /* Horizontal flip; done in-place, so no separate dest array is required.
  * NB: this only works when y_crop_offset is zero.
  */
@@ -147,7 +146,7 @@
    * Partial iMCUs at the right edge are left untouched.
    */
   MCU_cols = srcinfo->output_width /
-    (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -156,8 +155,8 @@
     for (blk_y = 0; blk_y < compptr->height_in_blocks;
          blk_y += compptr->v_samp_factor) {
       buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, src_coef_arrays[ci], blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         /* Do the mirroring */
         for (blk_x = 0; blk_x * 2 < comp_width; blk_x++) {
@@ -183,8 +182,7 @@
            */
           for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
             jcopy_block_row(buffer[offset_y] + blk_x + x_crop_blocks,
-                            buffer[offset_y] + blk_x,
-                            (JDIMENSION) 1);
+                            buffer[offset_y] + blk_x, (JDIMENSION)1);
           }
         }
       }
@@ -194,10 +192,10 @@
 
 
 LOCAL(void)
-do_flip_h (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-           jvirt_barray_ptr *src_coef_arrays,
-           jvirt_barray_ptr *dst_coef_arrays)
+do_flip_h(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
 /* Horizontal flip in general cropping case */
 {
   JDIMENSION MCU_cols, comp_width, dst_blk_x, dst_blk_y;
@@ -213,7 +211,7 @@
    * this is essentially the same as the routine above.
    */
   MCU_cols = srcinfo->output_width /
-    (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -223,16 +221,17 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       src_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci],
          dst_blk_y + y_crop_blocks,
-         (JDIMENSION) compptr->v_samp_factor, FALSE);
+         (JDIMENSION)compptr->v_samp_factor, FALSE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         dst_row_ptr = dst_buffer[offset_y];
         src_row_ptr = src_buffer[offset_y];
-        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+        for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+             dst_blk_x++) {
           if (x_crop_blocks + dst_blk_x < comp_width) {
             /* Do the mirrorable blocks */
             dst_ptr = dst_row_ptr[dst_blk_x];
@@ -245,8 +244,7 @@
           } else {
             /* Copy last partial block(s) verbatim */
             jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-                            dst_row_ptr + dst_blk_x,
-                            (JDIMENSION) 1);
+                            dst_row_ptr + dst_blk_x, (JDIMENSION)1);
           }
         }
       }
@@ -256,10 +254,10 @@
 
 
 LOCAL(void)
-do_flip_v (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-           jvirt_barray_ptr *src_coef_arrays,
-           jvirt_barray_ptr *dst_coef_arrays)
+do_flip_v(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
 /* Vertical flip */
 {
   JDIMENSION MCU_rows, comp_height, dst_blk_x, dst_blk_y;
@@ -278,7 +276,7 @@
    * Partial iMCUs at the bottom edge are copied verbatim.
    */
   MCU_rows = srcinfo->output_height /
-    (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -288,21 +286,21 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       if (y_crop_blocks + dst_blk_y < comp_height) {
         /* Row is within the mirrorable area. */
         src_buffer = (*srcinfo->mem->access_virt_barray)
-          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
            comp_height - y_crop_blocks - dst_blk_y -
-           (JDIMENSION) compptr->v_samp_factor,
-           (JDIMENSION) compptr->v_samp_factor, FALSE);
+           (JDIMENSION)compptr->v_samp_factor,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
       } else {
         /* Bottom-edge blocks will be copied verbatim. */
         src_buffer = (*srcinfo->mem->access_virt_barray)
-          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
            dst_blk_y + y_crop_blocks,
-           (JDIMENSION) compptr->v_samp_factor, FALSE);
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
       }
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         if (y_crop_blocks + dst_blk_y < comp_height) {
@@ -336,10 +334,10 @@
 
 
 LOCAL(void)
-do_transpose (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-              JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-              jvirt_barray_ptr *src_coef_arrays,
-              jvirt_barray_ptr *dst_coef_arrays)
+do_transpose(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+             JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+             jvirt_barray_ptr *src_coef_arrays,
+             jvirt_barray_ptr *dst_coef_arrays)
 /* Transpose source into destination */
 {
   JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
@@ -360,21 +358,22 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
              dst_blk_x += compptr->h_samp_factor) {
           src_buffer = (*srcinfo->mem->access_virt_barray)
-            ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+            ((j_common_ptr)srcinfo, src_coef_arrays[ci],
              dst_blk_x + x_crop_blocks,
-             (JDIMENSION) compptr->h_samp_factor, FALSE);
+             (JDIMENSION)compptr->h_samp_factor, FALSE);
           for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
             dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
-            src_ptr = src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
+            src_ptr =
+              src_buffer[offset_x][dst_blk_y + offset_y + y_crop_blocks];
             for (i = 0; i < DCTSIZE; i++)
               for (j = 0; j < DCTSIZE; j++)
-                dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
           }
         }
       }
@@ -384,10 +383,10 @@
 
 
 LOCAL(void)
-do_rot_90 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-           jvirt_barray_ptr *src_coef_arrays,
-           jvirt_barray_ptr *dst_coef_arrays)
+do_rot_90(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+          JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+          jvirt_barray_ptr *src_coef_arrays,
+          jvirt_barray_ptr *dst_coef_arrays)
 /* 90 degree rotation is equivalent to
  *   1. Transposing the image;
  *   2. Horizontal mirroring.
@@ -406,7 +405,7 @@
    * not mirrored.
    */
   MCU_cols = srcinfo->output_height /
-    (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -416,24 +415,24 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
              dst_blk_x += compptr->h_samp_factor) {
           if (x_crop_blocks + dst_blk_x < comp_width) {
             /* Block is within the mirrorable area. */
             src_buffer = (*srcinfo->mem->access_virt_barray)
-              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
                comp_width - x_crop_blocks - dst_blk_x -
-               (JDIMENSION) compptr->h_samp_factor,
-               (JDIMENSION) compptr->h_samp_factor, FALSE);
+               (JDIMENSION)compptr->h_samp_factor,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
           } else {
             /* Edge blocks are transposed but not mirrored. */
             src_buffer = (*srcinfo->mem->access_virt_barray)
-              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
                dst_blk_x + x_crop_blocks,
-               (JDIMENSION) compptr->h_samp_factor, FALSE);
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
           }
           for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
             dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
@@ -443,10 +442,10 @@
                 [dst_blk_y + offset_y + y_crop_blocks];
               for (i = 0; i < DCTSIZE; i++) {
                 for (j = 0; j < DCTSIZE; j++)
-                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                 i++;
                 for (j = 0; j < DCTSIZE; j++)
-                  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
               }
             } else {
               /* Edge blocks are transposed but not mirrored. */
@@ -454,7 +453,7 @@
                 [dst_blk_y + offset_y + y_crop_blocks];
               for (i = 0; i < DCTSIZE; i++)
                 for (j = 0; j < DCTSIZE; j++)
-                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
             }
           }
         }
@@ -465,10 +464,10 @@
 
 
 LOCAL(void)
-do_rot_270 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-            JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-            jvirt_barray_ptr *src_coef_arrays,
-            jvirt_barray_ptr *dst_coef_arrays)
+do_rot_270(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
 /* 270 degree rotation is equivalent to
  *   1. Horizontal mirroring;
  *   2. Transposing the image.
@@ -487,7 +486,7 @@
    * not mirrored.
    */
   MCU_rows = srcinfo->output_width /
-    (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -497,15 +496,15 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
              dst_blk_x += compptr->h_samp_factor) {
           src_buffer = (*srcinfo->mem->access_virt_barray)
-            ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+            ((j_common_ptr)srcinfo, src_coef_arrays[ci],
              dst_blk_x + x_crop_blocks,
-             (JDIMENSION) compptr->h_samp_factor, FALSE);
+             (JDIMENSION)compptr->h_samp_factor, FALSE);
           for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
             dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
             if (y_crop_blocks + dst_blk_y < comp_height) {
@@ -514,9 +513,9 @@
                 [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
               for (i = 0; i < DCTSIZE; i++) {
                 for (j = 0; j < DCTSIZE; j++) {
-                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                   j++;
-                  dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
                 }
               }
             } else {
@@ -525,7 +524,7 @@
                 [dst_blk_y + offset_y + y_crop_blocks];
               for (i = 0; i < DCTSIZE; i++)
                 for (j = 0; j < DCTSIZE; j++)
-                  dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                  dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
             }
           }
         }
@@ -536,10 +535,10 @@
 
 
 LOCAL(void)
-do_rot_180 (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-            JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-            jvirt_barray_ptr *src_coef_arrays,
-            jvirt_barray_ptr *dst_coef_arrays)
+do_rot_180(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           jvirt_barray_ptr *dst_coef_arrays)
 /* 180 degree rotation is equivalent to
  *   1. Vertical mirroring;
  *   2. Horizontal mirroring.
@@ -555,9 +554,9 @@
   jpeg_component_info *compptr;
 
   MCU_cols = srcinfo->output_width /
-    (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
   MCU_rows = srcinfo->output_height /
-    (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -568,32 +567,34 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       if (y_crop_blocks + dst_blk_y < comp_height) {
         /* Row is within the vertically mirrorable area. */
         src_buffer = (*srcinfo->mem->access_virt_barray)
-          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
            comp_height - y_crop_blocks - dst_blk_y -
-           (JDIMENSION) compptr->v_samp_factor,
-           (JDIMENSION) compptr->v_samp_factor, FALSE);
+           (JDIMENSION)compptr->v_samp_factor,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
       } else {
         /* Bottom-edge rows are only mirrored horizontally. */
         src_buffer = (*srcinfo->mem->access_virt_barray)
-          ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
            dst_blk_y + y_crop_blocks,
-           (JDIMENSION) compptr->v_samp_factor, FALSE);
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
       }
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         dst_row_ptr = dst_buffer[offset_y];
         if (y_crop_blocks + dst_blk_y < comp_height) {
           /* Row is within the mirrorable area. */
           src_row_ptr = src_buffer[compptr->v_samp_factor - offset_y - 1];
-          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
             dst_ptr = dst_row_ptr[dst_blk_x];
             if (x_crop_blocks + dst_blk_x < comp_width) {
               /* Process the blocks that can be mirrored both ways. */
-              src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              src_ptr =
+                src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
               for (i = 0; i < DCTSIZE; i += 2) {
                 /* For even row, negate every odd column. */
                 for (j = 0; j < DCTSIZE; j += 2) {
@@ -620,11 +621,13 @@
         } else {
           /* Remaining rows are just mirrored horizontally. */
           src_row_ptr = src_buffer[offset_y];
-          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+          for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
+               dst_blk_x++) {
             if (x_crop_blocks + dst_blk_x < comp_width) {
               /* Process the blocks that can be mirrored. */
               dst_ptr = dst_row_ptr[dst_blk_x];
-              src_ptr = src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
+              src_ptr =
+                src_row_ptr[comp_width - x_crop_blocks - dst_blk_x - 1];
               for (i = 0; i < DCTSIZE2; i += 2) {
                 *dst_ptr++ = *src_ptr++;
                 *dst_ptr++ = - *src_ptr++;
@@ -632,8 +635,7 @@
             } else {
               /* Any remaining right-edge blocks are only copied. */
               jcopy_block_row(src_row_ptr + dst_blk_x + x_crop_blocks,
-                              dst_row_ptr + dst_blk_x,
-                              (JDIMENSION) 1);
+                              dst_row_ptr + dst_blk_x, (JDIMENSION)1);
             }
           }
         }
@@ -644,10 +646,10 @@
 
 
 LOCAL(void)
-do_transverse (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-               JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
-               jvirt_barray_ptr *src_coef_arrays,
-               jvirt_barray_ptr *dst_coef_arrays)
+do_transverse(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+              JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+              jvirt_barray_ptr *src_coef_arrays,
+              jvirt_barray_ptr *dst_coef_arrays)
 /* Transverse transpose is equivalent to
  *   1. 180 degree rotation;
  *   2. Transposition;
@@ -666,9 +668,9 @@
   jpeg_component_info *compptr;
 
   MCU_cols = srcinfo->output_height /
-    (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
   MCU_rows = srcinfo->output_width /
-    (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
 
   for (ci = 0; ci < dstinfo->num_components; ci++) {
     compptr = dstinfo->comp_info + ci;
@@ -679,23 +681,23 @@
     for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
          dst_blk_y += compptr->v_samp_factor) {
       dst_buffer = (*srcinfo->mem->access_virt_barray)
-        ((j_common_ptr) srcinfo, dst_coef_arrays[ci], dst_blk_y,
-         (JDIMENSION) compptr->v_samp_factor, TRUE);
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
       for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
         for (dst_blk_x = 0; dst_blk_x < compptr->width_in_blocks;
              dst_blk_x += compptr->h_samp_factor) {
           if (x_crop_blocks + dst_blk_x < comp_width) {
             /* Block is within the mirrorable area. */
             src_buffer = (*srcinfo->mem->access_virt_barray)
-              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
                comp_width - x_crop_blocks - dst_blk_x -
-               (JDIMENSION) compptr->h_samp_factor,
-               (JDIMENSION) compptr->h_samp_factor, FALSE);
+               (JDIMENSION)compptr->h_samp_factor,
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
           } else {
             src_buffer = (*srcinfo->mem->access_virt_barray)
-              ((j_common_ptr) srcinfo, src_coef_arrays[ci],
+              ((j_common_ptr)srcinfo, src_coef_arrays[ci],
                dst_blk_x + x_crop_blocks,
-               (JDIMENSION) compptr->h_samp_factor, FALSE);
+               (JDIMENSION)compptr->h_samp_factor, FALSE);
           }
           for (offset_x = 0; offset_x < compptr->h_samp_factor; offset_x++) {
             dst_ptr = dst_buffer[offset_y][dst_blk_x + offset_x];
@@ -706,15 +708,15 @@
                   [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
                 for (i = 0; i < DCTSIZE; i++) {
                   for (j = 0; j < DCTSIZE; j++) {
-                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                     j++;
-                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
                   }
                   i++;
                   for (j = 0; j < DCTSIZE; j++) {
-                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
                     j++;
-                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                   }
                 }
               } else {
@@ -723,9 +725,9 @@
                   [comp_height - y_crop_blocks - dst_blk_y - offset_y - 1];
                 for (i = 0; i < DCTSIZE; i++) {
                   for (j = 0; j < DCTSIZE; j++) {
-                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                     j++;
-                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
                   }
                 }
               }
@@ -736,10 +738,10 @@
                   [dst_blk_y + offset_y + y_crop_blocks];
                 for (i = 0; i < DCTSIZE; i++) {
                   for (j = 0; j < DCTSIZE; j++)
-                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
                   i++;
                   for (j = 0; j < DCTSIZE; j++)
-                    dst_ptr[j*DCTSIZE+i] = -src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = -src_ptr[i * DCTSIZE + j];
                 }
               } else {
                 /* At lower right corner, just transpose, no mirroring */
@@ -747,7 +749,7 @@
                   [dst_blk_y + offset_y + y_crop_blocks];
                 for (i = 0; i < DCTSIZE; i++)
                   for (j = 0; j < DCTSIZE; j++)
-                    dst_ptr[j*DCTSIZE+i] = src_ptr[i*DCTSIZE+j];
+                    dst_ptr[j * DCTSIZE + i] = src_ptr[i * DCTSIZE + j];
               }
             }
           }
@@ -764,13 +766,13 @@
  */
 
 LOCAL(boolean)
-jt_read_integer (const char **strptr, JDIMENSION *result)
+jt_read_integer(const char **strptr, JDIMENSION *result)
 {
   const char *ptr = *strptr;
   JDIMENSION val = 0;
 
   for (; isdigit(*ptr); ptr++) {
-    val = val * 10 + (JDIMENSION) (*ptr - '0');
+    val = val * 10 + (JDIMENSION)(*ptr - '0');
   }
   *result = val;
   if (ptr == *strptr)
@@ -794,7 +796,7 @@
  */
 
 GLOBAL(boolean)
-jtransform_parse_crop_spec (jpeg_transform_info *info, const char *spec)
+jtransform_parse_crop_spec(jpeg_transform_info *info, const char *spec)
 {
   info->crop = FALSE;
   info->crop_width_set = JCROP_UNSET;
@@ -804,7 +806,7 @@
 
   if (isdigit(*spec)) {
     /* fetch width */
-    if (! jt_read_integer(&spec, &info->crop_width))
+    if (!jt_read_integer(&spec, &info->crop_width))
       return FALSE;
     if (*spec == 'f' || *spec == 'F') {
       spec++;
@@ -815,7 +817,7 @@
   if (*spec == 'x' || *spec == 'X') {
     /* fetch height */
     spec++;
-    if (! jt_read_integer(&spec, &info->crop_height))
+    if (!jt_read_integer(&spec, &info->crop_height))
       return FALSE;
     if (*spec == 'f' || *spec == 'F') {
       spec++;
@@ -827,14 +829,14 @@
     /* fetch xoffset */
     info->crop_xoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
     spec++;
-    if (! jt_read_integer(&spec, &info->crop_xoffset))
+    if (!jt_read_integer(&spec, &info->crop_xoffset))
       return FALSE;
   }
   if (*spec == '+' || *spec == '-') {
     /* fetch yoffset */
     info->crop_yoffset_set = (*spec == '-') ? JCROP_NEG : JCROP_POS;
     spec++;
-    if (! jt_read_integer(&spec, &info->crop_yoffset))
+    if (!jt_read_integer(&spec, &info->crop_yoffset))
       return FALSE;
   }
   /* We had better have gotten to the end of the string. */
@@ -848,7 +850,7 @@
 /* Trim off any partial iMCUs on the indicated destination edge */
 
 LOCAL(void)
-trim_right_edge (jpeg_transform_info *info, JDIMENSION full_width)
+trim_right_edge(jpeg_transform_info *info, JDIMENSION full_width)
 {
   JDIMENSION MCU_cols;
 
@@ -859,7 +861,7 @@
 }
 
 LOCAL(void)
-trim_bottom_edge (jpeg_transform_info *info, JDIMENSION full_height)
+trim_bottom_edge(jpeg_transform_info *info, JDIMENSION full_height)
 {
   JDIMENSION MCU_rows;
 
@@ -888,8 +890,8 @@
  */
 
 GLOBAL(boolean)
-jtransform_request_workspace (j_decompress_ptr srcinfo,
-                              jpeg_transform_info *info)
+jtransform_request_workspace(j_decompress_ptr srcinfo,
+                             jpeg_transform_info *info)
 {
   jvirt_barray_ptr *coef_arrays;
   boolean need_workspace, transpose_it;
@@ -1093,14 +1095,12 @@
    */
   if (need_workspace) {
     coef_arrays = (jvirt_barray_ptr *)
-      (*srcinfo->mem->alloc_small) ((j_common_ptr) srcinfo, JPOOL_IMAGE,
+      (*srcinfo->mem->alloc_small) ((j_common_ptr)srcinfo, JPOOL_IMAGE,
                 sizeof(jvirt_barray_ptr) * info->num_components);
     width_in_iMCUs = (JDIMENSION)
-      jdiv_round_up((long) info->output_width,
-                    (long) info->iMCU_sample_width);
+      jdiv_round_up((long)info->output_width, (long)info->iMCU_sample_width);
     height_in_iMCUs = (JDIMENSION)
-      jdiv_round_up((long) info->output_height,
-                    (long) info->iMCU_sample_height);
+      jdiv_round_up((long)info->output_height, (long)info->iMCU_sample_height);
     for (ci = 0; ci < info->num_components; ci++) {
       compptr = srcinfo->comp_info + ci;
       if (info->num_components == 1) {
@@ -1116,8 +1116,8 @@
       width_in_blocks = width_in_iMCUs * h_samp_factor;
       height_in_blocks = height_in_iMCUs * v_samp_factor;
       coef_arrays[ci] = (*srcinfo->mem->request_virt_barray)
-        ((j_common_ptr) srcinfo, JPOOL_IMAGE, FALSE,
-         width_in_blocks, height_in_blocks, (JDIMENSION) v_samp_factor);
+        ((j_common_ptr)srcinfo, JPOOL_IMAGE, FALSE,
+         width_in_blocks, height_in_blocks, (JDIMENSION)v_samp_factor);
     }
     info->workspace_coef_arrays = coef_arrays;
   } else
@@ -1130,7 +1130,7 @@
 /* Transpose destination image parameters */
 
 LOCAL(void)
-transpose_critical_parameters (j_compress_ptr dstinfo)
+transpose_critical_parameters(j_compress_ptr dstinfo)
 {
   int tblno, i, j, ci, itemp;
   jpeg_component_info *compptr;
@@ -1162,9 +1162,10 @@
     if (qtblptr != NULL) {
       for (i = 0; i < DCTSIZE; i++) {
         for (j = 0; j < i; j++) {
-          qtemp = qtblptr->quantval[i*DCTSIZE+j];
-          qtblptr->quantval[i*DCTSIZE+j] = qtblptr->quantval[j*DCTSIZE+i];
-          qtblptr->quantval[j*DCTSIZE+i] = qtemp;
+          qtemp = qtblptr->quantval[i * DCTSIZE + j];
+          qtblptr->quantval[i * DCTSIZE + j] =
+            qtblptr->quantval[j * DCTSIZE + i];
+          qtblptr->quantval[j * DCTSIZE + i] = qtemp;
         }
       }
     }
@@ -1178,8 +1179,8 @@
  */
 
 LOCAL(void)
-adjust_exif_parameters (JOCTET *data, unsigned int length,
-                        JDIMENSION new_width, JDIMENSION new_height)
+adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
+                       JDIMENSION new_height)
 {
   boolean is_motorola; /* Flag for byte order */
   unsigned int number_of_tags, tagnum;
@@ -1225,9 +1226,9 @@
   if (is_motorola) {
     number_of_tags = GETJOCTET(data[firstoffset]);
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset+1]);
+    number_of_tags += GETJOCTET(data[firstoffset + 1]);
   } else {
-    number_of_tags = GETJOCTET(data[firstoffset+1]);
+    number_of_tags = GETJOCTET(data[firstoffset + 1]);
     number_of_tags <<= 8;
     number_of_tags += GETJOCTET(data[firstoffset]);
   }
@@ -1241,9 +1242,9 @@
     if (is_motorola) {
       tagnum = GETJOCTET(data[firstoffset]);
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset+1]);
+      tagnum += GETJOCTET(data[firstoffset + 1]);
     } else {
-      tagnum = GETJOCTET(data[firstoffset+1]);
+      tagnum = GETJOCTET(data[firstoffset + 1]);
       tagnum <<= 8;
       tagnum += GETJOCTET(data[firstoffset]);
     }
@@ -1254,17 +1255,17 @@
 
   /* Get the ExifSubIFD offset */
   if (is_motorola) {
-    if (GETJOCTET(data[firstoffset+8]) != 0) return;
-    if (GETJOCTET(data[firstoffset+9]) != 0) return;
-    offset = GETJOCTET(data[firstoffset+10]);
+    if (GETJOCTET(data[firstoffset + 8]) != 0) return;
+    if (GETJOCTET(data[firstoffset + 9]) != 0) return;
+    offset = GETJOCTET(data[firstoffset + 10]);
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset+11]);
+    offset += GETJOCTET(data[firstoffset + 11]);
   } else {
-    if (GETJOCTET(data[firstoffset+11]) != 0) return;
-    if (GETJOCTET(data[firstoffset+10]) != 0) return;
-    offset = GETJOCTET(data[firstoffset+9]);
+    if (GETJOCTET(data[firstoffset + 11]) != 0) return;
+    if (GETJOCTET(data[firstoffset + 10]) != 0) return;
+    offset = GETJOCTET(data[firstoffset + 9]);
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset+8]);
+    offset += GETJOCTET(data[firstoffset + 8]);
   }
   if (offset > length - 2) return; /* check end of data segment */
 
@@ -1272,9 +1273,9 @@
   if (is_motorola) {
     number_of_tags = GETJOCTET(data[offset]);
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset+1]);
+    number_of_tags += GETJOCTET(data[offset + 1]);
   } else {
-    number_of_tags = GETJOCTET(data[offset+1]);
+    number_of_tags = GETJOCTET(data[offset + 1]);
     number_of_tags <<= 8;
     number_of_tags += GETJOCTET(data[offset]);
   }
@@ -1288,9 +1289,9 @@
     if (is_motorola) {
       tagnum = GETJOCTET(data[offset]);
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset+1]);
+      tagnum += GETJOCTET(data[offset + 1]);
     } else {
-      tagnum = GETJOCTET(data[offset+1]);
+      tagnum = GETJOCTET(data[offset + 1]);
       tagnum <<= 8;
       tagnum += GETJOCTET(data[offset]);
     }
@@ -1300,27 +1301,27 @@
       else
         new_value = new_height; /* ExifImageHeight Tag */
       if (is_motorola) {
-        data[offset+2] = 0; /* Format = unsigned long (4 octets) */
-        data[offset+3] = 4;
-        data[offset+4] = 0; /* Number Of Components = 1 */
-        data[offset+5] = 0;
-        data[offset+6] = 0;
-        data[offset+7] = 1;
-        data[offset+8] = 0;
-        data[offset+9] = 0;
-        data[offset+10] = (JOCTET)((new_value >> 8) & 0xFF);
-        data[offset+11] = (JOCTET)(new_value & 0xFF);
+        data[offset + 2] = 0; /* Format = unsigned long (4 octets) */
+        data[offset + 3] = 4;
+        data[offset + 4] = 0; /* Number Of Components = 1 */
+        data[offset + 5] = 0;
+        data[offset + 6] = 0;
+        data[offset + 7] = 1;
+        data[offset + 8] = 0;
+        data[offset + 9] = 0;
+        data[offset + 10] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset + 11] = (JOCTET)(new_value & 0xFF);
       } else {
-        data[offset+2] = 4; /* Format = unsigned long (4 octets) */
-        data[offset+3] = 0;
-        data[offset+4] = 1; /* Number Of Components = 1 */
-        data[offset+5] = 0;
-        data[offset+6] = 0;
-        data[offset+7] = 0;
-        data[offset+8] = (JOCTET)(new_value & 0xFF);
-        data[offset+9] = (JOCTET)((new_value >> 8) & 0xFF);
-        data[offset+10] = 0;
-        data[offset+11] = 0;
+        data[offset + 2] = 4; /* Format = unsigned long (4 octets) */
+        data[offset + 3] = 0;
+        data[offset + 4] = 1; /* Number Of Components = 1 */
+        data[offset + 5] = 0;
+        data[offset + 6] = 0;
+        data[offset + 7] = 0;
+        data[offset + 8] = (JOCTET)(new_value & 0xFF);
+        data[offset + 9] = (JOCTET)((new_value >> 8) & 0xFF);
+        data[offset + 10] = 0;
+        data[offset + 11] = 0;
       }
     }
     offset += 12;
@@ -1340,10 +1341,9 @@
  */
 
 GLOBAL(jvirt_barray_ptr *)
-jtransform_adjust_parameters (j_decompress_ptr srcinfo,
-                              j_compress_ptr dstinfo,
-                              jvirt_barray_ptr *src_coef_arrays,
-                              jpeg_transform_info *info)
+jtransform_adjust_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                             jvirt_barray_ptr *src_coef_arrays,
+                             jpeg_transform_info *info)
 {
   /* If force-to-grayscale is requested, adjust destination parameters */
   if (info->force_grayscale) {
@@ -1409,7 +1409,7 @@
 
   /* Adjust Exif properties */
   if (srcinfo->marker_list != NULL &&
-      srcinfo->marker_list->marker == JPEG_APP0+1 &&
+      srcinfo->marker_list->marker == JPEG_APP0 + 1 &&
       srcinfo->marker_list->data_length >= 6 &&
       GETJOCTET(srcinfo->marker_list->data[0]) == 0x45 &&
       GETJOCTET(srcinfo->marker_list->data[1]) == 0x78 &&
@@ -1425,15 +1425,15 @@
         dstinfo->jpeg_height != srcinfo->image_height)
       /* Align data segment to start of TIFF structure for parsing */
       adjust_exif_parameters(srcinfo->marker_list->data + 6,
-        srcinfo->marker_list->data_length - 6,
-        dstinfo->jpeg_width, dstinfo->jpeg_height);
+                             srcinfo->marker_list->data_length - 6,
+                             dstinfo->jpeg_width, dstinfo->jpeg_height);
 #else
     if (dstinfo->image_width != srcinfo->image_width ||
         dstinfo->image_height != srcinfo->image_height)
       /* Align data segment to start of TIFF structure for parsing */
       adjust_exif_parameters(srcinfo->marker_list->data + 6,
-        srcinfo->marker_list->data_length - 6,
-        dstinfo->image_width, dstinfo->image_height);
+                             srcinfo->marker_list->data_length - 6,
+                             dstinfo->image_width, dstinfo->image_height);
 #endif
   }
 
@@ -1454,10 +1454,9 @@
  */
 
 GLOBAL(void)
-jtransform_execute_transform (j_decompress_ptr srcinfo,
-                              j_compress_ptr dstinfo,
-                              jvirt_barray_ptr *src_coef_arrays,
-                              jpeg_transform_info *info)
+jtransform_execute_transform(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                             jvirt_barray_ptr *src_coef_arrays,
+                             jpeg_transform_info *info)
 {
   jvirt_barray_ptr *dst_coef_arrays = info->workspace_coef_arrays;
 
@@ -1536,19 +1535,19 @@
   switch (transform) {
   case JXFORM_FLIP_H:
   case JXFORM_ROT_270:
-    if (image_width % (JDIMENSION) MCU_width)
+    if (image_width % (JDIMENSION)MCU_width)
       result = FALSE;
     break;
   case JXFORM_FLIP_V:
   case JXFORM_ROT_90:
-    if (image_height % (JDIMENSION) MCU_height)
+    if (image_height % (JDIMENSION)MCU_height)
       result = FALSE;
     break;
   case JXFORM_TRANSVERSE:
   case JXFORM_ROT_180:
-    if (image_width % (JDIMENSION) MCU_width)
+    if (image_width % (JDIMENSION)MCU_width)
       result = FALSE;
-    if (image_height % (JDIMENSION) MCU_height)
+    if (image_height % (JDIMENSION)MCU_height)
       result = FALSE;
     break;
   default:
@@ -1566,7 +1565,7 @@
  */
 
 GLOBAL(void)
-jcopy_markers_setup (j_decompress_ptr srcinfo, JCOPY_OPTION option)
+jcopy_markers_setup(j_decompress_ptr srcinfo, JCOPY_OPTION option)
 {
 #ifdef SAVE_MARKERS_SUPPORTED
   int m;
@@ -1576,9 +1575,12 @@
     jpeg_save_markers(srcinfo, JPEG_COM, 0xFFFF);
   }
   /* Save all types of APPn markers iff ALL option */
-  if (option == JCOPYOPT_ALL) {
-    for (m = 0; m < 16; m++)
+  if (option == JCOPYOPT_ALL || option == JCOPYOPT_ALL_EXCEPT_ICC) {
+    for (m = 0; m < 16; m++) {
+      if (option == JCOPYOPT_ALL_EXCEPT_ICC && m == 2)
+        continue;
       jpeg_save_markers(srcinfo, JPEG_APP0 + m, 0xFFFF);
+    }
   }
 #endif /* SAVE_MARKERS_SUPPORTED */
 }
@@ -1591,8 +1593,8 @@
  */
 
 GLOBAL(void)
-jcopy_markers_execute (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-                       JCOPY_OPTION option)
+jcopy_markers_execute(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                      JCOPY_OPTION option)
 {
   jpeg_saved_marker_ptr marker;
 
@@ -1612,7 +1614,7 @@
         GETJOCTET(marker->data[4]) == 0)
       continue;                 /* reject duplicate JFIF */
     if (dstinfo->write_Adobe_marker &&
-        marker->marker == JPEG_APP0+14 &&
+        marker->marker == JPEG_APP0 + 14 &&
         marker->data_length >= 5 &&
         GETJOCTET(marker->data[0]) == 0x41 &&
         GETJOCTET(marker->data[1]) == 0x64 &&
diff --git a/transupp.h b/transupp.h
index bf3118a..945ec5d 100644
--- a/transupp.h
+++ b/transupp.h
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -156,25 +156,27 @@
 #if TRANSFORMS_SUPPORTED
 
 /* Parse a crop specification (written in X11 geometry style) */
-EXTERN(boolean) jtransform_parse_crop_spec
-        (jpeg_transform_info *info, const char *spec);
+EXTERN(boolean) jtransform_parse_crop_spec(jpeg_transform_info *info,
+                                           const char *spec);
 /* Request any required workspace */
-EXTERN(boolean) jtransform_request_workspace
-        (j_decompress_ptr srcinfo, jpeg_transform_info *info);
+EXTERN(boolean) jtransform_request_workspace(j_decompress_ptr srcinfo,
+                                             jpeg_transform_info *info);
 /* Adjust output image parameters */
 EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
-        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
+  (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+   jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
 /* Execute the actual transformation, if any */
-EXTERN(void) jtransform_execute_transform
-        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         jvirt_barray_ptr *src_coef_arrays, jpeg_transform_info *info);
+EXTERN(void) jtransform_execute_transform(j_decompress_ptr srcinfo,
+                                          j_compress_ptr dstinfo,
+                                          jvirt_barray_ptr *src_coef_arrays,
+                                          jpeg_transform_info *info);
 /* Determine whether lossless transformation is perfectly
  * possible for a specified image and transformation.
  */
-EXTERN(boolean) jtransform_perfect_transform
-        (JDIMENSION image_width, JDIMENSION image_height, int MCU_width,
-         int MCU_height, JXFORM_CODE transform);
+EXTERN(boolean) jtransform_perfect_transform(JDIMENSION image_width,
+                                             JDIMENSION image_height,
+                                             int MCU_width, int MCU_height,
+                                             JXFORM_CODE transform);
 
 /* jtransform_execute_transform used to be called
  * jtransform_execute_transformation, but some compilers complain about
@@ -193,15 +195,16 @@
 typedef enum {
   JCOPYOPT_NONE,          /* copy no optional markers */
   JCOPYOPT_COMMENTS,      /* copy only comment (COM) markers */
-  JCOPYOPT_ALL            /* copy all optional markers */
+  JCOPYOPT_ALL,           /* copy all optional markers */
+  JCOPYOPT_ALL_EXCEPT_ICC /* copy all optional markers except APP2 */
 } JCOPY_OPTION;
 
 #define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS     /* recommended default */
 
 /* Setup decompression object to save desired markers in memory */
-EXTERN(void) jcopy_markers_setup
-        (j_decompress_ptr srcinfo, JCOPY_OPTION option);
+EXTERN(void) jcopy_markers_setup(j_decompress_ptr srcinfo,
+                                 JCOPY_OPTION option);
 /* Copy markers saved in the given source object to the destination object */
-EXTERN(void) jcopy_markers_execute
-        (j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         JCOPY_OPTION option);
+EXTERN(void) jcopy_markers_execute(j_decompress_ptr srcinfo,
+                                   j_compress_ptr dstinfo,
+                                   JCOPY_OPTION option);
diff --git a/turbojpeg-jni.c b/turbojpeg-jni.c
index eaba670..55d3342 100644
--- a/turbojpeg-jni.c
+++ b/turbojpeg-jni.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2016 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2017 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,1130 +37,1151 @@
 #include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
 #include "java/org_libjpegturbo_turbojpeg_TJ.h"
 
-#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
+#define PAD(v, p) (((v) + (p) - 1) & (~((p) - 1)))  /* p: power of 2 */
 
-#define _throw(msg, exceptionClass) {  \
-	jclass _exccls=(*env)->FindClass(env, exceptionClass);  \
-	if(!_exccls || (*env)->ExceptionCheck(env)) goto bailout;  \
-	(*env)->ThrowNew(env, _exccls, msg);  \
-	goto bailout;  \
+#define bailif0(f) { \
+  if (!(f) || (*env)->ExceptionCheck(env)) { \
+    goto bailout; \
+  } \
 }
 
-#define _throwtj() _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException")
+#define _throw(msg, exceptionClass) { \
+  jclass _exccls = (*env)->FindClass(env, exceptionClass); \
+  \
+  bailif0(_exccls); \
+  (*env)->ThrowNew(env, _exccls, msg); \
+  goto bailout; \
+}
+
+#define _throwtj() { \
+  jclass _exccls; \
+  jmethodID _excid; \
+  jobject _excobj; \
+  jstring _errstr; \
+  \
+  bailif0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
+  bailif0(_exccls = (*env)->FindClass(env, \
+    "org/libjpegturbo/turbojpeg/TJException")); \
+  bailif0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
+                                       "(Ljava/lang/String;I)V")); \
+  bailif0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
+                                      tjGetErrorCode(handle))); \
+  (*env)->Throw(env, _excobj); \
+  goto bailout; \
+}
 
 #define _throwarg(msg) _throw(msg, "java/lang/IllegalArgumentException")
 
-#define _throwmem() _throw("Memory allocation failure", "java/lang/OutOfMemoryError");
+#define _throwmem() \
+  _throw("Memory allocation failure", "java/lang/OutOfMemoryError")
 
-#define bailif0(f) {if(!(f) || (*env)->ExceptionCheck(env)) {  \
-	goto bailout;  \
-}}
-
-#define gethandle()  \
-	jclass _cls=(*env)->GetObjectClass(env, obj);  \
-	jfieldID _fid;  \
-	if(!_cls || (*env)->ExceptionCheck(env)) goto bailout;  \
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "handle", "J"));  \
-	handle=(tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);  \
+#define gethandle() \
+  jclass _cls = (*env)->GetObjectClass(env, obj); \
+  jfieldID _fid; \
+  \
+  bailif0(_cls); \
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
+  handle = (tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);
 
 #ifdef _WIN32
 #define setenv(envvar, value, dummy) _putenv_s(envvar, value)
 #endif
 
-#define prop2env(property, envvar)  \
-{  \
-	if((jName=(*env)->NewStringUTF(env, property))!=NULL  \
-		&& (jValue=(*env)->CallStaticObjectMethod(env, cls, mid, jName))!=NULL)  \
-	{  \
-		if((value=(*env)->GetStringUTFChars(env, jValue, 0))!=NULL)  \
-		{  \
-			setenv(envvar, value, 1);  \
-			(*env)->ReleaseStringUTFChars(env, jValue, value);  \
-		}  \
-	}  \
+#define prop2env(property, envvar) { \
+  if ((jName = (*env)->NewStringUTF(env, property)) != NULL && \
+      (jValue = (*env)->CallStaticObjectMethod(env, cls, mid, \
+                                               jName)) != NULL) { \
+    if ((value = (*env)->GetStringUTFChars(env, jValue, 0)) != NULL) { \
+      setenv(envvar, value, 1); \
+      (*env)->ReleaseStringUTFChars(env, jValue, value); \
+    } \
+  } \
 }
 
 int ProcessSystemProperties(JNIEnv *env)
 {
-	jclass cls;  jmethodID mid;
-	jstring jName, jValue;
-	const char *value;
+  jclass cls;
+  jmethodID mid;
+  jstring jName, jValue;
+  const char *value;
 
-	bailif0(cls=(*env)->FindClass(env, "java/lang/System"));
-	bailif0(mid=(*env)->GetStaticMethodID(env, cls, "getProperty",
-		"(Ljava/lang/String;)Ljava/lang/String;"));
+  bailif0(cls = (*env)->FindClass(env, "java/lang/System"));
+  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
+    "(Ljava/lang/String;)Ljava/lang/String;"));
 
-	prop2env("turbojpeg.optimize", "TJ_OPTIMIZE");
-	prop2env("turbojpeg.arithmetic", "TJ_ARITHMETIC");
-	prop2env("turbojpeg.restart", "TJ_RESTART");
-	prop2env("turbojpeg.progressive", "TJ_PROGRESSIVE");
-	return 0;
+  prop2env("turbojpeg.optimize", "TJ_OPTIMIZE");
+  prop2env("turbojpeg.arithmetic", "TJ_ARITHMETIC");
+  prop2env("turbojpeg.restart", "TJ_RESTART");
+  prop2env("turbojpeg.progressive", "TJ_PROGRESSIVE");
+  return 0;
 
-	bailout:
-	return -1;
+bailout:
+  return -1;
 }
 
 /* TurboJPEG 1.2.x: TJ::bufSize() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
-	(JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
+  (JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
 {
-	jint retval=(jint)tjBufSize(width, height, jpegSubsamp);
-	if(retval==-1) _throwarg(tjGetErrorStr());
+  jint retval = (jint)tjBufSize(width, height, jpegSubsamp);
 
-	bailout:
-	return retval;
+  if (retval == -1) _throwarg(tjGetErrorStr());
+
+bailout:
+  return retval;
 }
 
 /* TurboJPEG 1.4.x: TJ::bufSizeYUV() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
-	(JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
+  (JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
 {
-	jint retval=(jint)tjBufSizeYUV2(width, pad, height, subsamp);
-	if(retval==-1) _throwarg(tjGetErrorStr());
+  jint retval = (jint)tjBufSizeYUV2(width, pad, height, subsamp);
 
-	bailout:
-	return retval;
+  if (retval == -1) _throwarg(tjGetErrorStr());
+
+bailout:
+  return retval;
 }
 
 /* TurboJPEG 1.2.x: TJ::bufSizeYUV() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
-	(JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+  (JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
 {
-	return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
-		4, height, subsamp);
+  return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
+                                                             4, height,
+                                                             subsamp);
 }
 
 /* TurboJPEG 1.4.x: TJ::planeSizeYUV() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
-	(JNIEnv *env, jclass cls, jint componentID, jint width, jint stride,
-		jint height, jint subsamp)
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint stride,
+   jint height, jint subsamp)
 {
-	jint retval=(jint)tjPlaneSizeYUV(componentID, width, stride, height,
-		subsamp);
-	if(retval==-1) _throwarg(tjGetErrorStr());
+  jint retval = (jint)tjPlaneSizeYUV(componentID, width, stride, height,
+                                     subsamp);
 
-	bailout:
-	return retval;
+  if (retval == -1) _throwarg(tjGetErrorStr());
+
+bailout:
+  return retval;
 }
 
 /* TurboJPEG 1.4.x: TJ::planeWidth() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
-	(JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
 {
-	jint retval=(jint)tjPlaneWidth(componentID, width, subsamp);
-	if(retval==-1) _throwarg(tjGetErrorStr());
+  jint retval = (jint)tjPlaneWidth(componentID, width, subsamp);
 
-	bailout:
-	return retval;
+  if (retval == -1) _throwarg(tjGetErrorStr());
+
+bailout:
+  return retval;
 }
 
 /* TurboJPEG 1.4.x: TJ::planeHeight() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
-	(JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
+  (JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
 {
-	jint retval=(jint)tjPlaneHeight(componentID, height, subsamp);
-	if(retval==-1) _throwarg(tjGetErrorStr());
+  jint retval = (jint)tjPlaneHeight(componentID, height, subsamp);
 
-	bailout:
-	return retval;
+  if (retval == -1) _throwarg(tjGetErrorStr());
+
+bailout:
+  return retval;
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
-	(JNIEnv *env, jobject obj)
+  (JNIEnv *env, jobject obj)
 {
-	jclass cls;
-	jfieldID fid;
-	tjhandle handle;
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
 
-	if((handle=tjInitCompress())==NULL)
-		_throwtj();
+  if ((handle = tjInitCompress()) == NULL)
+    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-	bailif0(cls=(*env)->GetObjectClass(env, obj));
-	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (size_t)handle);
+  bailif0(cls = (*env)->GetObjectClass(env, obj));
+  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
-	bailout:
-	return;
+bailout:
+  return;
 }
 
 static jint TJCompressor_compress
-	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
-		jint width, jint pitch, jint height, jint pf, jbyteArray dst,
-		jint jpegSubsamp, jint jpegQual, jint flags)
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jbyteArray dst,
+   jint jpegSubsamp, jint jpegQual, jint flags)
 {
-	tjhandle handle=0;
-	unsigned long jpegSize=0;
-	jsize arraySize=0, actualPitch;
-	unsigned char *srcBuf=NULL, *jpegBuf=NULL;
+  tjhandle handle = 0;
+  unsigned long jpegSize = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *srcBuf = NULL, *jpegBuf = NULL;
 
-	gethandle();
+  gethandle();
 
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
-		|| pitch<0)
-		_throwarg("Invalid argument in compress()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throwarg("Mismatch between Java and C API");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0)
+    _throwarg("Invalid argument in compress()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    _throwarg("Mismatch between Java and C API");
 
-	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
-	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
-	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throwarg("Source buffer is not large enough");
-	jpegSize=tjBufSize(width, height, jpegSubsamp);
-	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throwarg("Destination buffer is not large enough");
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    _throwarg("Source buffer is not large enough");
+  jpegSize = tjBufSize(width, height, jpegSubsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  if (ProcessSystemProperties(env) < 0) goto bailout; /* JNI calls first */
 
-	if(ProcessSystemProperties(env)<0) goto bailout;
+  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjCompress2(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]], width,
-		pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp, jpegQual,
-		flags|TJFLAG_NOREALLOC)==-1)
-		_throwtj();
+  if (tjCompress2(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
+                  width, pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp,
+                  jpegQual, flags | TJFLAG_NOREALLOC) == -1)
+    _throwtj();
 
-	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-	return (jint)jpegSize;
+bailout:
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
+  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  return (jint)jpegSize;
 }
 
 /* TurboJPEG 1.3.x: TJCompressor::compress() byte source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
-		jint pitch, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
-		jint jpegQual, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
+   jint jpegQual, jint flags)
 {
-	return TJCompressor_compress(env, obj, src, 1, x, y, width, pitch, height,
-		pf, dst, jpegSubsamp, jpegQual, flags);
+  return TJCompressor_compress(env, obj, src, 1, x, y, width, pitch, height,
+                               pf, dst, jpegSubsamp, jpegQual, flags);
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::compress() byte source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
-		jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+   jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
+   jint flags)
 {
-	return TJCompressor_compress(env, obj, src, 1, 0, 0, width, pitch, height,
-		pf, dst, jpegSubsamp, jpegQual, flags);
+  return TJCompressor_compress(env, obj, src, 1, 0, 0, width, pitch, height,
+                               pf, dst, jpegSubsamp, jpegQual, flags);
 }
 
 /* TurboJPEG 1.3.x: TJCompressor::compress() int source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII
-	(JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
-		jint stride, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
-		jint jpegQual, jint flags)
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
+   jint jpegQual, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in compress()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in compress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
 
-	return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
-		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
+  return TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
+                               stride * sizeof(jint), height, pf, dst,
+                               jpegSubsamp, jpegQual, flags);
 
-	bailout:
-	return 0;
+bailout:
+  return 0;
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::compress() int source */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII
-	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
-		jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
-		jint flags)
+  (JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+   jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
+   jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in compress()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in compress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    _throwarg("Pixel format must be 32-bit when compressing from an integer buffer.");
 
-	return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
-		stride*sizeof(jint), height, pf, dst, jpegSubsamp, jpegQual, flags);
+  return TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
+                               stride * sizeof(jint), height, pf, dst,
+                               jpegSubsamp, jpegQual, flags);
 
-	bailout:
-	return 0;
+bailout:
+  return 0;
 }
 
 /* TurboJPEG 1.4.x: TJCompressor::compressFromYUV() */
 JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII
-	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
-		jint width, jintArray jSrcStrides, jint height, jint subsamp,
-		jbyteArray dst, jint jpegQual, jint flags)
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jint width, jintArray jSrcStrides, jint height, jint subsamp,
+   jbyteArray dst, jint jpegQual, jint flags)
 {
-	tjhandle handle=0;
-	unsigned long jpegSize=0;
-	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
-	const unsigned char *srcPlanes[3];
-	unsigned char *jpegBuf=NULL;
-	int *srcOffsets=NULL, *srcStrides=NULL;
-	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
+  tjhandle handle = 0;
+  unsigned long jpegSize = 0;
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanes[3];
+  unsigned char *jpegBuf = NULL;
+  int *srcOffsets = NULL, *srcStrides = NULL;
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-	gethandle();
+  gethandle();
 
-	if(subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throwarg("Invalid argument in compressFromYUV()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throwarg("Mismatch between Java and C API");
+  if (subsamp < 0 || subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    _throwarg("Invalid argument in compressFromYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    _throwarg("Mismatch between Java and C API");
 
-	if((*env)->GetArrayLength(env, srcobjs)<nc)
-		_throwarg("Planes array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
-		_throwarg("Offsets array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
-		_throwarg("Strides array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    _throwarg("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    _throwarg("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    _throwarg("Strides array is too small for the subsampling type");
 
-	jpegSize=tjBufSize(width, height, subsamp);
-	if((*env)->GetArrayLength(env, dst)<(jsize)jpegSize)
-		_throwarg("Destination buffer is not large enough");
+  jpegSize = tjBufSize(width, height, subsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
-	for(i=0; i<nc; i++)
-	{
-		int planeSize=tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
-		int pw=tjPlaneWidth(i, width, subsamp);
+  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
 
-		if(planeSize<0 || pw<0)
-			_throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0)
+      _throwarg(tjGetErrorStr());
 
-		if(srcOffsets[i]<0)
-			_throwarg("Invalid argument in compressFromYUV()");
-		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
-			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0)
+      _throwarg("Invalid argument in compressFromYUV()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
+      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
-		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
-		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
-			_throwarg("Source plane is not large enough");
+    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
+      _throwarg("Source plane is not large enough");
 
-		bailif0(srcPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
-			0));
-		srcPlanes[i]=&srcPlanes[i][srcOffsets[i]];
-	}
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+    bailif0(srcPlanes[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
+    srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];
+  }
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(ProcessSystemProperties(env)<0) goto bailout;
+  if (ProcessSystemProperties(env) < 0) goto bailout;
 
-	if(tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
-		subsamp, &jpegBuf, &jpegSize, jpegQual, flags|TJFLAG_NOREALLOC)==-1)
-		_throwtj();
+  if (tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
+                              subsamp, &jpegBuf, &jpegSize, jpegQual,
+                              flags | TJFLAG_NOREALLOC) == -1)
+    _throwtj();
 
-	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
-	for(i=0; i<nc; i++)
-	{
-		if(srcPlanes[i] && jSrcPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-				(unsigned char *)srcPlanes[i], 0);
-	}
-	if(srcStrides)
-		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-	if(srcOffsets)
-		(*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
-	return (jint)jpegSize;
+bailout:
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, jpegBuf, 0);
+  for (i = 0; i < nc; i++) {
+    if (srcPlanes[i] && jSrcPlanes[i])
+      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+                                            (unsigned char *)srcPlanes[i], 0);
+  }
+  if (srcStrides)
+    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
+  if (srcOffsets)
+    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
+  return (jint)jpegSize;
 }
 
 static void TJCompressor_encodeYUV
-	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
-		jint width, jint pitch, jint height, jint pf, jobjectArray dstobjs,
-		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0, actualPitch;
-	jbyteArray jDstPlanes[3]={NULL, NULL, NULL};
-	unsigned char *srcBuf=NULL, *dstPlanes[3];
-	int *dstOffsets=NULL, *dstStrides=NULL;
-	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *srcBuf = NULL, *dstPlanes[3];
+  int *dstOffsets = NULL, *dstStrides = NULL;
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-	gethandle();
+  gethandle();
 
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
-		|| pitch<0 || subsamp<0 || subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throwarg("Invalid argument in encodeYUV()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
-		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throwarg("Mismatch between Java and C API");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0 || subsamp < 0 ||
+      subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    _throwarg("Invalid argument in encodeYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    _throwarg("Mismatch between Java and C API");
 
-	if((*env)->GetArrayLength(env, dstobjs)<nc)
-		_throwarg("Planes array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jDstOffsets)<nc)
-		_throwarg("Offsets array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jDstStrides)<nc)
-		_throwarg("Strides array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, dstobjs) < nc)
+    _throwarg("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
+    _throwarg("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstStrides) < nc)
+    _throwarg("Strides array is too small for the subsampling type");
 
-	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
-	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
-	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throwarg("Source buffer is not large enough");
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    _throwarg("Source buffer is not large enough");
 
-	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
-	for(i=0; i<nc; i++)
-	{
-		int planeSize=tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
-		int pw=tjPlaneWidth(i, width, subsamp);
+  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
 
-		if(planeSize<0 || pw<0)
-			_throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0)
+      _throwarg(tjGetErrorStr());
 
-		if(dstOffsets[i]<0)
-			_throwarg("Invalid argument in encodeYUV()");
-		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
-			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (dstOffsets[i] < 0)
+      _throwarg("Invalid argument in encodeYUV()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
+      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
-		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
-		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
-			_throwarg("Destination plane is not large enough");
+    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
+      _throwarg("Destination plane is not large enough");
 
-		bailif0(dstPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
-			0));
-		dstPlanes[i]=&dstPlanes[i][dstOffsets[i]];
-	}
-	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+    bailif0(dstPlanes[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
+    dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];
+  }
+  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-	if(tjEncodeYUVPlanes(handle, &srcBuf[y*actualPitch + x*tjPixelSize[pf]],
-		width, pitch, height, pf, dstPlanes, dstStrides, subsamp, flags)==-1)
-		_throwtj();
+  if (tjEncodeYUVPlanes(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
+                        width, pitch, height, pf, dstPlanes, dstStrides,
+                        subsamp, flags) == -1)
+    _throwtj();
 
-	bailout:
-	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-	for(i=0; i<nc; i++)
-	{
-		if(dstPlanes[i] && jDstPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-				0);
-	}
-	if(dstStrides)
-		(*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-	if(dstOffsets)
-		(*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
-	return;
+bailout:
+  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
+  for (i = 0; i < nc; i++) {
+    if (dstPlanes[i] && jDstPlanes[i])
+      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
+                                            0);
+  }
+  if (dstStrides)
+    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
+  if (dstOffsets)
+    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
 }
 
 /* TurboJPEG 1.4.x: TJCompressor::encodeYUV() byte source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III
-	(JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
-		jint pitch, jint height, jint pf, jobjectArray dstobjs,
-		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
-	TJCompressor_encodeYUV(env, obj, src, 1, x, y, width, pitch, height, pf,
-		dstobjs, jDstOffsets, jDstStrides, subsamp, flags);
+  TJCompressor_encodeYUV(env, obj, src, 1, x, y, width, pitch, height, pf,
+                         dstobjs, jDstOffsets, jDstStrides, subsamp, flags);
 }
 
 /* TurboJPEG 1.4.x: TJCompressor::encodeYUV() int source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III
-	(JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
-		jint stride, jint height, jint pf, jobjectArray dstobjs,
-		jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in encodeYUV()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in encodeYUV()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
 
-	TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
-		stride*sizeof(jint), height, pf, dstobjs, jDstOffsets, jDstStrides,
-		subsamp, flags);
+  TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
+                         stride * sizeof(jint), height, pf, dstobjs,
+                         jDstOffsets, jDstStrides, subsamp, flags);
 
-	bailout:
-	return;
+bailout:
+  return;
 }
 
 JNIEXPORT void JNICALL TJCompressor_encodeYUV_12
-	(JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
-		jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0;
-	unsigned char *srcBuf=NULL, *dstBuf=NULL;
+  tjhandle handle = 0;
+  jsize arraySize = 0;
+  unsigned char *srcBuf = NULL, *dstBuf = NULL;
 
-	gethandle();
+  gethandle();
 
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || width<1 || height<1
-		|| pitch<0)
-		_throwarg("Invalid argument in encodeYUV()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throwarg("Mismatch between Java and C API");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0)
+    _throwarg("Invalid argument in encodeYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    _throwarg("Mismatch between Java and C API");
 
-	arraySize=(pitch==0)? width*tjPixelSize[pf]*height:pitch*height;
-	if((*env)->GetArrayLength(env, src)*srcElementSize<arraySize)
-		_throwarg("Source buffer is not large enough");
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(width, height, subsamp))
-		_throwarg("Destination buffer is not large enough");
+  arraySize = (pitch == 0) ? width * tjPixelSize[pf] * height : pitch * height;
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    _throwarg("Source buffer is not large enough");
+  if ((*env)->GetArrayLength(env, dst) <
+      (jsize)tjBufSizeYUV(width, height, subsamp))
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(srcBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  bailif0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
-		flags)==-1)
-		_throwtj();
+  if (tjEncodeYUV2(handle, srcBuf, width, pitch, height, pf, dstBuf, subsamp,
+                   flags) == -1)
+    _throwtj();
 
-	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
-	return;
+bailout:
+  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+  if (srcBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, srcBuf, 0);
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::encodeYUV() byte source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
-		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+   jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
-	TJCompressor_encodeYUV_12(env, obj, src, 1, width, pitch, height, pf, dst,
-		subsamp, flags);
+  TJCompressor_encodeYUV_12(env, obj, src, 1, width, pitch, height, pf, dst,
+                            subsamp, flags);
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::encodeYUV() int source */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
-	(JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
-		jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+  (JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+   jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in encodeYUV()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in encodeYUV()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    _throwarg("Pixel format must be 32-bit when encoding from an integer buffer.");
 
-	TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
-		stride*sizeof(jint), height, pf, dst, subsamp, flags);
+  TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
+                            stride * sizeof(jint), height, pf, dst, subsamp,
+                            flags);
 
-	bailout:
-	return;
+bailout:
+  return;
 }
 
 /* TurboJPEG 1.2.x: TJCompressor::destroy() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
-	(JNIEnv *env, jobject obj)
+  (JNIEnv *env, jobject obj)
 {
-	tjhandle handle=0;
+  tjhandle handle = 0;
 
-	gethandle();
+  gethandle();
 
-	if(tjDestroy(handle)==-1) _throwtj();
-	(*env)->SetLongField(env, obj, _fid, 0);
+  if (tjDestroy(handle) == -1) _throwtj();
+  (*env)->SetLongField(env, obj, _fid, 0);
 
-	bailout:
-	return;
+bailout:
+  return;
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
-	(JNIEnv *env, jobject obj)
+  (JNIEnv *env, jobject obj)
 {
-	jclass cls;
-	jfieldID fid;
-	tjhandle handle;
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
 
-	if((handle=tjInitDecompress())==NULL) _throwtj();
+  if ((handle = tjInitDecompress()) == NULL)
+    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-	bailif0(cls=(*env)->GetObjectClass(env, obj));
-	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (size_t)handle);
+  bailif0(cls = (*env)->GetObjectClass(env, obj));
+  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
 
-	bailout:
-	return;
+bailout:
+  return;
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::getScalingFactors() */
 JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors
-	(JNIEnv *env, jclass cls)
+  (JNIEnv *env, jclass cls)
 {
-	jclass sfcls=NULL;  jfieldID fid=0;
-	tjscalingfactor *sf=NULL;  int n=0, i;
-	jobject sfobj=NULL;
-	jobjectArray sfjava=NULL;
+  jclass sfcls = NULL;
+  jfieldID fid = 0;
+  tjscalingfactor *sf = NULL;
+  int n = 0, i;
+  jobject sfobj = NULL;
+  jobjectArray sfjava = NULL;
 
-	if((sf=tjGetScalingFactors(&n))==NULL || n==0)
-		_throwarg(tjGetErrorStr());
+  if ((sf = tjGetScalingFactors(&n)) == NULL || n == 0)
+    _throwarg(tjGetErrorStr());
 
-	bailif0(sfcls=(*env)->FindClass(env, "org/libjpegturbo/turbojpeg/TJScalingFactor"));
-	bailif0(sfjava=(jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
+  bailif0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  bailif0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
 
-	for(i=0; i<n; i++)
-	{
-		bailif0(sfobj=(*env)->AllocObject(env, sfcls));
-		bailif0(fid=(*env)->GetFieldID(env, sfcls, "num", "I"));
-		(*env)->SetIntField(env, sfobj, fid, sf[i].num);
-		bailif0(fid=(*env)->GetFieldID(env, sfcls, "denom", "I"));
-		(*env)->SetIntField(env, sfobj, fid, sf[i].denom);
-		(*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
-	}
+  for (i = 0; i < n; i++) {
+    bailif0(sfobj = (*env)->AllocObject(env, sfcls));
+    bailif0(fid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+    (*env)->SetIntField(env, sfobj, fid, sf[i].num);
+    bailif0(fid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+    (*env)->SetIntField(env, sfobj, fid, sf[i].denom);
+    (*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
+  }
 
-	bailout:
-	return sfjava;
+bailout:
+  return sfjava;
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompressHeader() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize)
 {
-	tjhandle handle=0;
-	unsigned char *jpegBuf=NULL;
-	int width=0, height=0, jpegSubsamp=-1, jpegColorspace=-1;
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL;
+  int width = 0, height = 0, jpegSubsamp = -1, jpegColorspace = -1;
 
-	gethandle();
+  gethandle();
 
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throwarg("Source buffer is not large enough");
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    _throwarg("Source buffer is not large enough");
 
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-	if(tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize,
-		&width, &height, &jpegSubsamp, &jpegColorspace)==-1)
-		_throwtj();
+  if (tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, &width,
+                          &height, &jpegSubsamp, &jpegColorspace) == -1)
+    _throwtj();
 
-	(*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);  jpegBuf=NULL;
+  (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  jpegBuf = NULL;
 
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
-	(*env)->SetIntField(env, obj, _fid, jpegSubsamp);
-	if((_fid=(*env)->GetFieldID(env, _cls, "jpegColorspace", "I"))==0)
-		(*env)->ExceptionClear(env);
-	else
-		(*env)->SetIntField(env, obj, _fid, jpegColorspace);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
-	(*env)->SetIntField(env, obj, _fid, width);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
-	(*env)->SetIntField(env, obj, _fid, height);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  (*env)->SetIntField(env, obj, _fid, jpegSubsamp);
+  if ((_fid = (*env)->GetFieldID(env, _cls, "jpegColorspace", "I")) == 0)
+    (*env)->ExceptionClear(env);
+  else
+    (*env)->SetIntField(env, obj, _fid, jpegColorspace);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  (*env)->SetIntField(env, obj, _fid, width);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  (*env)->SetIntField(env, obj, _fid, height);
 
-	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-	return;
+bailout:
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 }
 
 static void TJDecompressor_decompress
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jarray dst,
-		jint dstElementSize, jint x, jint y, jint width, jint pitch, jint height,
-		jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jarray dst,
+   jint dstElementSize, jint x, jint y, jint width, jint pitch, jint height,
+   jint pf, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0, actualPitch;
-	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *jpegBuf = NULL, *dstBuf = NULL;
 
-	gethandle();
+  gethandle();
 
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in decompress()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF)
-		_throwarg("Mismatch between Java and C API");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in decompress()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    _throwarg("Mismatch between Java and C API");
 
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throwarg("Source buffer is not large enough");
-	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
-	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
-	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)
-		_throwarg("Destination buffer is not large enough");
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    _throwarg("Source buffer is not large enough");
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
-		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
-		flags)==-1)
-		_throwtj();
+  if (tjDecompress2(handle, jpegBuf, (unsigned long)jpegSize,
+                    &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
+                    pitch, height, pf, flags) == -1)
+    _throwtj();
 
-	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-	return;
+bailout:
+  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 }
 
 /* TurboJPEG 1.3.x: TJDecompressor::decompress() byte destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
 {
-	TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, x, y, width,
-		pitch, height, pf, flags);
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, x, y, width,
+                            pitch, height, pf, flags);  /* 1 = dst elem size */
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompress() byte destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint width, jint pitch, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint width, jint pitch, jint height, jint pf, jint flags)
 {
-	TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 0, 0, width,
-		pitch, height, pf, flags);
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 0, 0, width,
+                            pitch, height, pf, flags);  /* x = y = 0 */
 }
 
 /* TurboJPEG 1.3.x: TJDecompressor::decompress() int destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
-		jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+   jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in decompress()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in decompress()");  /* pf out of range */
+  if (tjPixelSize[pf] != sizeof(jint))  /* one pixel per jint element */
+    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
-	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
-		width, stride*sizeof(jint), height, pf, flags);
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
+                            width, stride * sizeof(jint), height, pf, flags);
 
-	bailout:
-	return;
+bailout:  /* presumably where _throwarg() jumps -- confirm in macro */
+  return;  /* nothing was acquired in this wrapper, so nothing to release */
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompress() int destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
-		jint width, jint stride, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+   jint width, jint stride, jint height, jint pf, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in decompress()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in decompress()");  /* pf out of range */
+  if (tjPixelSize[pf] != sizeof(jint))  /* one pixel per jint element */
+    _throwarg("Pixel format must be 32-bit when decompressing to an integer buffer.");
 
-	TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
-		width, stride*sizeof(jint), height, pf, flags);
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
+                            width, stride * sizeof(jint), height, pf, flags);
 
-	bailout:
-	return;
-
+bailout:  /* presumably where _throwarg() jumps -- confirm in macro */
+  return;  /* nothing was acquired in this wrapper, so nothing to release */
 }
 
 /* TurboJPEG 1.4.x: TJDecompressor::decompressToYUV() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize,
-		jobjectArray dstobjs, jintArray jDstOffsets, jint desiredWidth,
-		jintArray jDstStrides, jint desiredHeight, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize,
+   jobjectArray dstobjs, jintArray jDstOffsets, jint desiredWidth,
+   jintArray jDstStrides, jint desiredHeight, jint flags)
 {
-	tjhandle handle=0;
-	jbyteArray jDstPlanes[3]={NULL, NULL, NULL};
-	unsigned char *jpegBuf=NULL, *dstPlanes[3];
-	int *dstOffsets=NULL, *dstStrides=NULL;
-	int jpegSubsamp=-1, jpegWidth=0, jpegHeight=0;
-	int nc=0, i, width, height, scaledWidth, scaledHeight, nsf=0;
-	tjscalingfactor *sf;
+  tjhandle handle = 0;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };  /* Java plane arrays */
+  unsigned char *jpegBuf = NULL, *dstPlanes[3] = { NULL, NULL, NULL };
+  int *dstOffsets = NULL, *dstStrides = NULL;
+  int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
+  int nc = 0, i, width, height, scaledWidth, scaledHeight, nsf = 0;
+  tjscalingfactor *sf;
 
+  gethandle();  /* presumably sets handle, _cls and _fid -- confirm */
 
-	gethandle();
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    _throwarg("Source buffer is not large enough");
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
 
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throwarg("Source buffer is not large enough");
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
-	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
-	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
-	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
+  nc = (jpegSubsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3);
 
-	nc=(jpegSubsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3);
+  width = desiredWidth;
+  height = desiredHeight;
+  if (width == 0) width = jpegWidth;  /* 0 means "use the JPEG's size" */
+  if (height == 0) height = jpegHeight;
+  sf = tjGetScalingFactors(&nsf);
+  if (!sf || nsf < 1)
+    _throwarg(tjGetErrorStr());
+  for (i = 0; i < nsf; i++) {  /* first factor that fits wins */
+    scaledWidth = TJSCALED(jpegWidth, sf[i]);
+    scaledHeight = TJSCALED(jpegHeight, sf[i]);
+    if (scaledWidth <= width && scaledHeight <= height)
+      break;
+  }
+  if (i >= nsf)
+    _throwarg("Could not scale down to desired image dimensions");
 
-	width=desiredWidth;  height=desiredHeight;
-	if(width==0) width=jpegWidth;
-	if(height==0) height=jpegHeight;
-	sf=tjGetScalingFactors(&nsf);
-	if(!sf || nsf<1)
-		_throwarg(tjGetErrorStr());
-	for(i=0; i<nsf; i++)
-	{
-		scaledWidth=TJSCALED(jpegWidth, sf[i]);
-		scaledHeight=TJSCALED(jpegHeight, sf[i]);
-		if(scaledWidth<=width && scaledHeight<=height)
-			break;
-	}
-	if(i>=nsf)
-		_throwarg("Could not scale down to desired image dimensions");
+  bailif0(dstOffsets = (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  bailif0(dstStrides = (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  for (i = 0; i < nc; i++) {  /* NOTE(review): runs inside a critical region */
+    int planeSize = tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
+                                   jpegSubsamp);
+    int pw = tjPlaneWidth(i, scaledWidth, jpegSubsamp);
 
-	bailif0(dstOffsets=(*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
-	bailif0(dstStrides=(*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
-	for(i=0; i<nc; i++)
-	{
-		int planeSize=tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
-			jpegSubsamp);
-		int pw=tjPlaneWidth(i, scaledWidth, jpegSubsamp);
+    if (planeSize < 0 || pw < 0)
+      _throwarg(tjGetErrorStr());
 
-		if(planeSize<0 || pw<0)
-			_throwarg(tjGetErrorStr());
+    if (dstOffsets[i] < 0)
+      _throwarg("Invalid argument in decompressToYUV()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
+      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
-		if(dstOffsets[i]<0)
-			_throwarg("Invalid argument in decompressToYUV()");
-		if(dstStrides[i]<0 && dstOffsets[i]-planeSize+pw<0)
-			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    bailif0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) < dstOffsets[i] + planeSize)
+      _throwarg("Destination plane is not large enough");
 
-		bailif0(jDstPlanes[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
-		if((*env)->GetArrayLength(env, jDstPlanes[i])<dstOffsets[i]+planeSize)
-			_throwarg("Destination plane is not large enough");
+    bailif0(dstPlanes[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
+    dstPlanes[i] = &dstPlanes[i][dstOffsets[i]];  /* skip to plane start */
+  }
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
 
-		bailif0(dstPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i],
-			0));
-		dstPlanes[i]=&dstPlanes[i][dstOffsets[i]];
-	}
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
+  if (tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
+                              dstPlanes, desiredWidth, dstStrides,
+                              desiredHeight, flags) == -1)
+    _throwtj();
 
-	if(tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
-		dstPlanes, desiredWidth, dstStrides, desiredHeight, flags)==-1)
-		_throwtj();
-
-	bailout:
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-	for(i=0; i<nc; i++)
-	{
-		if(dstPlanes[i] && jDstPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i], dstPlanes[i],
-				0);
-	}
-	if(dstStrides)
-		(*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
-	if(dstOffsets)
-		(*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
-	return;
+bailout:
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
+  for (i = 0; i < nc; i++) {  /* unacquired planes are still NULL */
+    if (dstPlanes[i] && jDstPlanes[i])  /* release the un-offset base */
+      (*env)->ReleasePrimitiveArrayCritical(env, jDstPlanes[i],
+                                            dstPlanes[i] - dstOffsets[i], 0);
+  }
+  if (dstStrides)
+    (*env)->ReleasePrimitiveArrayCritical(env, jDstStrides, dstStrides, 0);
+  if (dstOffsets)  /* must outlive the plane loop above, which reads it */
+    (*env)->ReleasePrimitiveArrayCritical(env, jDstOffsets, dstOffsets, 0);
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::decompressToYUV() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
-	(JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
-		jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint flags)
 {
-	tjhandle handle=0;
-	unsigned char *jpegBuf=NULL, *dstBuf=NULL;
-	int jpegSubsamp=-1, jpegWidth=0, jpegHeight=0;
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL, *dstBuf = NULL;  /* NULL until acquired */
+  int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
 
-	gethandle();
+  gethandle();  /* presumably sets handle, _cls and _fid -- confirm */
 
-	if((*env)->GetArrayLength(env, src)<jpegSize)
-		_throwarg("Source buffer is not large enough");
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
-	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
-	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
-	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
-	if((*env)->GetArrayLength(env, dst)
-		<(jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
-		_throwarg("Destination buffer is not large enough");
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    _throwarg("Source buffer is not large enough");
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
+  if ((*env)->GetArrayLength(env, dst) <  /* dst must hold the whole image */
+      (jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, src, 0));
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
-		flags)==-1)
-		_throwtj();
+  if (tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
+                        flags) == -1)
+    _throwtj();
 
-	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
-	return;
+bailout:  /* release in reverse order of acquisition */
+  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, src, jpegBuf, 0);
 }
 
 static void TJDecompressor_decodeYUV
-	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
-		jintArray jSrcStrides, jint subsamp, jarray dst, jint dstElementSize,
-		jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jarray dst, jint dstElementSize,
+   jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
 {
-	tjhandle handle=0;
-	jsize arraySize=0, actualPitch;
-	jbyteArray jSrcPlanes[3]={NULL, NULL, NULL};
-	const unsigned char *srcPlanes[3];
-	unsigned char *dstBuf=NULL;
-	int *srcOffsets=NULL, *srcStrides=NULL;
-	int nc=(subsamp==org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY? 1:3), i;
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;  /* both are in bytes */
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };  /* Java plane arrays */
+  const unsigned char *srcPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *dstBuf = NULL;
+  int *srcOffsets = NULL, *srcStrides = NULL;
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
 
-	gethandle();
+  gethandle();  /* presumably sets handle -- confirm in macro */
 
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp<0
-		|| subsamp>=org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
-		_throwarg("Invalid argument in decodeYUV()");
-	if(org_libjpegturbo_turbojpeg_TJ_NUMPF!=TJ_NUMPF
-		|| org_libjpegturbo_turbojpeg_TJ_NUMSAMP!=TJ_NUMSAMP)
-		_throwarg("Mismatch between Java and C API");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp < 0 ||
+      subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    _throwarg("Invalid argument in decodeYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    _throwarg("Mismatch between Java and C API");
 
-	if((*env)->GetArrayLength(env, srcobjs)<nc)
-		_throwarg("Planes array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jSrcOffsets)<nc)
-		_throwarg("Offsets array is too small for the subsampling type");
-	if((*env)->GetArrayLength(env, jSrcStrides)<nc)
-		_throwarg("Strides array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    _throwarg("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    _throwarg("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    _throwarg("Strides array is too small for the subsampling type");
 
-	actualPitch=(pitch==0)? width*tjPixelSize[pf]:pitch;
-	arraySize=(y+height-1)*actualPitch + (x+width)*tjPixelSize[pf];
-	if((*env)->GetArrayLength(env, dst)*dstElementSize<arraySize)
-		_throwarg("Destination buffer is not large enough");
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
+    _throwarg("Destination buffer is not large enough");
 
-	bailif0(srcOffsets=(*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
-	bailif0(srcStrides=(*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
-	for(i=0; i<nc; i++)
-	{
-		int planeSize=tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
-		int pw=tjPlaneWidth(i, width, subsamp);
+  bailif0(srcOffsets = (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  bailif0(srcStrides = (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  for (i = 0; i < nc; i++) {  /* NOTE(review): runs inside a critical region */
+    int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
 
-		if(planeSize<0 || pw<0)
-			_throwarg(tjGetErrorStr());
+    if (planeSize < 0 || pw < 0)
+      _throwarg(tjGetErrorStr());
 
-		if(srcOffsets[i]<0)
-			_throwarg("Invalid argument in decodeYUV()");
-		if(srcStrides[i]<0 && srcOffsets[i]-planeSize+pw<0)
-			_throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
+    if (srcOffsets[i] < 0)
+      _throwarg("Invalid argument in decodeYUV()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
+      _throwarg("Negative plane stride would cause memory to be accessed below plane boundary");
 
-		bailif0(jSrcPlanes[i]=(*env)->GetObjectArrayElement(env, srcobjs, i));
-		if((*env)->GetArrayLength(env, jSrcPlanes[i])<srcOffsets[i]+planeSize)
-			_throwarg("Source plane is not large enough");
+    bailif0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) < srcOffsets[i] + planeSize)
+      _throwarg("Source plane is not large enough");
 
-		bailif0(srcPlanes[i]=(*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i],
-			0));
-		srcPlanes[i]=&srcPlanes[i][srcOffsets[i]];
-	}
-	bailif0(dstBuf=(*env)->GetPrimitiveArrayCritical(env, dst, 0));
+    bailif0(srcPlanes[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
+    srcPlanes[i] = &srcPlanes[i][srcOffsets[i]];  /* skip to plane start */
+  }
+  bailif0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
 
-	if(tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
-		&dstBuf[y*actualPitch + x*tjPixelSize[pf]], width, pitch, height, pf,
-		flags)==-1)
-		_throwtj();
+  if (tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
+                        &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
+                        pitch, height, pf, flags) == -1)
+    _throwtj();
 
-	bailout:
-	if(dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
-	for(i=0; i<nc; i++)
-	{
-		if(srcPlanes[i] && jSrcPlanes[i])
-			(*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
-				(unsigned char *)srcPlanes[i], 0);
-	}
-	if(srcStrides)
-		(*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
-	if(srcOffsets)
-		(*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
-	return;
+bailout:
+  if (dstBuf) (*env)->ReleasePrimitiveArrayCritical(env, dst, dstBuf, 0);
+  for (i = 0; i < nc; i++) {  /* unacquired planes are still NULL */
+    if (srcPlanes[i] && jSrcPlanes[i])  /* release the un-offset base */
+      (*env)->ReleasePrimitiveArrayCritical(env, jSrcPlanes[i],
+        (unsigned char *)(srcPlanes[i] - srcOffsets[i]), 0);
+  }
+  if (srcStrides)
+    (*env)->ReleasePrimitiveArrayCritical(env, jSrcStrides, srcStrides, 0);
+  if (srcOffsets)  /* must outlive the plane loop above, which reads it */
+    (*env)->ReleasePrimitiveArrayCritical(env, jSrcOffsets, srcOffsets, 0);
 }
 
 /* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() byte destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII
-	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
-		jintArray jSrcStrides, jint subsamp, jbyteArray dst, jint x, jint y,
-		jint width, jint pitch, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jbyteArray dst, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jint flags)
 {
-	TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
-		subsamp, dst, 1, x, y, width, pitch, height, pf, flags);
+  TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+                           subsamp, dst, 1, x, y, width, pitch, height, pf,
+                           flags);  /* 1 = dstElementSize (byte array) */
 }
 
 /* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() int destination */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII
-	(JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
-		jintArray jSrcStrides, jint subsamp, jintArray dst, jint x, jint y,
-		jint width, jint stride, jint height, jint pf, jint flags)
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jintArray dst, jint x, jint y,
+   jint width, jint stride, jint height, jint pf, jint flags)
 {
-	if(pf<0 || pf>=org_libjpegturbo_turbojpeg_TJ_NUMPF)
-		_throwarg("Invalid argument in decodeYUV()");
-	if(tjPixelSize[pf]!=sizeof(jint))
-		_throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    _throwarg("Invalid argument in decodeYUV()");  /* pf out of range */
+  if (tjPixelSize[pf] != sizeof(jint))  /* one pixel per jint element */
+    _throwarg("Pixel format must be 32-bit when decoding to an integer buffer.");
 
-	TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
-		subsamp, dst, sizeof(jint), x, y, width, stride*sizeof(jint), height, pf,
-		flags);
+  TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+                           subsamp, dst, sizeof(jint), x, y, width,
+                           stride * sizeof(jint), height, pf, flags);
 
-	bailout:
-	return;
+bailout:  /* presumably where _throwarg() jumps -- confirm in macro */
+  return;  /* nothing was acquired in this wrapper, so nothing to release */
 }
 
 /* TurboJPEG 1.2.x: TJTransformer::init() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
-	(JNIEnv *env, jobject obj)
+  (JNIEnv *env, jobject obj)
 {
-	jclass cls;
-	jfieldID fid;
-	tjhandle handle;
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;  /* native transformer instance from tjInitTransform() */
 
-	if((handle=tjInitTransform())==NULL) _throwtj();
+  if ((handle = tjInitTransform()) == NULL)
+    _throw(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
 
-	bailif0(cls=(*env)->GetObjectClass(env, obj));
-	bailif0(fid=(*env)->GetFieldID(env, cls, "handle", "J"));
-	(*env)->SetLongField(env, obj, fid, (size_t)handle);
+  bailif0(cls = (*env)->GetObjectClass(env, obj));
+  bailif0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);  /* obj.handle = ptr */
 
-	bailout:
-	return;
+bailout:
+  return;  /* NOTE(review): handle is not destroyed if a bailif0() fired */
 }
 
-typedef struct _JNICustomFilterParams
-{
-	JNIEnv *env;
-	jobject tobj;
-	jobject cfobj;
+typedef struct _JNICustomFilterParams {  /* read back in JNICustomFilter() */
+  JNIEnv *env;    /* JNI environment used to call back into Java */
+  jobject tobj;   /* passed as the last argument to customFilter() */
+  jobject cfobj;  /* object whose customFilter() method is invoked */
 } JNICustomFilterParams;
 
 static int JNICustomFilter(short *coeffs, tjregion arrayRegion,
-	tjregion planeRegion, int componentIndex, int transformIndex,
-	tjtransform *transform)
+                           tjregion planeRegion, int componentIndex,
+                           int transformIndex, tjtransform *transform)
 {
-	JNICustomFilterParams *params=(JNICustomFilterParams *)transform->data;
-	JNIEnv *env=params->env;
-	jobject tobj=params->tobj, cfobj=params->cfobj;
-	jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
-	jclass cls;  jmethodID mid;  jfieldID fid;
+  JNICustomFilterParams *params = (JNICustomFilterParams *)transform->data;
+  JNIEnv *env = params->env;
+  jobject tobj = params->tobj, cfobj = params->cfobj;
+  jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
+  jclass cls;     /* reused for each class looked up below */
+  jmethodID mid;  /* reused for each method looked up below */
+  jfieldID fid;   /* reused for each Rectangle field set below */
 
-	bailif0(bufobj=(*env)->NewDirectByteBuffer(env, coeffs,
-		sizeof(short)*arrayRegion.w*arrayRegion.h));
-	bailif0(cls=(*env)->FindClass(env, "java/nio/ByteOrder"));
-	bailif0(mid=(*env)->GetStaticMethodID(env, cls, "nativeOrder",
-		"()Ljava/nio/ByteOrder;"));
-	bailif0(borobj=(*env)->CallStaticObjectMethod(env, cls, mid));
-	bailif0(cls=(*env)->GetObjectClass(env, bufobj));
-	bailif0(mid=(*env)->GetMethodID(env, cls, "order",
-		"(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
-	(*env)->CallObjectMethod(env, bufobj, mid, borobj);
-	bailif0(mid=(*env)->GetMethodID(env, cls, "asShortBuffer",
-		"()Ljava/nio/ShortBuffer;"));
-	bailif0(bufobj=(*env)->CallObjectMethod(env, bufobj, mid));
+  bailif0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
+    sizeof(short) * arrayRegion.w * arrayRegion.h));  /* size in bytes */
+  bailif0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
+  bailif0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
+                                          "()Ljava/nio/ByteOrder;"));
+  bailif0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
+  bailif0(cls = (*env)->GetObjectClass(env, bufobj));
+  bailif0(mid = (*env)->GetMethodID(env, cls, "order",
+    "(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
+  (*env)->CallObjectMethod(env, bufobj, mid, borobj);  /* result unused */
+  bailif0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
+                                    "()Ljava/nio/ShortBuffer;"));
+  bailif0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));
 
-	bailif0(cls=(*env)->FindClass(env, "java/awt/Rectangle"));
-	bailif0(arrayRegionObj=(*env)->AllocObject(env, cls));
-	bailif0(fid=(*env)->GetFieldID(env, cls, "x", "I"));
-	(*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "y", "I"));
-	(*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "width", "I"));
-	(*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "height", "I"));
-	(*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
+  bailif0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  bailif0(arrayRegionObj = (*env)->AllocObject(env, cls));  /* no ctor run */
+  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
 
-	bailif0(planeRegionObj=(*env)->AllocObject(env, cls));
-	bailif0(fid=(*env)->GetFieldID(env, cls, "x", "I"));
-	(*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "y", "I"));
-	(*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "width", "I"));
-	(*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
-	bailif0(fid=(*env)->GetFieldID(env, cls, "height", "I"));
-	(*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
+  bailif0(planeRegionObj = (*env)->AllocObject(env, cls));  /* no ctor run */
+  bailif0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
+  bailif0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
 
-	bailif0(cls=(*env)->GetObjectClass(env, cfobj));
-	bailif0(mid=(*env)->GetMethodID(env, cls, "customFilter",
-		"(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
-	(*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
-		planeRegionObj, componentIndex, transformIndex, tobj);
+  bailif0(cls = (*env)->GetObjectClass(env, cfobj));
+  bailif0(mid = (*env)->GetMethodID(env, cls, "customFilter",
+    "(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
+  (*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
+                         planeRegionObj, componentIndex, transformIndex, tobj);
 
-	return 0;
+  return 0;  /* NOTE(review): a pending Java exception is not checked */
 
-	bailout:
-	return -1;
+bailout:
+  return -1;  /* presumably reached when a bailif0() expression was 0 */
 }
 
 /* TurboJPEG 1.2.x: TJTransformer::transform() */
 JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transform
-	(JNIEnv *env, jobject obj, jbyteArray jsrcBuf, jint jpegSize,
-		jobjectArray dstobjs, jobjectArray tobjs, jint flags)
+  (JNIEnv *env, jobject obj, jbyteArray jsrcBuf, jint jpegSize,
+   jobjectArray dstobjs, jobjectArray tobjs, jint flags)
 {
-	tjhandle handle=0;  int i;
-	unsigned char *jpegBuf=NULL, **dstBufs=NULL;  jsize n=0;
-	unsigned long *dstSizes=NULL;  tjtransform *t=NULL;
-	jbyteArray *jdstBufs=NULL;
-	int jpegWidth=0, jpegHeight=0, jpegSubsamp;
-	jintArray jdstSizes=0;  jint *dstSizesi=NULL;
-	JNICustomFilterParams *params=NULL;
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL, **dstBufs = NULL;
+  jsize n = 0;
+  unsigned long *dstSizes = NULL;
+  tjtransform *t = NULL;
+  jbyteArray *jdstBufs = NULL;
+  int i, jpegWidth = 0, jpegHeight = 0, jpegSubsamp;
+  jintArray jdstSizes = 0;
+  jint *dstSizesi = NULL;
+  JNICustomFilterParams *params = NULL;
 
-	gethandle();
+  gethandle();
 
-	if((*env)->GetArrayLength(env, jsrcBuf)<jpegSize)
-		_throwarg("Source buffer is not large enough");
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
-	jpegWidth=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
-	jpegHeight=(int)(*env)->GetIntField(env, obj, _fid);
-	bailif0(_fid=(*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
-	jpegSubsamp=(int)(*env)->GetIntField(env, obj, _fid);
+  if ((*env)->GetArrayLength(env, jsrcBuf) < jpegSize)
+    _throwarg("Source buffer is not large enough");
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
+  bailif0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
 
-	n=(*env)->GetArrayLength(env, dstobjs);
-	if(n!=(*env)->GetArrayLength(env, tobjs))
-		_throwarg("Mismatch between size of transforms array and destination buffers array");
+  n = (*env)->GetArrayLength(env, dstobjs);
+  if (n != (*env)->GetArrayLength(env, tobjs))
+    _throwarg("Mismatch between size of transforms array and destination buffers array");
 
-	if((dstBufs=(unsigned char **)malloc(sizeof(unsigned char *)*n))==NULL)
-		_throwmem();
-	if((jdstBufs=(jbyteArray *)malloc(sizeof(jbyteArray)*n))==NULL)
-		_throwmem();
-	if((dstSizes=(unsigned long *)malloc(sizeof(unsigned long)*n))==NULL)
-		_throwmem();
-	if((t=(tjtransform *)malloc(sizeof(tjtransform)*n))==NULL)
-		_throwmem();
-	if((params=(JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams)*n))
-		==NULL)
-		_throwmem();
-	for(i=0; i<n; i++)
-	{
-		dstBufs[i]=NULL;  jdstBufs[i]=NULL;  dstSizes[i]=0;
-		memset(&t[i], 0, sizeof(tjtransform));
-		memset(&params[i], 0, sizeof(JNICustomFilterParams));
-	}
+  if ((dstBufs =
+       (unsigned char **)malloc(sizeof(unsigned char *) * n)) == NULL)
+    _throwmem();
+  if ((jdstBufs = (jbyteArray *)malloc(sizeof(jbyteArray) * n)) == NULL)
+    _throwmem();
+  if ((dstSizes = (unsigned long *)malloc(sizeof(unsigned long) * n)) == NULL)
+    _throwmem();
+  if ((t = (tjtransform *)malloc(sizeof(tjtransform) * n)) == NULL)
+    _throwmem();
+  if ((params = (JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams) *
+                                                n)) == NULL)
+    _throwmem();
+  for (i = 0; i < n; i++) {
+    dstBufs[i] = NULL;  jdstBufs[i] = NULL;  dstSizes[i] = 0;
+    memset(&t[i], 0, sizeof(tjtransform));
+    memset(&params[i], 0, sizeof(JNICustomFilterParams));
+  }
 
-	for(i=0; i<n; i++)
-	{
-		jobject tobj, cfobj;
+  for (i = 0; i < n; i++) {
+    jobject tobj, cfobj;
 
-		bailif0(tobj=(*env)->GetObjectArrayElement(env, tobjs, i));
-		bailif0(_cls=(*env)->GetObjectClass(env, tobj));
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "op", "I"));
-		t[i].op=(*env)->GetIntField(env, tobj, _fid);
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "options", "I"));
-		t[i].options=(*env)->GetIntField(env, tobj, _fid);
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "x", "I"));
-		t[i].r.x=(*env)->GetIntField(env, tobj, _fid);
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "y", "I"));
-		t[i].r.y=(*env)->GetIntField(env, tobj, _fid);
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "width", "I"));
-		t[i].r.w=(*env)->GetIntField(env, tobj, _fid);
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "height", "I"));
-		t[i].r.h=(*env)->GetIntField(env, tobj, _fid);
+    bailif0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
+    bailif0(_cls = (*env)->GetObjectClass(env, tobj));
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
+    t[i].op = (*env)->GetIntField(env, tobj, _fid);
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
+    t[i].options = (*env)->GetIntField(env, tobj, _fid);
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
+    t[i].r.x = (*env)->GetIntField(env, tobj, _fid);
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
+    t[i].r.y = (*env)->GetIntField(env, tobj, _fid);
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
+    t[i].r.w = (*env)->GetIntField(env, tobj, _fid);
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
+    t[i].r.h = (*env)->GetIntField(env, tobj, _fid);
 
-		bailif0(_fid=(*env)->GetFieldID(env, _cls, "cf",
-			"Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
-		cfobj=(*env)->GetObjectField(env, tobj, _fid);
-		if(cfobj)
-		{
-			params[i].env=env;
-			params[i].tobj=tobj;
-			params[i].cfobj=cfobj;
-			t[i].customFilter=JNICustomFilter;
-			t[i].data=(void *)&params[i];
-		}
-	}
+    bailif0(_fid = (*env)->GetFieldID(env, _cls, "cf",
+      "Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
+    cfobj = (*env)->GetObjectField(env, tobj, _fid);
+    if (cfobj) {
+      params[i].env = env;
+      params[i].tobj = tobj;
+      params[i].cfobj = cfobj;
+      t[i].customFilter = JNICustomFilter;
+      t[i].data = (void *)&params[i];
+    }
+  }
 
-	for(i=0; i<n; i++)
-	{
-		int w=jpegWidth, h=jpegHeight;
-		if(t[i].r.w!=0) w=t[i].r.w;
-		if(t[i].r.h!=0) h=t[i].r.h;
-		bailif0(jdstBufs[i]=(*env)->GetObjectArrayElement(env, dstobjs, i));
-		if((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i])
-			<tjBufSize(w, h, jpegSubsamp))
-			_throwarg("Destination buffer is not large enough");
-	}
-	bailif0(jpegBuf=(*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
-	for(i=0; i<n; i++)
-		bailif0(dstBufs[i]=(*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
+  for (i = 0; i < n; i++) {
+    int w = jpegWidth, h = jpegHeight;
 
-	if(tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
-		flags|TJFLAG_NOREALLOC)==-1)
-		_throwtj();
+    if (t[i].r.w != 0) w = t[i].r.w;
+    if (t[i].r.h != 0) h = t[i].r.h;
+    bailif0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i]) <
+        tjBufSize(w, h, jpegSubsamp))
+      _throwarg("Destination buffer is not large enough");
+  }
+  bailif0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  for (i = 0; i < n; i++)
+    bailif0(dstBufs[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
 
-	for(i=0; i<n; i++)
-	{
-		(*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
-		dstBufs[i]=NULL;
-	}
-	(*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-	jpegBuf=NULL;
+  if (tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
+                  flags | TJFLAG_NOREALLOC) == -1)
+    _throwtj();
 
-	jdstSizes=(*env)->NewIntArray(env, n);
-	bailif0(dstSizesi=(*env)->GetIntArrayElements(env, jdstSizes, 0));
-	for(i=0; i<n; i++) dstSizesi[i]=(int)dstSizes[i];
+  for (i = 0; i < n; i++) {
+    (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
+    dstBufs[i] = NULL;
+  }
+  (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
+  jpegBuf = NULL;
 
-	bailout:
-	if(dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
-	if(dstBufs)
-	{
-		for(i=0; i<n; i++)
-		{
-			if(dstBufs[i] && jdstBufs && jdstBufs[i])
-				(*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
-		}
-		free(dstBufs);
-	}
-	if(jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
-	if(jdstBufs) free(jdstBufs);
-	if(dstSizes) free(dstSizes);
-	if(t) free(t);
-	return jdstSizes;
+  jdstSizes = (*env)->NewIntArray(env, n);
+  bailif0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
+  for (i = 0; i < n; i++) dstSizesi[i] = (int)dstSizes[i];
+
+bailout:
+  if (dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
+  if (dstBufs) {
+    for (i = 0; i < n; i++) {
+      if (dstBufs[i] && jdstBufs && jdstBufs[i])
+        (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
+    }
+    free(dstBufs);
+  }
+  if (jpegBuf) (*env)->ReleasePrimitiveArrayCritical(env, jsrcBuf, jpegBuf, 0);
+  if (jdstBufs) free(jdstBufs);
+  if (dstSizes) free(dstSizes);
+  if (t) free(t);
+  return jdstSizes;
 }
 
 /* TurboJPEG 1.2.x: TJDecompressor::destroy() */
 JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
-	(JNIEnv *env, jobject obj)
+  (JNIEnv *env, jobject obj)
 {
-	Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy(env, obj);
+  Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy(env, obj);
 }
diff --git a/turbojpeg-mapfile b/turbojpeg-mapfile
index 35d55ae..b1381ed 100755
--- a/turbojpeg-mapfile
+++ b/turbojpeg-mapfile
@@ -1,56 +1,65 @@
 TURBOJPEG_1.0
 {
-	global:
-		tjInitCompress;
-		tjCompress;
-		TJBUFSIZE;
-		tjInitDecompress;
-		tjDecompressHeader;
-		tjDecompress;
-		tjDestroy;
-		tjGetErrorStr;
-	local:
-		*;
+  global:
+    tjInitCompress;
+    tjCompress;
+    TJBUFSIZE;
+    tjInitDecompress;
+    tjDecompressHeader;
+    tjDecompress;
+    tjDestroy;
+    tjGetErrorStr;
+  local:
+    *;
 };
 
 TURBOJPEG_1.1
 {
-	global:
-		TJBUFSIZEYUV;
-		tjDecompressHeader2;
-		tjDecompressToYUV;
-		tjEncodeYUV;
+  global:
+    TJBUFSIZEYUV;
+    tjDecompressHeader2;
+    tjDecompressToYUV;
+    tjEncodeYUV;
 } TURBOJPEG_1.0;
 
 TURBOJPEG_1.2
 {
-	global:
-		tjAlloc;
-		tjBufSize;
-		tjBufSizeYUV;
-		tjCompress2;
-		tjDecompress2;
-		tjEncodeYUV2;
-		tjFree;
-		tjGetScalingFactors;
-		tjInitTransform;
-		tjTransform;
+  global:
+    tjAlloc;
+    tjBufSize;
+    tjBufSizeYUV;
+    tjCompress2;
+    tjDecompress2;
+    tjEncodeYUV2;
+    tjFree;
+    tjGetScalingFactors;
+    tjInitTransform;
+    tjTransform;
 } TURBOJPEG_1.1;
 
 TURBOJPEG_1.4
 {
-	global:
-		tjBufSizeYUV2;
-		tjCompressFromYUV;
-		tjCompressFromYUVPlanes;
-		tjDecodeYUV;
-		tjDecodeYUVPlanes;
-		tjDecompressHeader3;
-		tjDecompressToYUV2;
-		tjDecompressToYUVPlanes;
-		tjEncodeYUV3;
-		tjEncodeYUVPlanes;
-		tjPlaneHeight;
-		tjPlaneSizeYUV;
-		tjPlaneWidth;
+  global:
+    tjBufSizeYUV2;
+    tjCompressFromYUV;
+    tjCompressFromYUVPlanes;
+    tjDecodeYUV;
+    tjDecodeYUVPlanes;
+    tjDecompressHeader3;
+    tjDecompressToYUV2;
+    tjDecompressToYUVPlanes;
+    tjEncodeYUV3;
+    tjEncodeYUVPlanes;
+    tjPlaneHeight;
+    tjPlaneSizeYUV;
+    tjPlaneWidth;
 } TURBOJPEG_1.2;
+
+TURBOJPEG_1.6
+{
+  global:
+    tjGetErrorCode;
+    tjGetErrorStr2;
+    tjLoadImage;
+    tjSaveImage;
+} TURBOJPEG_1.4;
diff --git a/turbojpeg-mapfile.jni b/turbojpeg-mapfile.jni
index 9c1d25b..ce8913c 100755
--- a/turbojpeg-mapfile.jni
+++ b/turbojpeg-mapfile.jni
@@ -1,92 +1,101 @@
 TURBOJPEG_1.0
 {
-	global:
-		tjInitCompress;
-		tjCompress;
-		TJBUFSIZE;
-		tjInitDecompress;
-		tjDecompressHeader;
-		tjDecompress;
-		tjDestroy;
-		tjGetErrorStr;
-	local:
-		*;
+  global:
+    tjInitCompress;
+    tjCompress;
+    TJBUFSIZE;
+    tjInitDecompress;
+    tjDecompressHeader;
+    tjDecompress;
+    tjDestroy;
+    tjGetErrorStr;
+  local:
+    *;
 };
 
 TURBOJPEG_1.1
 {
-	global:
-		TJBUFSIZEYUV;
-		tjDecompressHeader2;
-		tjDecompressToYUV;
-		tjEncodeYUV;
+  global:
+    TJBUFSIZEYUV;
+    tjDecompressHeader2;
+    tjDecompressToYUV;
+    tjEncodeYUV;
 } TURBOJPEG_1.0;
 
 TURBOJPEG_1.2
 {
-	global:
-		tjAlloc;
-		tjBufSize;
-		tjBufSizeYUV;
-		tjCompress2;
-		tjDecompress2;
-		tjEncodeYUV2;
-		tjFree;
-		tjGetScalingFactors;
-		tjInitTransform;
-		tjTransform;
-		Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
-		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III;
-		Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_init;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;
-		Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
-		Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
+  global:
+    tjAlloc;
+    tjBufSize;
+    tjBufSizeYUV;
+    tjCompress2;
+    tjDecompress2;
+    tjEncodeYUV2;
+    tjFree;
+    tjGetScalingFactors;
+    tjInitTransform;
+    tjTransform;
+    Java_org_libjpegturbo_turbojpeg_TJ_bufSize;
+    Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III;
+    Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_init;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_init;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy;
+    Java_org_libjpegturbo_turbojpeg_TJTransformer_init;
+    Java_org_libjpegturbo_turbojpeg_TJTransformer_transform;
 } TURBOJPEG_1.1;
 
 TURBOJPEG_1.3
 {
-	global:
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII;
+  global:
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII;
 } TURBOJPEG_1.2;
 
 TURBOJPEG_1.4
 {
-	global:
-		tjBufSizeYUV2;
-		tjCompressFromYUV;
-		tjCompressFromYUVPlanes;
-		tjDecodeYUV;
-		tjDecodeYUVPlanes;
-		tjDecompressHeader3;
-		tjDecompressToYUV2;
-		tjDecompressToYUVPlanes;
-		tjEncodeYUV3;
-		tjEncodeYUVPlanes;
-		tjPlaneHeight;
-		tjPlaneSizeYUV;
-		tjPlaneWidth;
-		Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
-		Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII;
-		Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III;
-		Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII;
-		Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III;
+  global:
+    tjBufSizeYUV2;
+    tjCompressFromYUV;
+    tjCompressFromYUVPlanes;
+    tjDecodeYUV;
+    tjDecodeYUVPlanes;
+    tjDecompressHeader3;
+    tjDecompressToYUV2;
+    tjDecompressToYUVPlanes;
+    tjEncodeYUV3;
+    tjEncodeYUVPlanes;
+    tjPlaneHeight;
+    tjPlaneSizeYUV;
+    tjPlaneWidth;
+    Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III;
+    Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII;
+    Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III;
 } TURBOJPEG_1.3;
+
+TURBOJPEG_1.6
+{
+  global:
+    tjGetErrorCode;
+    tjGetErrorStr2;
+    tjLoadImage;
+    tjSaveImage;
+} TURBOJPEG_1.4;
diff --git a/turbojpeg.c b/turbojpeg.c
index 662c68f..847cf6f 100644
--- a/turbojpeg.c
+++ b/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2017 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2018 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,525 +37,363 @@
 #include <jpeglib.h>
 #include <jerror.h>
 #include <setjmp.h>
+#include <errno.h>
 #include "./turbojpeg.h"
-#include "./tjutil.h"
 #include "transupp.h"
 #include "./jpegcomp.h"
+#include "./cdjpeg.h"
 
-extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **,
-	unsigned long *, boolean);
+extern void jpeg_mem_dest_tj(j_compress_ptr, unsigned char **, unsigned long *,
+                             boolean);
 extern void jpeg_mem_src_tj(j_decompress_ptr, const unsigned char *,
-	unsigned long);
+                            unsigned long);
 
-#define PAD(v, p) ((v+(p)-1)&(~((p)-1)))
-#define isPow2(x) (((x)&(x-1))==0)
+#define PAD(v, p) ((v + (p) - 1) & (~((p) - 1)))
+#define isPow2(x) (((x) & (x - 1)) == 0)
 
 
-/* Error handling (based on example in example.c) */
+/* Error handling (based on example in example.txt) */
 
-static char errStr[JMSG_LENGTH_MAX]="No error";
+static char errStr[JMSG_LENGTH_MAX] = "No error";
 
-struct my_error_mgr
-{
-	struct jpeg_error_mgr pub;
-	jmp_buf setjmp_buffer;
-	void (*emit_message)(j_common_ptr, int);
-	boolean warning;
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+  void (*emit_message) (j_common_ptr, int);
+  boolean warning, stopOnWarning;
 };
 typedef struct my_error_mgr *my_error_ptr;
 
 static void my_error_exit(j_common_ptr cinfo)
 {
-	my_error_ptr myerr=(my_error_ptr)cinfo->err;
-	(*cinfo->err->output_message)(cinfo);
-	longjmp(myerr->setjmp_buffer, 1);
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  (*cinfo->err->output_message) (cinfo);
+  longjmp(myerr->setjmp_buffer, 1);
 }
 
 /* Based on output_message() in jerror.c */
 
 static void my_output_message(j_common_ptr cinfo)
 {
-	(*cinfo->err->format_message)(cinfo, errStr);
+  (*cinfo->err->format_message) (cinfo, errStr);
 }
 
 static void my_emit_message(j_common_ptr cinfo, int msg_level)
 {
-	my_error_ptr myerr=(my_error_ptr)cinfo->err;
-	myerr->emit_message(cinfo, msg_level);
-	if(msg_level<0) myerr->warning=TRUE;
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  myerr->emit_message(cinfo, msg_level);
+  if (msg_level < 0) {
+    myerr->warning = TRUE;
+    if (myerr->stopOnWarning) longjmp(myerr->setjmp_buffer, 1);
+  }
 }
 
 
 /* Global structures, macros, etc. */
 
-enum {COMPRESS=1, DECOMPRESS=2};
+enum { COMPRESS = 1, DECOMPRESS = 2 };
 
-typedef struct _tjinstance
-{
-	struct jpeg_compress_struct cinfo;
-	struct jpeg_decompress_struct dinfo;
-	struct my_error_mgr jerr;
-	int init, headerRead;
+typedef struct _tjinstance {
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_decompress_struct dinfo;
+  struct my_error_mgr jerr;
+  int init, headerRead;
+  char errStr[JMSG_LENGTH_MAX];
+  boolean isInstanceError;
 } tjinstance;
 
-static const int pixelsize[TJ_NUMSAMP]={3, 3, 3, 1, 3, 3};
+static const int pixelsize[TJ_NUMSAMP] = { 3, 3, 3, 1, 3, 3 };
 
-static const JXFORM_CODE xformtypes[TJ_NUMXOP]=
-{
-	JXFORM_NONE, JXFORM_FLIP_H, JXFORM_FLIP_V, JXFORM_TRANSPOSE,
-	JXFORM_TRANSVERSE, JXFORM_ROT_90, JXFORM_ROT_180, JXFORM_ROT_270
+static const JXFORM_CODE xformtypes[TJ_NUMXOP] = {
+  JXFORM_NONE, JXFORM_FLIP_H, JXFORM_FLIP_V, JXFORM_TRANSPOSE,
+  JXFORM_TRANSVERSE, JXFORM_ROT_90, JXFORM_ROT_180, JXFORM_ROT_270
 };
 
 #define NUMSF 16
-static const tjscalingfactor sf[NUMSF]={
-	{2, 1},
-	{15, 8},
-	{7, 4},
-	{13, 8},
-	{3, 2},
-	{11, 8},
-	{5, 4},
-	{9, 8},
-	{1, 1},
-	{7, 8},
-	{3, 4},
-	{5, 8},
-	{1, 2},
-	{3, 8},
-	{1, 4},
-	{1, 8}
+static const tjscalingfactor sf[NUMSF] = {
+  { 2, 1 },
+  { 15, 8 },
+  { 7, 4 },
+  { 13, 8 },
+  { 3, 2 },
+  { 11, 8 },
+  { 5, 4 },
+  { 9, 8 },
+  { 1, 1 },
+  { 7, 8 },
+  { 3, 4 },
+  { 5, 8 },
+  { 1, 2 },
+  { 3, 8 },
+  { 1, 4 },
+  { 1, 8 }
 };
 
-#define _throw(m) {snprintf(errStr, JMSG_LENGTH_MAX, "%s", m);  \
-	retval=-1;  goto bailout;}
-#define getinstance(handle) tjinstance *this=(tjinstance *)handle;  \
-	j_compress_ptr cinfo=NULL;  j_decompress_ptr dinfo=NULL;  \
-	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
-		return -1;}  \
-	cinfo=&this->cinfo;  dinfo=&this->dinfo;  \
-	this->jerr.warning=FALSE;
-#define getcinstance(handle) tjinstance *this=(tjinstance *)handle;  \
-	j_compress_ptr cinfo=NULL;  \
-	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
-		return -1;}  \
-	cinfo=&this->cinfo;  \
-	this->jerr.warning=FALSE;
-#define getdinstance(handle) tjinstance *this=(tjinstance *)handle;  \
-	j_decompress_ptr dinfo=NULL;  \
-	if(!this) {snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle");  \
-		return -1;}  \
-	dinfo=&this->dinfo;  \
-	this->jerr.warning=FALSE;
+static J_COLOR_SPACE pf2cs[TJ_NUMPF] = {
+  JCS_EXT_RGB, JCS_EXT_BGR, JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+  JCS_EXT_XRGB, JCS_GRAYSCALE, JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ABGR,
+  JCS_EXT_ARGB, JCS_CMYK
+};
+
+static int cs2pf[JPEG_NUMCS] = {
+  TJPF_UNKNOWN, TJPF_GRAY,
+#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
+  TJPF_RGB,
+#elif RGB_RED == 2 && RGB_GREEN == 1 && RGB_BLUE == 0 && RGB_PIXELSIZE == 3
+  TJPF_BGR,
+#elif RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 4
+  TJPF_RGBX,
+#elif RGB_RED == 2 && RGB_GREEN == 1 && RGB_BLUE == 0 && RGB_PIXELSIZE == 4
+  TJPF_BGRX,
+#elif RGB_RED == 3 && RGB_GREEN == 2 && RGB_BLUE == 1 && RGB_PIXELSIZE == 4
+  TJPF_XBGR,
+#elif RGB_RED == 1 && RGB_GREEN == 2 && RGB_BLUE == 3 && RGB_PIXELSIZE == 4
+  TJPF_XRGB,
+#endif
+  TJPF_UNKNOWN, TJPF_CMYK, TJPF_UNKNOWN, TJPF_RGB, TJPF_RGBX, TJPF_BGR,
+  TJPF_BGRX, TJPF_XBGR, TJPF_XRGB, TJPF_RGBA, TJPF_BGRA, TJPF_ABGR, TJPF_ARGB,
+  TJPF_UNKNOWN
+};
+
+#define _throwg(m) { \
+  snprintf(errStr, JMSG_LENGTH_MAX, "%s", m); \
+  retval = -1;  goto bailout; \
+}
+#define _throwunix(m) { \
+  snprintf(errStr, JMSG_LENGTH_MAX, "%s\n%s", m, strerror(errno)); \
+  retval = -1;  goto bailout; \
+}
+#define _throw(m) { \
+  snprintf(this->errStr, JMSG_LENGTH_MAX, "%s", m); \
+  this->isInstanceError = TRUE;  _throwg(m); \
+}
+
+#define getinstance(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_compress_ptr cinfo = NULL; \
+  j_decompress_ptr dinfo = NULL; \
+  \
+  if (!this) { \
+    snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
+    return -1; \
+  } \
+  cinfo = &this->cinfo;  dinfo = &this->dinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+#define getcinstance(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_compress_ptr cinfo = NULL; \
+  \
+  if (!this) { \
+    snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
+    return -1; \
+  } \
+  cinfo = &this->cinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
+
+#define getdinstance(handle) \
+  tjinstance *this = (tjinstance *)handle; \
+  j_decompress_ptr dinfo = NULL; \
+  \
+  if (!this) { \
+    snprintf(errStr, JMSG_LENGTH_MAX, "Invalid handle"); \
+    return -1; \
+  } \
+  dinfo = &this->dinfo; \
+  this->jerr.warning = FALSE; \
+  this->isInstanceError = FALSE;
 
 static int getPixelFormat(int pixelSize, int flags)
 {
-	if(pixelSize==1) return TJPF_GRAY;
-	if(pixelSize==3)
-	{
-		if(flags&TJ_BGR) return TJPF_BGR;
-		else return TJPF_RGB;
-	}
-	if(pixelSize==4)
-	{
-		if(flags&TJ_ALPHAFIRST)
-		{
-			if(flags&TJ_BGR) return TJPF_XBGR;
-			else return TJPF_XRGB;
-		}
-		else
-		{
-			if(flags&TJ_BGR) return TJPF_BGRX;
-			else return TJPF_RGBX;
-		}
-	}
-	return -1;
+  if (pixelSize == 1) return TJPF_GRAY;
+  if (pixelSize == 3) {
+    if (flags & TJ_BGR) return TJPF_BGR;
+    else return TJPF_RGB;
+  }
+  if (pixelSize == 4) {
+    if (flags & TJ_ALPHAFIRST) {
+      if (flags & TJ_BGR) return TJPF_XBGR;
+      else return TJPF_XRGB;
+    } else {
+      if (flags & TJ_BGR) return TJPF_BGRX;
+      else return TJPF_RGBX;
+    }
+  }
+  return -1;
 }
 
-static int setCompDefaults(struct jpeg_compress_struct *cinfo,
-	int pixelFormat, int subsamp, int jpegQual, int flags)
+static int setCompDefaults(struct jpeg_compress_struct *cinfo, int pixelFormat,
+                           int subsamp, int jpegQual, int flags)
 {
-	int retval=0;
-	char *env=NULL;
+  int retval = 0;
+  char *env = NULL;
 
-	switch(pixelFormat)
-	{
-		case TJPF_GRAY:
-			cinfo->in_color_space=JCS_GRAYSCALE;  break;
-		#if JCS_EXTENSIONS==1
-		case TJPF_RGB:
-			cinfo->in_color_space=JCS_EXT_RGB;  break;
-		case TJPF_BGR:
-			cinfo->in_color_space=JCS_EXT_BGR;  break;
-		case TJPF_RGBX:
-		case TJPF_RGBA:
-			cinfo->in_color_space=JCS_EXT_RGBX;  break;
-		case TJPF_BGRX:
-		case TJPF_BGRA:
-			cinfo->in_color_space=JCS_EXT_BGRX;  break;
-		case TJPF_XRGB:
-		case TJPF_ARGB:
-			cinfo->in_color_space=JCS_EXT_XRGB;  break;
-		case TJPF_XBGR:
-		case TJPF_ABGR:
-			cinfo->in_color_space=JCS_EXT_XBGR;  break;
-		#else
-		case TJPF_RGB:
-		case TJPF_BGR:
-		case TJPF_RGBX:
-		case TJPF_BGRX:
-		case TJPF_XRGB:
-		case TJPF_XBGR:
-		case TJPF_RGBA:
-		case TJPF_BGRA:
-		case TJPF_ARGB:
-		case TJPF_ABGR:
-			cinfo->in_color_space=JCS_RGB;  pixelFormat=TJPF_RGB;
-			break;
-		#endif
-		case TJPF_CMYK:
-			cinfo->in_color_space=JCS_CMYK;  break;
-	}
-
-	cinfo->input_components=tjPixelSize[pixelFormat];
-	jpeg_set_defaults(cinfo);
+  cinfo->in_color_space = pf2cs[pixelFormat];
+  cinfo->input_components = tjPixelSize[pixelFormat];
+  jpeg_set_defaults(cinfo);
 
 #ifndef NO_GETENV
-	if((env=getenv("TJ_OPTIMIZE"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
-		cinfo->optimize_coding=TRUE;
-	if((env=getenv("TJ_ARITHMETIC"))!=NULL && strlen(env)>0 && !strcmp(env, "1"))
-		cinfo->arith_code=TRUE;
-	if((env=getenv("TJ_RESTART"))!=NULL && strlen(env)>0)
-	{
-		int temp=-1;  char tempc=0;
-		if(sscanf(env, "%d%c", &temp, &tempc)>=1 && temp>=0 && temp<=65535)
-		{
-			if(toupper(tempc)=='B')
-			{
-				cinfo->restart_interval=temp;
-				cinfo->restart_in_rows=0;
-			}
-			else
-				cinfo->restart_in_rows=temp;
-		}
-	}
+  if ((env = getenv("TJ_OPTIMIZE")) != NULL && strlen(env) > 0 &&
+      !strcmp(env, "1"))
+    cinfo->optimize_coding = TRUE;
+  if ((env = getenv("TJ_ARITHMETIC")) != NULL && strlen(env) > 0 &&
+      !strcmp(env, "1"))
+    cinfo->arith_code = TRUE;
+  if ((env = getenv("TJ_RESTART")) != NULL && strlen(env) > 0) {
+    int temp = -1;
+    char tempc = 0;
+
+    if (sscanf(env, "%d%c", &temp, &tempc) >= 1 && temp >= 0 &&
+        temp <= 65535) {
+      if (toupper(tempc) == 'B') {
+        cinfo->restart_interval = temp;
+        cinfo->restart_in_rows = 0;
+      } else
+        cinfo->restart_in_rows = temp;
+    }
+  }
 #endif
 
-	if(jpegQual>=0)
-	{
-		jpeg_set_quality(cinfo, jpegQual, TRUE);
-		if(jpegQual>=96 || flags&TJFLAG_ACCURATEDCT) cinfo->dct_method=JDCT_ISLOW;
-		else cinfo->dct_method=JDCT_FASTEST;
-	}
-	if(subsamp==TJSAMP_GRAY)
-		jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
-	else if(pixelFormat==TJPF_CMYK)
-		jpeg_set_colorspace(cinfo, JCS_YCCK);
-	else jpeg_set_colorspace(cinfo, JCS_YCbCr);
+  if (jpegQual >= 0) {
+    jpeg_set_quality(cinfo, jpegQual, TRUE);
+    if (jpegQual >= 96 || flags & TJFLAG_ACCURATEDCT)
+      cinfo->dct_method = JDCT_ISLOW;
+    else
+      cinfo->dct_method = JDCT_FASTEST;
+  }
+  if (subsamp == TJSAMP_GRAY)
+    jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
+  else if (pixelFormat == TJPF_CMYK)
+    jpeg_set_colorspace(cinfo, JCS_YCCK);
+  else
+    jpeg_set_colorspace(cinfo, JCS_YCbCr);
 
+  if (flags & TJFLAG_PROGRESSIVE)
+    jpeg_simple_progression(cinfo);
 #ifndef NO_GETENV
-	if((env=getenv("TJ_PROGRESSIVE"))!=NULL && strlen(env)>0
-		&& !strcmp(env, "1"))
-		jpeg_simple_progression(cinfo);
+  else if ((env = getenv("TJ_PROGRESSIVE")) != NULL && strlen(env) > 0 &&
+           !strcmp(env, "1"))
+    jpeg_simple_progression(cinfo);
 #endif
 
-	cinfo->comp_info[0].h_samp_factor=tjMCUWidth[subsamp]/8;
-	cinfo->comp_info[1].h_samp_factor=1;
-	cinfo->comp_info[2].h_samp_factor=1;
-	if(cinfo->num_components>3)
-		cinfo->comp_info[3].h_samp_factor=tjMCUWidth[subsamp]/8;
-	cinfo->comp_info[0].v_samp_factor=tjMCUHeight[subsamp]/8;
-	cinfo->comp_info[1].v_samp_factor=1;
-	cinfo->comp_info[2].v_samp_factor=1;
-	if(cinfo->num_components>3)
-		cinfo->comp_info[3].v_samp_factor=tjMCUHeight[subsamp]/8;
+  cinfo->comp_info[0].h_samp_factor = tjMCUWidth[subsamp] / 8;
+  cinfo->comp_info[1].h_samp_factor = 1;
+  cinfo->comp_info[2].h_samp_factor = 1;
+  if (cinfo->num_components > 3)
+    cinfo->comp_info[3].h_samp_factor = tjMCUWidth[subsamp] / 8;
+  cinfo->comp_info[0].v_samp_factor = tjMCUHeight[subsamp] / 8;
+  cinfo->comp_info[1].v_samp_factor = 1;
+  cinfo->comp_info[2].v_samp_factor = 1;
+  if (cinfo->num_components > 3)
+    cinfo->comp_info[3].v_samp_factor = tjMCUHeight[subsamp] / 8;
 
-	return retval;
-}
-
-static int setDecompDefaults(struct jpeg_decompress_struct *dinfo,
-	int pixelFormat, int flags)
-{
-	int retval=0;
-
-	switch(pixelFormat)
-	{
-		case TJPF_GRAY:
-			dinfo->out_color_space=JCS_GRAYSCALE;  break;
-		#if JCS_EXTENSIONS==1
-		case TJPF_RGB:
-			dinfo->out_color_space=JCS_EXT_RGB;  break;
-		case TJPF_BGR:
-			dinfo->out_color_space=JCS_EXT_BGR;  break;
-		case TJPF_RGBX:
-			dinfo->out_color_space=JCS_EXT_RGBX;  break;
-		case TJPF_BGRX:
-			dinfo->out_color_space=JCS_EXT_BGRX;  break;
-		case TJPF_XRGB:
-			dinfo->out_color_space=JCS_EXT_XRGB;  break;
-		case TJPF_XBGR:
-			dinfo->out_color_space=JCS_EXT_XBGR;  break;
-		#if JCS_ALPHA_EXTENSIONS==1
-		case TJPF_RGBA:
-			dinfo->out_color_space=JCS_EXT_RGBA;  break;
-		case TJPF_BGRA:
-			dinfo->out_color_space=JCS_EXT_BGRA;  break;
-		case TJPF_ARGB:
-			dinfo->out_color_space=JCS_EXT_ARGB;  break;
-		case TJPF_ABGR:
-			dinfo->out_color_space=JCS_EXT_ABGR;  break;
-		#endif
-		#else
-		case TJPF_RGB:
-		case TJPF_BGR:
-		case TJPF_RGBX:
-		case TJPF_BGRX:
-		case TJPF_XRGB:
-		case TJPF_XBGR:
-		case TJPF_RGBA:
-		case TJPF_BGRA:
-		case TJPF_ARGB:
-		case TJPF_ABGR:
-			dinfo->out_color_space=JCS_RGB;  break;
-		#endif
-		case TJPF_CMYK:
-			dinfo->out_color_space=JCS_CMYK;  break;
-		default:
-			_throw("Unsupported pixel format");
-	}
-
-	if(flags&TJFLAG_FASTDCT) dinfo->dct_method=JDCT_FASTEST;
-
-	bailout:
-	return retval;
+  return retval;
 }
 
 
 static int getSubsamp(j_decompress_ptr dinfo)
 {
-	int retval=-1, i, k;
+  int retval = -1, i, k;
 
-	/* The sampling factors actually have no meaning with grayscale JPEG files,
-	   and in fact it's possible to generate grayscale JPEGs with sampling
-	   factors > 1 (even though those sampling factors are ignored by the
-	   decompressor.)  Thus, we need to treat grayscale as a special case. */
-	if(dinfo->num_components==1 && dinfo->jpeg_color_space==JCS_GRAYSCALE)
-		return TJSAMP_GRAY;
+  /* The sampling factors actually have no meaning with grayscale JPEG files,
+     and in fact it's possible to generate grayscale JPEGs with sampling
+     factors > 1 (even though those sampling factors are ignored by the
+     decompressor.)  Thus, we need to treat grayscale as a special case. */
+  if (dinfo->num_components == 1 && dinfo->jpeg_color_space == JCS_GRAYSCALE)
+    return TJSAMP_GRAY;
 
-	for(i=0; i<NUMSUBOPT; i++)
-	{
-		if(dinfo->num_components==pixelsize[i]
-			|| ((dinfo->jpeg_color_space==JCS_YCCK
-				|| dinfo->jpeg_color_space==JCS_CMYK)
-					&& pixelsize[i]==3 && dinfo->num_components==4))
-		{
-			if(dinfo->comp_info[0].h_samp_factor==tjMCUWidth[i]/8
-				&& dinfo->comp_info[0].v_samp_factor==tjMCUHeight[i]/8)
-			{
-				int match=0;
-				for(k=1; k<dinfo->num_components; k++)
-				{
-					int href=1, vref=1;
-					if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
-					{
-						href=tjMCUWidth[i]/8;  vref=tjMCUHeight[i]/8;
-					}
-					if(dinfo->comp_info[k].h_samp_factor==href
-						&& dinfo->comp_info[k].v_samp_factor==vref)
-						match++;
-				}
-				if(match==dinfo->num_components-1)
-				{
-					retval=i;  break;
-				}
-			}
-			/* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
-			   in non-standard ways. */
-			if(dinfo->comp_info[0].h_samp_factor==2 &&
-				dinfo->comp_info[0].v_samp_factor==2 &&
-				(i==TJSAMP_422 || i==TJSAMP_440))
-			{
-				int match=0;
-				for(k=1; k<dinfo->num_components; k++)
-				{
-					int href=tjMCUHeight[i]/8, vref=tjMCUWidth[i]/8;
-					if(dinfo->jpeg_color_space==JCS_YCCK && k==3)
-					{
-						href=vref=2;
-					}
-					if(dinfo->comp_info[k].h_samp_factor==href
-						&& dinfo->comp_info[k].v_samp_factor==vref)
-						match++;
-				}
-				if(match==dinfo->num_components-1)
-				{
-					retval=i;  break;
-				}
-			}
-		}
-	}
-	return retval;
+  for (i = 0; i < NUMSUBOPT; i++) {
+    if (dinfo->num_components == pixelsize[i] ||
+        ((dinfo->jpeg_color_space == JCS_YCCK ||
+          dinfo->jpeg_color_space == JCS_CMYK) &&
+         pixelsize[i] == 3 && dinfo->num_components == 4)) {
+      if (dinfo->comp_info[0].h_samp_factor == tjMCUWidth[i] / 8 &&
+          dinfo->comp_info[0].v_samp_factor == tjMCUHeight[i] / 8) {
+        int match = 0;
+
+        for (k = 1; k < dinfo->num_components; k++) {
+          int href = 1, vref = 1;
+
+          if (dinfo->jpeg_color_space == JCS_YCCK && k == 3) {
+            href = tjMCUWidth[i] / 8;  vref = tjMCUHeight[i] / 8;
+          }
+          if (dinfo->comp_info[k].h_samp_factor == href &&
+              dinfo->comp_info[k].v_samp_factor == vref)
+            match++;
+        }
+        if (match == dinfo->num_components - 1) {
+          retval = i;  break;
+        }
+      }
+      /* Handle 4:2:2 and 4:4:0 images whose sampling factors are specified
+         in non-standard ways. */
+      if (dinfo->comp_info[0].h_samp_factor == 2 &&
+          dinfo->comp_info[0].v_samp_factor == 2 &&
+          (i == TJSAMP_422 || i == TJSAMP_440)) {
+        int match = 0;
+
+        for (k = 1; k < dinfo->num_components; k++) {
+          int href = tjMCUHeight[i] / 8, vref = tjMCUWidth[i] / 8;
+
+          if (dinfo->jpeg_color_space == JCS_YCCK && k == 3) {
+            href = vref = 2;
+          }
+          if (dinfo->comp_info[k].h_samp_factor == href &&
+              dinfo->comp_info[k].v_samp_factor == vref)
+            match++;
+        }
+        if (match == dinfo->num_components - 1) {
+          retval = i;  break;
+        }
+      }
+    }
+  }
+  return retval;
 }
 
 
-#ifndef JCS_EXTENSIONS
-
-/* Conversion functions to emulate the colorspace extensions.  This allows the
-   TurboJPEG wrapper to be used with libjpeg */
-
-#define TORGB(PS, ROFFSET, GOFFSET, BOFFSET) {  \
-	int rowPad=pitch-width*PS;  \
-	while(height--)  \
-	{  \
-		unsigned char *endOfRow=src+width*PS;  \
-		while(src<endOfRow)  \
-		{  \
-			dst[RGB_RED]=src[ROFFSET];  \
-			dst[RGB_GREEN]=src[GOFFSET];  \
-			dst[RGB_BLUE]=src[BOFFSET];  \
-			dst+=RGB_PIXELSIZE;  src+=PS;  \
-		}  \
-		src+=rowPad;  \
-	}  \
-}
-
-static unsigned char *toRGB(unsigned char *src, int width, int pitch,
-	int height, int pixelFormat, unsigned char *dst)
-{
-	unsigned char *retval=src;
-	switch(pixelFormat)
-	{
-		case TJPF_RGB:
-			#if RGB_RED!=0 || RGB_GREEN!=1 || RGB_BLUE!=2 || RGB_PIXELSIZE!=3
-			retval=dst;  TORGB(3, 0, 1, 2);
-			#endif
-			break;
-		case TJPF_BGR:
-			#if RGB_RED!=2 || RGB_GREEN!=1 || RGB_BLUE!=0 || RGB_PIXELSIZE!=3
-			retval=dst;  TORGB(3, 2, 1, 0);
-			#endif
-			break;
-		case TJPF_RGBX:
-		case TJPF_RGBA:
-			#if RGB_RED!=0 || RGB_GREEN!=1 || RGB_BLUE!=2 || RGB_PIXELSIZE!=4
-			retval=dst;  TORGB(4, 0, 1, 2);
-			#endif
-			break;
-		case TJPF_BGRX:
-		case TJPF_BGRA:
-			#if RGB_RED!=2 || RGB_GREEN!=1 || RGB_BLUE!=0 || RGB_PIXELSIZE!=4
-			retval=dst;  TORGB(4, 2, 1, 0);
-			#endif
-			break;
-		case TJPF_XRGB:
-		case TJPF_ARGB:
-			#if RGB_RED!=1 || RGB_GREEN!=2 || RGB_BLUE!=3 || RGB_PIXELSIZE!=4
-			retval=dst;  TORGB(4, 1, 2, 3);
-			#endif
-			break;
-		case TJPF_XBGR:
-		case TJPF_ABGR:
-			#if RGB_RED!=3 || RGB_GREEN!=2 || RGB_BLUE!=1 || RGB_PIXELSIZE!=4
-			retval=dst;  TORGB(4, 3, 2, 1);
-			#endif
-			break;
-	}
-	return retval;
-}
-
-#define FROMRGB(PS, ROFFSET, GOFFSET, BOFFSET, SETALPHA) {  \
-	int rowPad=pitch-width*PS;  \
-	while(height--)  \
-	{  \
-		unsigned char *endOfRow=dst+width*PS;  \
-		while(dst<endOfRow)  \
-		{  \
-			dst[ROFFSET]=src[RGB_RED];  \
-			dst[GOFFSET]=src[RGB_GREEN];  \
-			dst[BOFFSET]=src[RGB_BLUE];  \
-			SETALPHA  \
-			dst+=PS;  src+=RGB_PIXELSIZE;  \
-		}  \
-		dst+=rowPad;  \
-	}  \
-}
-
-static void fromRGB(unsigned char *src, unsigned char *dst, int width,
-	int pitch, int height, int pixelFormat)
-{
-	switch(pixelFormat)
-	{
-		case TJPF_RGB:
-			#if RGB_RED!=0 || RGB_GREEN!=1 || RGB_BLUE!=2 || RGB_PIXELSIZE!=3
-			FROMRGB(3, 0, 1, 2,);
-			#endif
-			break;
-		case TJPF_BGR:
-			#if RGB_RED!=2 || RGB_GREEN!=1 || RGB_BLUE!=0 || RGB_PIXELSIZE!=3
-			FROMRGB(3, 2, 1, 0,);
-			#endif
-			break;
-		case TJPF_RGBX:
-			#if RGB_RED!=0 || RGB_GREEN!=1 || RGB_BLUE!=2 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 0, 1, 2,);
-			#endif
-			break;
-		case TJPF_RGBA:
-			#if RGB_RED!=0 || RGB_GREEN!=1 || RGB_BLUE!=2 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 0, 1, 2, dst[3]=0xFF;);
-			#endif
-			break;
-		case TJPF_BGRX:
-			#if RGB_RED!=2 || RGB_GREEN!=1 || RGB_BLUE!=0 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 2, 1, 0,);
-			#endif
-			break;
-		case TJPF_BGRA:
-			#if RGB_RED!=2 || RGB_GREEN!=1 || RGB_BLUE!=0 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 2, 1, 0, dst[3]=0xFF;);  return;
-			#endif
-			break;
-		case TJPF_XRGB:
-			#if RGB_RED!=1 || RGB_GREEN!=2 || RGB_BLUE!=3 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 1, 2, 3,);  return;
-			#endif
-			break;
-		case TJPF_ARGB:
-			#if RGB_RED!=1 || RGB_GREEN!=2 || RGB_BLUE!=3 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 1, 2, 3, dst[0]=0xFF;);  return;
-			#endif
-			break;
-		case TJPF_XBGR:
-			#if RGB_RED!=3 || RGB_GREEN!=2 || RGB_BLUE!=1 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 3, 2, 1,);  return;
-			#endif
-			break;
-		case TJPF_ABGR:
-			#if RGB_RED!=3 || RGB_GREEN!=2 || RGB_BLUE!=1 || RGB_PIXELSIZE!=4
-			FROMRGB(4, 3, 2, 1, dst[0]=0xFF;);  return;
-			#endif
-			break;
-	}
-}
-
-#endif
-
-
 /* General API functions */
 
-DLLEXPORT char* DLLCALL tjGetErrorStr(void)
+DLLEXPORT char *tjGetErrorStr2(tjhandle handle)
 {
-	return errStr;
+  tjinstance *this = (tjinstance *)handle;
+
+  if (this && this->isInstanceError) {
+    this->isInstanceError = FALSE;
+    return this->errStr;
+  } else
+    return errStr;
 }
 
 
-DLLEXPORT int DLLCALL tjDestroy(tjhandle handle)
+DLLEXPORT char *tjGetErrorStr(void)
 {
-	getinstance(handle);
-	if(setjmp(this->jerr.setjmp_buffer)) return -1;
-	if(this->init&COMPRESS) jpeg_destroy_compress(cinfo);
-	if(this->init&DECOMPRESS) jpeg_destroy_decompress(dinfo);
-	free(this);
-	return 0;
+  return errStr;
+}
+
+
+DLLEXPORT int tjGetErrorCode(tjhandle handle)
+{
+  tjinstance *this = (tjinstance *)handle;
+
+  if (this && this->jerr.warning) return TJERR_WARNING;
+  else return TJERR_FATAL;
+}
+
+
+DLLEXPORT int tjDestroy(tjhandle handle)
+{
+  getinstance(handle);
+
+  if (setjmp(this->jerr.setjmp_buffer)) return -1;
+  if (this->init & COMPRESS) jpeg_destroy_compress(cinfo);
+  if (this->init & DECOMPRESS) jpeg_destroy_decompress(dinfo);
+  free(this);
+  return 0;
 }
 
 
@@ -564,15 +402,15 @@
    with turbojpeg.dll for compatibility reasons.  However, these functions
    can potentially be used for other purposes by different implementations. */
 
-DLLEXPORT void DLLCALL tjFree(unsigned char *buf)
+DLLEXPORT void tjFree(unsigned char *buf)
 {
-	if(buf) free(buf);
+  if (buf) free(buf);
 }
 
 
-DLLEXPORT unsigned char *DLLCALL tjAlloc(int bytes)
+DLLEXPORT unsigned char *tjAlloc(int bytes)
 {
-	return (unsigned char *)malloc(bytes);
+  return (unsigned char *)malloc(bytes);
 }
 
 
@@ -580,672 +418,651 @@
 
 static tjhandle _tjInitCompress(tjinstance *this)
 {
-	static unsigned char buffer[1];
-	unsigned char *buf=buffer;  unsigned long size=1;
+  static unsigned char buffer[1];
+  unsigned char *buf = buffer;
+  unsigned long size = 1;
 
-	/* This is also straight out of example.c */
-	this->cinfo.err=jpeg_std_error(&this->jerr.pub);
-	this->jerr.pub.error_exit=my_error_exit;
-	this->jerr.pub.output_message=my_output_message;
-	this->jerr.emit_message=this->jerr.pub.emit_message;
-	this->jerr.pub.emit_message=my_emit_message;
+  /* This is also straight out of example.txt */
+  this->cinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		if(this) free(this);
-		return NULL;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    if (this) free(this);
+    return NULL;
+  }
 
-	jpeg_create_compress(&this->cinfo);
-	/* Make an initial call so it will create the destination manager */
-	jpeg_mem_dest_tj(&this->cinfo, &buf, &size, 0);
+  jpeg_create_compress(&this->cinfo);
+  /* Make an initial call so it will create the destination manager */
+  jpeg_mem_dest_tj(&this->cinfo, &buf, &size, 0);
 
-	this->init|=COMPRESS;
-	return (tjhandle)this;
+  this->init |= COMPRESS;
+  return (tjhandle)this;
 }
 
-DLLEXPORT tjhandle DLLCALL tjInitCompress(void)
+DLLEXPORT tjhandle tjInitCompress(void)
 {
-	tjinstance *this=NULL;
-	if((this=(tjinstance *)malloc(sizeof(tjinstance)))==NULL)
-	{
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitCompress(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
-	return _tjInitCompress(this);
+  tjinstance *this = NULL;
+
+  if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitCompress(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
+  snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
+  return _tjInitCompress(this);
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjBufSize(int width, int height,
-	int jpegSubsamp)
+DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp)
 {
-	unsigned long retval=0;  int mcuw, mcuh, chromasf;
-	if(width<1 || height<1 || jpegSubsamp<0 || jpegSubsamp>=NUMSUBOPT)
-		_throw("tjBufSize(): Invalid argument");
+  unsigned long retval = 0;
+  int mcuw, mcuh, chromasf;
 
-	/* This allows for rare corner cases in which a JPEG image can actually be
-	   larger than the uncompressed input (we wouldn't mention it if it hadn't
-	   happened before.) */
-	mcuw=tjMCUWidth[jpegSubsamp];
-	mcuh=tjMCUHeight[jpegSubsamp];
-	chromasf=jpegSubsamp==TJSAMP_GRAY? 0: 4*64/(mcuw*mcuh);
-	retval=PAD(width, mcuw) * PAD(height, mcuh) * (2 + chromasf) + 2048;
+  if (width < 1 || height < 1 || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT)
+    _throwg("tjBufSize(): Invalid argument");
 
-	bailout:
-	return retval;
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  mcuw = tjMCUWidth[jpegSubsamp];
+  mcuh = tjMCUHeight[jpegSubsamp];
+  chromasf = jpegSubsamp == TJSAMP_GRAY ? 0 : 4 * 64 / (mcuw * mcuh);
+  retval = PAD(width, mcuw) * PAD(height, mcuh) * (2 + chromasf) + 2048;
+
+bailout:
+  return retval;
 }
 
-DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height)
+DLLEXPORT unsigned long TJBUFSIZE(int width, int height)
 {
-	unsigned long retval=0;
-	if(width<1 || height<1)
-		_throw("TJBUFSIZE(): Invalid argument");
+  unsigned long retval = 0;
 
-	/* This allows for rare corner cases in which a JPEG image can actually be
-	   larger than the uncompressed input (we wouldn't mention it if it hadn't
-	   happened before.) */
-	retval=PAD(width, 16) * PAD(height, 16) * 6 + 2048;
+  if (width < 1 || height < 1)
+    _throwg("TJBUFSIZE(): Invalid argument");
 
-	bailout:
-	return retval;
+  /* This allows for rare corner cases in which a JPEG image can actually be
+     larger than the uncompressed input (we wouldn't mention it if it hadn't
+     happened before.) */
+  retval = PAD(width, 16) * PAD(height, 16) * 6 + 2048;
+
+bailout:
+  return retval;
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
-	int subsamp)
+DLLEXPORT unsigned long tjBufSizeYUV2(int width, int pad, int height,
+                                      int subsamp)
 {
-	int retval=0, nc, i;
+  int retval = 0, nc, i;
 
-	if(subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjBufSizeYUV2(): Invalid argument");
+  if (subsamp < 0 || subsamp >= NUMSUBOPT)
+    _throwg("tjBufSizeYUV2(): Invalid argument");
 
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
-	for(i=0; i<nc; i++)
-	{
-		int pw=tjPlaneWidth(i, width, subsamp);
-		int stride=PAD(pw, pad);
-		int ph=tjPlaneHeight(i, height, subsamp);
-		if(pw<0 || ph<0) return -1;
-		else retval+=stride*ph;
-	}
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  for (i = 0; i < nc; i++) {
+    int pw = tjPlaneWidth(i, width, subsamp);
+    int stride = PAD(pw, pad);
+    int ph = tjPlaneHeight(i, height, subsamp);
 
-	bailout:
-	return retval;
+    if (pw < 0 || ph < 0) return -1;
+    else retval += stride * ph;
+  }
+
+bailout:
+  return retval;
 }
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
-	int subsamp)
+DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp)
 {
-	return tjBufSizeYUV2(width, 4, height, subsamp);
+  return tjBufSizeYUV2(width, 4, height, subsamp);
 }
 
-DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
-	int subsamp)
+DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int subsamp)
 {
-	return tjBufSizeYUV(width, height, subsamp);
+  return tjBufSizeYUV(width, height, subsamp);
 }
 
 
 DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp)
 {
-	int pw, nc, retval=0;
+  int pw, nc, retval = 0;
 
-	if(width<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
-		_throw("tjPlaneWidth(): Invalid argument");
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
-	if(componentID<0 || componentID>=nc)
-		_throw("tjPlaneWidth(): Invalid argument");
+  if (width < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    _throwg("tjPlaneWidth(): Invalid argument");
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    _throwg("tjPlaneWidth(): Invalid argument");
 
-	pw=PAD(width, tjMCUWidth[subsamp]/8);
-	if(componentID==0)
-		retval=pw;
-	else
-		retval=pw*8/tjMCUWidth[subsamp];
+  pw = PAD(width, tjMCUWidth[subsamp] / 8);
+  if (componentID == 0)
+    retval = pw;
+  else
+    retval = pw * 8 / tjMCUWidth[subsamp];
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
 DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp)
 {
-	int ph, nc, retval=0;
+  int ph, nc, retval = 0;
 
-	if(height<1 || subsamp<0 || subsamp>=TJ_NUMSAMP)
-		_throw("tjPlaneHeight(): Invalid argument");
-	nc=(subsamp==TJSAMP_GRAY? 1:3);
-	if(componentID<0 || componentID>=nc)
-		_throw("tjPlaneHeight(): Invalid argument");
+  if (height < 1 || subsamp < 0 || subsamp >= TJ_NUMSAMP)
+    _throwg("tjPlaneHeight(): Invalid argument");
+  nc = (subsamp == TJSAMP_GRAY ? 1 : 3);
+  if (componentID < 0 || componentID >= nc)
+    _throwg("tjPlaneHeight(): Invalid argument");
 
-	ph=PAD(height, tjMCUHeight[subsamp]/8);
-	if(componentID==0)
-		retval=ph;
-	else
-		retval=ph*8/tjMCUHeight[subsamp];
+  ph = PAD(height, tjMCUHeight[subsamp] / 8);
+  if (componentID == 0)
+    retval = ph;
+  else
+    retval = ph * 8 / tjMCUHeight[subsamp];
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
-DLLEXPORT unsigned long DLLCALL tjPlaneSizeYUV(int componentID, int width,
-	int stride, int height, int subsamp)
+DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
+                                       int height, int subsamp)
 {
-	unsigned long retval=0;
-	int pw, ph;
+  unsigned long retval = 0;
+  int pw, ph;
 
-	if(width<1 || height<1 || subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjPlaneSizeYUV(): Invalid argument");
+  if (width < 1 || height < 1 || subsamp < 0 || subsamp >= NUMSUBOPT)
+    _throwg("tjPlaneSizeYUV(): Invalid argument");
 
-	pw=tjPlaneWidth(componentID, width, subsamp);
-	ph=tjPlaneHeight(componentID, height, subsamp);
-	if(pw<0 || ph<0) return -1;
+  pw = tjPlaneWidth(componentID, width, subsamp);
+  ph = tjPlaneHeight(componentID, height, subsamp);
+  if (pw < 0 || ph < 0) return -1;
 
-	if(stride==0) stride=pw;
-	else stride=abs(stride);
+  if (stride == 0) stride = pw;
+  else stride = abs(stride);
 
-	retval=stride*(ph-1)+pw;
+  retval = stride * (ph - 1) + pw;
 
-	bailout:
-	return retval;
+bailout:
+  return retval;
 }
 
 
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
-	unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)
+DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          unsigned char **jpegBuf, unsigned long *jpegSize,
+                          int jpegSubsamp, int jpegQual, int flags)
 {
-	int i, retval=0, alloc=1;  JSAMPROW *row_pointer=NULL;
-	#ifndef JCS_EXTENSIONS
-	unsigned char *rgbBuf=NULL;
-	#endif
+  int i, retval = 0, alloc = 1;
+  JSAMPROW *row_pointer = NULL;
 
-	getcinstance(handle)
-	if((this->init&COMPRESS)==0)
-		_throw("tjCompress2(): Instance has not been initialized for compression");
+  getcinstance(handle)
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
+  if ((this->init & COMPRESS) == 0)
+    _throw("tjCompress2(): Instance has not been initialized for compression");
 
-	if(srcBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF || jpegBuf==NULL || jpegSize==NULL
-		|| jpegSubsamp<0 || jpegSubsamp>=NUMSUBOPT || jpegQual<0 || jpegQual>100)
-		_throw("tjCompress2(): Invalid argument");
+  if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF || jpegBuf == NULL ||
+      jpegSize == NULL || jpegSubsamp < 0 || jpegSubsamp >= NUMSUBOPT ||
+      jpegQual < 0 || jpegQual > 100)
+    _throw("tjCompress2(): Invalid argument");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
-	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK)
-	{
-		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
-		if(!rgbBuf) _throw("tjCompress2(): Memory allocation failure");
-		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
-		pitch=width*RGB_PIXELSIZE;
-	}
-	#endif
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * height)) == NULL)
+    _throw("tjCompress2(): Memory allocation failure");
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*height))==NULL)
-		_throw("tjCompress2(): Memory allocation failure");
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_NOREALLOC) {
+    alloc = 0;  *jpegSize = tjBufSize(width, height, jpegSubsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+  if (setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual, flags) == -1)
+    return -1;
 
-	if(flags&TJFLAG_NOREALLOC)
-	{
-		alloc=0;  *jpegSize=tjBufSize(width, height, jpegSubsamp);
-	}
-	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
-	if(setCompDefaults(cinfo, pixelFormat, jpegSubsamp, jpegQual, flags)==-1)
-		return -1;
+  jpeg_start_compress(cinfo, TRUE);
+  for (i = 0; i < height; i++) {
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * pitch];
+    else
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
+  }
+  while (cinfo->next_scanline < cinfo->image_height)
+    jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
+                         cinfo->image_height - cinfo->next_scanline);
+  jpeg_finish_compress(cinfo);
 
-	jpeg_start_compress(cinfo, TRUE);
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
-	}
-	while(cinfo->next_scanline<cinfo->image_height)
-	{
-		jpeg_write_scanlines(cinfo, &row_pointer[cinfo->next_scanline],
-			cinfo->image_height-cinfo->next_scanline);
-	}
-	jpeg_finish_compress(cinfo);
-
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	#ifndef JCS_EXTENSIONS
-	if(rgbBuf) free(rgbBuf);
-	#endif
-	if(row_pointer) free(row_pointer);
-	if(this->jerr.warning) retval=-1;
-	return retval;
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (row_pointer) free(row_pointer);
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelSize, unsigned char *jpegBuf,
-	unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags)
+DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
+                         int pitch, int height, int pixelSize,
+                         unsigned char *jpegBuf, unsigned long *jpegSize,
+                         int jpegSubsamp, int jpegQual, int flags)
 {
-	int retval=0;  unsigned long size;
-	if(flags&TJ_YUV)
-	{
-		size=tjBufSizeYUV(width, height, jpegSubsamp);
-		retval=tjEncodeYUV2(handle, srcBuf, width, pitch, height,
-			getPixelFormat(pixelSize, flags), jpegBuf, jpegSubsamp, flags);
-	}
-	else
-	{
-		retval=tjCompress2(handle, srcBuf, width, pitch, height,
-			getPixelFormat(pixelSize, flags), &jpegBuf, &size, jpegSubsamp, jpegQual,
-			flags|TJFLAG_NOREALLOC);
-	}
-	*jpegSize=size;
-	return retval;
+  int retval = 0;
+  unsigned long size;
+
+  if (flags & TJ_YUV) {
+    size = tjBufSizeYUV(width, height, jpegSubsamp);
+    retval = tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+                          getPixelFormat(pixelSize, flags), jpegBuf,
+                          jpegSubsamp, flags);
+  } else {
+    retval = tjCompress2(handle, srcBuf, width, pitch, height,
+                         getPixelFormat(pixelSize, flags), &jpegBuf, &size,
+                         jpegSubsamp, jpegQual, flags | TJFLAG_NOREALLOC);
+  }
+  *jpegSize = size;
+  return retval;
 }
 
 
-DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
-	const unsigned char *srcBuf, int width, int pitch, int height,
-	int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
-	int flags)
+DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pitch, int height,
+                                int pixelFormat, unsigned char **dstPlanes,
+                                int *strides, int subsamp, int flags)
 {
-	int i, retval=0;  JSAMPROW *row_pointer=NULL;
-	JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
-	JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
-	JSAMPROW *outbuf[MAX_COMPONENTS];
-	int row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
-	JSAMPLE *ptr;
-	jpeg_component_info *compptr;
-	#ifndef JCS_EXTENSIONS
-	unsigned char *rgbBuf=NULL;
-	#endif
+  JSAMPROW *row_pointer = NULL;
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS], *_tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *tmpbuf2[MAX_COMPONENTS];
+  JSAMPROW *outbuf[MAX_COMPONENTS];
+  int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
 
-	getcinstance(handle);
+  getcinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;
-		tmpbuf2[i]=NULL;  _tmpbuf2[i]=NULL;  outbuf[i]=NULL;
-	}
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;
+    tmpbuf2[i] = NULL;  _tmpbuf2[i] = NULL;  outbuf[i] = NULL;
+  }
 
-	if((this->init&COMPRESS)==0)
-		_throw("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
+  if ((this->init & COMPRESS) == 0)
+    _throw("tjEncodeYUVPlanes(): Instance has not been initialized for compression");
 
-	if(srcBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF || !dstPlanes || !dstPlanes[0] || subsamp<0
-		|| subsamp>=NUMSUBOPT)
-		_throw("tjEncodeYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-		_throw("tjEncodeYUVPlanes(): Invalid argument");
+  if (srcBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF || !dstPlanes ||
+      !dstPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT)
+    _throw("tjEncodeYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    _throw("tjEncodeYUVPlanes(): Invalid argument");
 
-	if(pixelFormat==TJPF_CMYK)
-		_throw("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
+  if (pixelFormat == TJPF_CMYK)
+    _throw("tjEncodeYUVPlanes(): Cannot generate YUV images from CMYK pixels");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
 
-	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK)
-	{
-		rgbBuf=(unsigned char *)malloc(width*height*RGB_PIXELSIZE);
-		if(!rgbBuf) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		srcBuf=toRGB(srcBuf, width, pitch, height, pixelFormat, rgbBuf);
-		pitch=width*RGB_PIXELSIZE;
-	}
-	#endif
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags) == -1) return -1;
 
-	if(setCompDefaults(cinfo, pixelFormat, subsamp, -1, flags)==-1) return -1;
+  /* Execute only the parts of jpeg_start_compress() that we need.  If we
+     were to call the whole jpeg_start_compress() function, then it would try
+     to write the file headers, which could overflow the output buffer if the
+     YUV image were very small. */
+  if (cinfo->global_state != CSTATE_START)
+    _throw("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
+  jinit_c_master_control(cinfo, FALSE);
+  jinit_color_converter(cinfo);
+  jinit_downsampler(cinfo);
+  (*cinfo->cconvert->start_pass) (cinfo);
 
-	/* Execute only the parts of jpeg_start_compress() that we need.  If we
-	   were to call the whole jpeg_start_compress() function, then it would try
-	   to write the file headers, which could overflow the output buffer if the
-	   YUV image were very small. */
-	if(cinfo->global_state!=CSTATE_START)
-		_throw("tjEncodeYUVPlanes(): libjpeg API is in the wrong state");
-	(*cinfo->err->reset_error_mgr)((j_common_ptr)cinfo);
-	jinit_c_master_control(cinfo, FALSE);
-	jinit_color_converter(cinfo);
-	jinit_downsampler(cinfo);
-	(*cinfo->cconvert->start_pass)(cinfo);
+  pw0 = PAD(width, cinfo->max_h_samp_factor);
+  ph0 = PAD(height, cinfo->max_v_samp_factor);
 
-	pw0=PAD(width, cinfo->max_h_samp_factor);
-	ph0=PAD(height, cinfo->max_v_samp_factor);
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+  for (i = 0; i < height; i++) {
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = (JSAMPROW)&srcBuf[(height - i - 1) * pitch];
+    else
+      row_pointer[i] = (JSAMPROW)&srcBuf[i * pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
-		_throw("tjEncodeYUVPlanes(): Memory allocation failure");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=(JSAMPROW)&srcBuf[(height-i-1)*pitch];
-		else row_pointer[i]=(JSAMPROW)&srcBuf[i*pitch];
-	}
-	if(height<ph0)
-		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
+  for (i = 0; i < cinfo->num_components; i++) {
+    compptr = &cinfo->comp_info[i];
+    _tmpbuf[i] = (JSAMPLE *)malloc(
+      PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
+          compptr->h_samp_factor, 32) *
+      cinfo->max_v_samp_factor + 32);
+    if (!_tmpbuf[i])
+      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    tmpbuf[i] =
+      (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo->max_v_samp_factor);
+    if (!tmpbuf[i])
+      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    for (row = 0; row < cinfo->max_v_samp_factor; row++) {
+      unsigned char *_tmpbuf_aligned =
+        (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
 
-	for(i=0; i<cinfo->num_components; i++)
-	{
-		compptr=&cinfo->comp_info[i];
-		_tmpbuf[i]=(JSAMPLE *)malloc(
-			PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
-				/compptr->h_samp_factor, 16) * cinfo->max_v_samp_factor + 16);
-		if(!_tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*cinfo->max_v_samp_factor);
-		if(!tmpbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		for(row=0; row<cinfo->max_v_samp_factor; row++)
-		{
-			unsigned char *_tmpbuf_aligned=
-				(unsigned char *)PAD((size_t)_tmpbuf[i], 16);
-			tmpbuf[i][row]=&_tmpbuf_aligned[
-				PAD((compptr->width_in_blocks*cinfo->max_h_samp_factor*DCTSIZE)
-					/compptr->h_samp_factor, 16) * row];
-		}
-		_tmpbuf2[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
-			* compptr->v_samp_factor + 16);
-		if(!_tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		tmpbuf2[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
-		if(!tmpbuf2[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		for(row=0; row<compptr->v_samp_factor; row++)
-		{
-			unsigned char *_tmpbuf2_aligned=
-				(unsigned char *)PAD((size_t)_tmpbuf2[i], 16);
-			tmpbuf2[i][row]=&_tmpbuf2_aligned[
-				PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
-		}
-		pw[i]=pw0*compptr->h_samp_factor/cinfo->max_h_samp_factor;
-		ph[i]=ph0*compptr->v_samp_factor/cinfo->max_v_samp_factor;
-		outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
-		if(!outbuf[i]) _throw("tjEncodeYUVPlanes(): Memory allocation failure");
-		ptr=dstPlanes[i];
-		for(row=0; row<ph[i]; row++)
-		{
-			outbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      tmpbuf[i][row] = &_tmpbuf_aligned[
+        PAD((compptr->width_in_blocks * cinfo->max_h_samp_factor * DCTSIZE) /
+            compptr->h_samp_factor, 32) * row];
+    }
+    _tmpbuf2[i] =
+      (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
+                        compptr->v_samp_factor + 32);
+    if (!_tmpbuf2[i])
+      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    tmpbuf2[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
+    if (!tmpbuf2[i])
+      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    for (row = 0; row < compptr->v_samp_factor; row++) {
+      unsigned char *_tmpbuf2_aligned =
+        (unsigned char *)PAD((size_t)_tmpbuf2[i], 32);
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+      tmpbuf2[i][row] =
+        &_tmpbuf2_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / cinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / cinfo->max_v_samp_factor;
+    outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
+    if (!outbuf[i])
+      _throw("tjEncodeYUVPlanes(): Memory allocation failure");
+    ptr = dstPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
 
-	for(row=0; row<ph0; row+=cinfo->max_v_samp_factor)
-	{
-		(*cinfo->cconvert->color_convert)(cinfo, &row_pointer[row], tmpbuf, 0,
-			cinfo->max_v_samp_factor);
-		(cinfo->downsample->downsample)(cinfo, tmpbuf, 0, tmpbuf2, 0);
-		for(i=0, compptr=cinfo->comp_info; i<cinfo->num_components; i++, compptr++)
-			jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
-				row*compptr->v_samp_factor/cinfo->max_v_samp_factor,
-				compptr->v_samp_factor, pw[i]);
-	}
-	cinfo->next_scanline+=height;
-	jpeg_abort_compress(cinfo);
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	#ifndef JCS_EXTENSIONS
-	if(rgbBuf) free(rgbBuf);
-	#endif
-	if(row_pointer) free(row_pointer);
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
-		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
-		if(tmpbuf2[i]!=NULL) free(tmpbuf2[i]);
-		if(_tmpbuf2[i]!=NULL) free(_tmpbuf2[i]);
-		if(outbuf[i]!=NULL) free(outbuf[i]);
-	}
-	if(this->jerr.warning) retval=-1;
-	return retval;
+  for (row = 0; row < ph0; row += cinfo->max_v_samp_factor) {
+    (*cinfo->cconvert->color_convert) (cinfo, &row_pointer[row], tmpbuf, 0,
+                                       cinfo->max_v_samp_factor);
+    (cinfo->downsample->downsample) (cinfo, tmpbuf, 0, tmpbuf2, 0);
+    for (i = 0, compptr = cinfo->comp_info; i < cinfo->num_components;
+         i++, compptr++)
+      jcopy_sample_rows(tmpbuf2[i], 0, outbuf[i],
+        row * compptr->v_samp_factor / cinfo->max_v_samp_factor,
+        compptr->v_samp_factor, pw[i]);
+  }
+  cinfo->next_scanline += height;
+  jpeg_abort_compress(cinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (row_pointer) free(row_pointer);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    if (tmpbuf[i] != NULL) free(tmpbuf[i]);
+    if (_tmpbuf[i] != NULL) free(_tmpbuf[i]);
+    if (tmpbuf2[i] != NULL) free(tmpbuf2[i]);
+    if (_tmpbuf2[i] != NULL) free(_tmpbuf2[i]);
+    if (outbuf[i] != NULL) free(outbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
-	const unsigned char *srcBuf, int width, int pitch, int height,
-	int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags)
+DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int pad, int subsamp,
+                           int flags)
 {
-	unsigned char *dstPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+  tjinstance *this = (tjinstance *)handle;
 
-	if(width<=0 || height<=0 || dstBuf==NULL || pad<0 || !isPow2(pad)
-		|| subsamp<0 || subsamp>=NUMSUBOPT)
-		_throw("tjEncodeYUV3(): Invalid argument");
+  if (!this) _throwg("tjEncodeYUV3(): Invalid handle");
+  this->isInstanceError = FALSE;
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	dstPlanes[0]=dstBuf;
-	strides[0]=PAD(pw0, pad);
-	if(subsamp==TJSAMP_GRAY)
-	{
-		strides[1]=strides[2]=0;
-		dstPlanes[1]=dstPlanes[2]=NULL;
-	}
-	else
-	{
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
-		strides[1]=strides[2]=PAD(pw1, pad);
-		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;
-		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;
-	}
+  if (width <= 0 || height <= 0 || dstBuf == NULL || pad < 0 || !isPow2(pad) ||
+      subsamp < 0 || subsamp >= NUMSUBOPT)
+    _throw("tjEncodeYUV3(): Invalid argument");
 
-	return tjEncodeYUVPlanes(handle, srcBuf, width, pitch, height, pixelFormat,
-		dstPlanes, strides, subsamp, flags);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, pad);
+  if (subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
+  } else {
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-	bailout:
-	return retval;
+    strides[1] = strides[2] = PAD(pw1, pad);
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
+
+  return tjEncodeYUVPlanes(handle, srcBuf, width, pitch, height, pixelFormat,
+                           dstPlanes, strides, subsamp, flags);
+
+bailout:
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelFormat, unsigned char *dstBuf,
-	int subsamp, int flags)
+DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
+                           int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int subsamp, int flags)
 {
-	return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
-		dstBuf, 4, subsamp, flags);
+  return tjEncodeYUV3(handle, srcBuf, width, pitch, height, pixelFormat,
+                      dstBuf, 4, subsamp, flags);
 }
 
-DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle, unsigned char *srcBuf,
-	int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
-	int subsamp, int flags)
+DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
+                          int pitch, int height, int pixelSize,
+                          unsigned char *dstBuf, int subsamp, int flags)
 {
-	return tjEncodeYUV2(handle, srcBuf, width, pitch, height,
-		getPixelFormat(pixelSize, flags), dstBuf, subsamp, flags);
+  return tjEncodeYUV2(handle, srcBuf, width, pitch, height,
+                      getPixelFormat(pixelSize, flags), dstBuf, subsamp,
+                      flags);
 }
 
 
-DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
-	const unsigned char **srcPlanes, int width, const int *strides, int height,
-	int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
-	int flags)
+DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
+                                      const unsigned char **srcPlanes,
+                                      int width, const int *strides,
+                                      int height, int subsamp,
+                                      unsigned char **jpegBuf,
+                                      unsigned long *jpegSize, int jpegQual,
+                                      int flags)
 {
-	int i, row, retval=0, alloc=1;  JSAMPROW *inbuf[MAX_COMPONENTS];
-	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
-		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
-	JSAMPLE *_tmpbuf=NULL, *ptr;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
+  int i, row, retval = 0, alloc = 1;
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf = NULL, *ptr;
+  JSAMPROW *inbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
 
-	getcinstance(handle)
+  getcinstance(handle)
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		tmpbuf[i]=NULL;  inbuf[i]=NULL;
-	}
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
 
-	if((this->init&COMPRESS)==0)
-		_throw("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
+  if ((this->init & COMPRESS) == 0)
+    _throw("tjCompressFromYUVPlanes(): Instance has not been initialized for compression");
 
-	if(!srcPlanes || !srcPlanes[0] || width<=0 || height<=0 || subsamp<0
-		|| subsamp>=NUMSUBOPT || jpegBuf==NULL || jpegSize==NULL || jpegQual<0
-		|| jpegQual>100)
-		_throw("tjCompressFromYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-		_throw("tjCompressFromYUVPlanes(): Invalid argument");
+  if (!srcPlanes || !srcPlanes[0] || width <= 0 || height <= 0 ||
+      subsamp < 0 || subsamp >= NUMSUBOPT || jpegBuf == NULL ||
+      jpegSize == NULL || jpegQual < 0 || jpegQual > 100)
+    _throw("tjCompressFromYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    _throw("tjCompressFromYUVPlanes(): Invalid argument");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	cinfo->image_width=width;
-	cinfo->image_height=height;
+  cinfo->image_width = width;
+  cinfo->image_height = height;
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(flags&TJFLAG_NOREALLOC)
-	{
-		alloc=0;  *jpegSize=tjBufSize(width, height, subsamp);
-	}
-	jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
-	if(setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags)==-1)
-		return -1;
-	cinfo->raw_data_in=TRUE;
+  if (flags & TJFLAG_NOREALLOC) {
+    alloc = 0;  *jpegSize = tjBufSize(width, height, subsamp);
+  }
+  jpeg_mem_dest_tj(cinfo, jpegBuf, jpegSize, alloc);
+  if (setCompDefaults(cinfo, TJPF_RGB, subsamp, jpegQual, flags) == -1)
+    return -1;
+  cinfo->raw_data_in = TRUE;
 
-	jpeg_start_compress(cinfo, TRUE);
-	for(i=0; i<cinfo->num_components; i++)
-	{
-		jpeg_component_info *compptr=&cinfo->comp_info[i];
-		int ih;
-		iw[i]=compptr->width_in_blocks*DCTSIZE;
-		ih=compptr->height_in_blocks*DCTSIZE;
-		pw[i]=PAD(cinfo->image_width, cinfo->max_h_samp_factor)
-			*compptr->h_samp_factor/cinfo->max_h_samp_factor;
-		ph[i]=PAD(cinfo->image_height, cinfo->max_v_samp_factor)
-			*compptr->v_samp_factor/cinfo->max_v_samp_factor;
-		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*DCTSIZE;
-		tmpbufsize+=iw[i]*th[i];
-		if((inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
-			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-		ptr=(JSAMPLE *)srcPlanes[i];
-		for(row=0; row<ph[i]; row++)
-		{
-			inbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
-	if(usetmpbuf)
-	{
-		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-		ptr=_tmpbuf;
-		for(i=0; i<cinfo->num_components; i++)
-		{
-			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjCompressFromYUVPlanes(): Memory allocation failure");
-			for(row=0; row<th[i]; row++)
-			{
-				tmpbuf[i][row]=ptr;
-				ptr+=iw[i];
-			}
-		}
-	}
+  jpeg_start_compress(cinfo, TRUE);
+  for (i = 0; i < cinfo->num_components; i++) {
+    jpeg_component_info *compptr = &cinfo->comp_info[i];
+    int ih;
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    iw[i] = compptr->width_in_blocks * DCTSIZE;
+    ih = compptr->height_in_blocks * DCTSIZE;
+    pw[i] = PAD(cinfo->image_width, cinfo->max_h_samp_factor) *
+            compptr->h_samp_factor / cinfo->max_h_samp_factor;
+    ph[i] = PAD(cinfo->image_height, cinfo->max_v_samp_factor) *
+            compptr->v_samp_factor / cinfo->max_v_samp_factor;
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * DCTSIZE;
+    tmpbufsize += iw[i] * th[i];
+    if ((inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+  if (usetmpbuf) {
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+    ptr = _tmpbuf;
+    for (i = 0; i < cinfo->num_components; i++) {
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        _throw("tjCompressFromYUVPlanes(): Memory allocation failure");
+      for (row = 0; row < th[i]; row++) {
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
 
-	for(row=0; row<(int)cinfo->image_height;
-		row+=cinfo->max_v_samp_factor*DCTSIZE)
-	{
-		JSAMPARRAY yuvptr[MAX_COMPONENTS];
-		int crow[MAX_COMPONENTS];
-		for(i=0; i<cinfo->num_components; i++)
-		{
-			jpeg_component_info *compptr=&cinfo->comp_info[i];
-			crow[i]=row*compptr->v_samp_factor/cinfo->max_v_samp_factor;
-			if(usetmpbuf)
-			{
-				int j, k;
-				for(j=0; j<min(th[i], ph[i]-crow[i]); j++)
-				{
-					memcpy(tmpbuf[i][j], inbuf[i][crow[i]+j], pw[i]);
-					/* Duplicate last sample in row to fill out MCU */
-					for(k=pw[i]; k<iw[i]; k++) tmpbuf[i][j][k]=tmpbuf[i][j][pw[i]-1];
-				}
-				/* Duplicate last row to fill out MCU */
-				for(j=ph[i]-crow[i]; j<th[i]; j++)
-					memcpy(tmpbuf[i][j], tmpbuf[i][ph[i]-crow[i]-1], iw[i]);
-				yuvptr[i]=tmpbuf[i];
-			}
-			else
-				yuvptr[i]=&inbuf[i][crow[i]];
-		}
-		jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor*DCTSIZE);
-	}
-	jpeg_finish_compress(cinfo);
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		if(tmpbuf[i]) free(tmpbuf[i]);
-		if(inbuf[i]) free(inbuf[i]);
-	}
-	if(_tmpbuf) free(_tmpbuf);
-	if(this->jerr.warning) retval=-1;
-	return retval;
+  for (row = 0; row < (int)cinfo->image_height;
+       row += cinfo->max_v_samp_factor * DCTSIZE) {
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
+
+    for (i = 0; i < cinfo->num_components; i++) {
+      jpeg_component_info *compptr = &cinfo->comp_info[i];
+
+      crow[i] = row * compptr->v_samp_factor / cinfo->max_v_samp_factor;
+      if (usetmpbuf) {
+        int j, k;
+
+        for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
+          memcpy(tmpbuf[i][j], inbuf[i][crow[i] + j], pw[i]);
+          /* Duplicate last sample in row to fill out MCU */
+          for (k = pw[i]; k < iw[i]; k++)
+            tmpbuf[i][j][k] = tmpbuf[i][j][pw[i] - 1];
+        }
+        /* Duplicate last row to fill out MCU */
+        for (j = ph[i] - crow[i]; j < th[i]; j++)
+          memcpy(tmpbuf[i][j], tmpbuf[i][ph[i] - crow[i] - 1], iw[i]);
+        yuvptr[i] = tmpbuf[i];
+      } else
+        yuvptr[i] = &inbuf[i][crow[i]];
+    }
+    jpeg_write_raw_data(cinfo, yuvptr, cinfo->max_v_samp_factor * DCTSIZE);
+  }
+  jpeg_finish_compress(cinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    if (tmpbuf[i]) free(tmpbuf[i]);
+    if (inbuf[i]) free(inbuf[i]);
+  }
+  if (_tmpbuf) free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
-	const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
-	unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags)
+DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pad, int height, int subsamp,
+                                unsigned char **jpegBuf,
+                                unsigned long *jpegSize, int jpegQual,
+                                int flags)
 {
-	const unsigned char *srcPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+  tjinstance *this = (tjinstance *)handle;
 
-	if(srcBuf==NULL || width<=0 || pad<1 || height<=0 || subsamp<0
-		|| subsamp>=NUMSUBOPT)
-		_throw("tjCompressFromYUV(): Invalid argument");
+  if (!this) _throwg("tjCompressFromYUV(): Invalid handle");
+  this->isInstanceError = FALSE;
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	srcPlanes[0]=srcBuf;
-	strides[0]=PAD(pw0, pad);
-	if(subsamp==TJSAMP_GRAY)
-	{
-		strides[1]=strides[2]=0;
-		srcPlanes[1]=srcPlanes[2]=NULL;
-	}
-	else
-	{
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
-		strides[1]=strides[2]=PAD(pw1, pad);
-		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;
-		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;
-	}
+  if (srcBuf == NULL || width <= 0 || pad < 1 || height <= 0 || subsamp < 0 ||
+      subsamp >= NUMSUBOPT)
+    _throw("tjCompressFromYUV(): Invalid argument");
 
-	return tjCompressFromYUVPlanes(handle, srcPlanes, width, strides, height,
-		subsamp, jpegBuf, jpegSize, jpegQual, flags);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, pad);
+  if (subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
+  } else {
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-	bailout:
-	return retval;
+    strides[1] = strides[2] = PAD(pw1, pad);
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
+
+  return tjCompressFromYUVPlanes(handle, srcPlanes, width, strides, height,
+                                 subsamp, jpegBuf, jpegSize, jpegQual, flags);
+
+bailout:
+  return retval;
 }
 
 
@@ -1253,923 +1070,1021 @@
 
 static tjhandle _tjInitDecompress(tjinstance *this)
 {
-	static unsigned char buffer[1];
+  static unsigned char buffer[1];
 
-	/* This is also straight out of example.c */
-	this->dinfo.err=jpeg_std_error(&this->jerr.pub);
-	this->jerr.pub.error_exit=my_error_exit;
-	this->jerr.pub.output_message=my_output_message;
-	this->jerr.emit_message=this->jerr.pub.emit_message;
-	this->jerr.pub.emit_message=my_emit_message;
+  /* This is also straight out of example.txt */
+  this->dinfo.err = jpeg_std_error(&this->jerr.pub);
+  this->jerr.pub.error_exit = my_error_exit;
+  this->jerr.pub.output_message = my_output_message;
+  this->jerr.emit_message = this->jerr.pub.emit_message;
+  this->jerr.pub.emit_message = my_emit_message;
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		if(this) free(this);
-		return NULL;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    if (this) free(this);
+    return NULL;
+  }
 
-	jpeg_create_decompress(&this->dinfo);
-	/* Make an initial call so it will create the source manager */
-	jpeg_mem_src_tj(&this->dinfo, buffer, 1);
+  jpeg_create_decompress(&this->dinfo);
+  /* Make an initial call so it will create the source manager */
+  jpeg_mem_src_tj(&this->dinfo, buffer, 1);
 
-	this->init|=DECOMPRESS;
-	return (tjhandle)this;
+  this->init |= DECOMPRESS;
+  return (tjhandle)this;
 }
 
-DLLEXPORT tjhandle DLLCALL tjInitDecompress(void)
+DLLEXPORT tjhandle tjInitDecompress(void)
 {
-	tjinstance *this;
-	if((this=(tjinstance *)malloc(sizeof(tjinstance)))==NULL)
-	{
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitDecompress(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
-	return _tjInitDecompress(this);
+  tjinstance *this;
+
+  if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitDecompress(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
+  snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
+  return _tjInitDecompress(this);
 }
 
 
-DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
-	const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
-	int *height, int *jpegSubsamp, int *jpegColorspace)
+DLLEXPORT int tjDecompressHeader3(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp,
+                                  int *jpegColorspace)
 {
-	int retval=0;
+  int retval = 0;
 
-	getdinstance(handle);
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
+  getdinstance(handle);
+  if ((this->init & DECOMPRESS) == 0)
+    _throw("tjDecompressHeader3(): Instance has not been initialized for decompression");
 
-	if(jpegBuf==NULL || jpegSize<=0 || width==NULL || height==NULL
-		|| jpegSubsamp==NULL || jpegColorspace==NULL)
-		_throw("tjDecompressHeader3(): Invalid argument");
+  if (jpegBuf == NULL || jpegSize <= 0 || width == NULL || height == NULL ||
+      jpegSubsamp == NULL || jpegColorspace == NULL)
+    _throw("tjDecompressHeader3(): Invalid argument");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		return -1;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    return -1;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
 
-	*width=dinfo->image_width;
-	*height=dinfo->image_height;
-	*jpegSubsamp=getSubsamp(dinfo);
-	switch(dinfo->jpeg_color_space)
-	{
-		case JCS_GRAYSCALE:  *jpegColorspace=TJCS_GRAY;  break;
-		case JCS_RGB:        *jpegColorspace=TJCS_RGB;  break;
-		case JCS_YCbCr:      *jpegColorspace=TJCS_YCbCr;  break;
-		case JCS_CMYK:       *jpegColorspace=TJCS_CMYK;  break;
-		case JCS_YCCK:       *jpegColorspace=TJCS_YCCK;  break;
-		default:             *jpegColorspace=-1;  break;
-	}
+  *width = dinfo->image_width;
+  *height = dinfo->image_height;
+  *jpegSubsamp = getSubsamp(dinfo);
+  switch (dinfo->jpeg_color_space) {
+  case JCS_GRAYSCALE:  *jpegColorspace = TJCS_GRAY;  break;
+  case JCS_RGB:        *jpegColorspace = TJCS_RGB;  break;
+  case JCS_YCbCr:      *jpegColorspace = TJCS_YCbCr;  break;
+  case JCS_CMYK:       *jpegColorspace = TJCS_CMYK;  break;
+  case JCS_YCCK:       *jpegColorspace = TJCS_YCCK;  break;
+  default:             *jpegColorspace = -1;  break;
+  }
 
-	jpeg_abort_decompress(dinfo);
+  jpeg_abort_decompress(dinfo);
 
-	if(*jpegSubsamp<0)
-		_throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
-	if(*jpegColorspace<0)
-		_throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
-	if(*width<1 || *height<1)
-		_throw("tjDecompressHeader3(): Invalid data returned in header");
+  if (*jpegSubsamp < 0)
+    _throw("tjDecompressHeader3(): Could not determine subsampling type for JPEG image");
+  if (*jpegColorspace < 0)
+    _throw("tjDecompressHeader3(): Could not determine colorspace of JPEG image");
+  if (*width < 1 || *height < 1)
+    _throw("tjDecompressHeader3(): Invalid data returned in header");
 
-	bailout:
-	if(this->jerr.warning) retval=-1;
-	return retval;
+bailout:
+  if (this->jerr.warning) retval = -1;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-	int *jpegSubsamp)
+DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp)
 {
-	int jpegColorspace;
-	return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
-		jpegSubsamp, &jpegColorspace);
+  int jpegColorspace;
+
+  return tjDecompressHeader3(handle, jpegBuf, jpegSize, width, height,
+                             jpegSubsamp, &jpegColorspace);
 }
 
-DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height)
+DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
+                                 unsigned long jpegSize, int *width,
+                                 int *height)
 {
-	int jpegSubsamp;
-	return tjDecompressHeader2(handle, jpegBuf, jpegSize, width, height,
-		&jpegSubsamp);
+  int jpegSubsamp;
+
+  return tjDecompressHeader2(handle, jpegBuf, jpegSize, width, height,
+                             &jpegSubsamp);
 }
 
 
-DLLEXPORT tjscalingfactor* DLLCALL tjGetScalingFactors(int *numscalingfactors)
+DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numscalingfactors)
 {
-	if(numscalingfactors==NULL)
-	{
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjGetScalingFactors(): Invalid argument");
-		return NULL;
-	}
+  if (numscalingfactors == NULL) {
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjGetScalingFactors(): Invalid argument");
+    return NULL;
+  }
 
-	*numscalingfactors=NUMSF;
-	return (tjscalingfactor *)sf;
+  *numscalingfactors = NUMSF;
+  return (tjscalingfactor *)sf;
 }
 
 
-DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
-	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-	int width, int pitch, int height, int pixelFormat, int flags)
+DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags)
 {
-	int i, retval=0;  JSAMPROW *row_pointer=NULL;
-	int jpegwidth, jpegheight, scaledw, scaledh;
-	#ifndef JCS_EXTENSIONS
-	unsigned char *rgbBuf=NULL;
-	unsigned char *_dstBuf=NULL;  int _pitch=0;
-	#endif
+  JSAMPROW *row_pointer = NULL;
+  int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
 
-	getdinstance(handle);
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompress2(): Instance has not been initialized for decompression");
+  getdinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
+  if ((this->init & DECOMPRESS) == 0)
+    _throw("tjDecompress2(): Instance has not been initialized for decompression");
 
-	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL || width<0 || pitch<0
-		|| height<0 || pixelFormat<0 || pixelFormat>=TJ_NUMPF)
-		_throw("tjDecompress2(): Invalid argument");
+  if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
+      pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    _throw("tjDecompress2(): Invalid argument");
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
-	if(setDecompDefaults(dinfo, pixelFormat, flags)==-1)
-	{
-		retval=-1;  goto bailout;
-	}
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  this->dinfo.out_color_space = pf2cs[pixelFormat];
+  if (flags & TJFLAG_FASTDCT) this->dinfo.dct_method = JDCT_FASTEST;
+  if (flags & TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling = FALSE;
 
-	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    _throw("tjDecompress2(): Could not scale down to desired image dimensions");
+  width = scaledw;  height = scaledh;
+  dinfo->scale_num = sf[i].num;
+  dinfo->scale_denom = sf[i].denom;
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
-	for(i=0; i<NUMSF; i++)
-	{
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompress2(): Could not scale down to desired image dimensions");
-	width=scaledw;  height=scaledh;
-	dinfo->scale_num=sf[i].num;
-	dinfo->scale_denom=sf[i].denom;
+  jpeg_start_decompress(dinfo);
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
 
-	jpeg_start_decompress(dinfo);
-	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+  if ((row_pointer =
+       (JSAMPROW *)malloc(sizeof(JSAMPROW) * dinfo->output_height)) == NULL)
+    _throw("tjDecompress2(): Memory allocation failure");
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+  for (i = 0; i < (int)dinfo->output_height; i++) {
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = &dstBuf[(dinfo->output_height - i - 1) * pitch];
+    else
+      row_pointer[i] = &dstBuf[i * pitch];
+  }
+  while (dinfo->output_scanline < dinfo->output_height)
+    jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
+                        dinfo->output_height - dinfo->output_scanline);
+  jpeg_finish_decompress(dinfo);
 
-	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK &&
-		(RGB_RED!=tjRedOffset[pixelFormat] ||
-			RGB_GREEN!=tjGreenOffset[pixelFormat] ||
-			RGB_BLUE!=tjBlueOffset[pixelFormat] ||
-			RGB_PIXELSIZE!=tjPixelSize[pixelFormat]))
-	{
-		rgbBuf=(unsigned char *)malloc(width*height*3);
-		if(!rgbBuf) _throw("tjDecompress2(): Memory allocation failure");
-		_pitch=pitch;  pitch=width*3;
-		_dstBuf=dstBuf;  dstBuf=rgbBuf;
-	}
-	#endif
-
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)
-		*dinfo->output_height))==NULL)
-		_throw("tjDecompress2(): Memory allocation failure");
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
-	for(i=0; i<(int)dinfo->output_height; i++)
-	{
-		if(flags&TJFLAG_BOTTOMUP)
-			row_pointer[i]=&dstBuf[(dinfo->output_height-i-1)*pitch];
-		else row_pointer[i]=&dstBuf[i*pitch];
-	}
-	while(dinfo->output_scanline<dinfo->output_height)
-	{
-		jpeg_read_scanlines(dinfo, &row_pointer[dinfo->output_scanline],
-			dinfo->output_height-dinfo->output_scanline);
-	}
-	jpeg_finish_decompress(dinfo);
-
-	#ifndef JCS_EXTENSIONS
-	fromRGB(rgbBuf, _dstBuf, width, _pitch, height, pixelFormat);
-	#endif
-
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	#ifndef JCS_EXTENSIONS
-	if(rgbBuf) free(rgbBuf);
-	#endif
-	if(row_pointer) free(row_pointer);
-	if(this->jerr.warning) retval=-1;
-	return retval;
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (row_pointer) free(row_pointer);
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecompress(tjhandle handle, unsigned char *jpegBuf,
-	unsigned long jpegSize, unsigned char *dstBuf, int width, int pitch,
-	int height, int pixelSize, int flags)
+DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
+                           unsigned long jpegSize, unsigned char *dstBuf,
+                           int width, int pitch, int height, int pixelSize,
+                           int flags)
 {
-	if(flags&TJ_YUV)
-		return tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags);
-	else
-		return tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, width, pitch,
-			height, getPixelFormat(pixelSize, flags), flags);
+  if (flags & TJ_YUV)
+    return tjDecompressToYUV(handle, jpegBuf, jpegSize, dstBuf, flags);
+  else
+    return tjDecompress2(handle, jpegBuf, jpegSize, dstBuf, width, pitch,
+                         height, getPixelFormat(pixelSize, flags), flags);
 }
 
 
 static int setDecodeDefaults(struct jpeg_decompress_struct *dinfo,
-	int pixelFormat, int subsamp, int flags)
+                             int pixelFormat, int subsamp, int flags)
 {
-	int i;
+  int i;
 
-	dinfo->scale_num=dinfo->scale_denom=1;
+  dinfo->scale_num = dinfo->scale_denom = 1;
 
-	if(subsamp==TJSAMP_GRAY)
-	{
-		dinfo->num_components=dinfo->comps_in_scan=1;
-		dinfo->jpeg_color_space=JCS_GRAYSCALE;
-	}
-	else
-	{
-		dinfo->num_components=dinfo->comps_in_scan=3;
-		dinfo->jpeg_color_space=JCS_YCbCr;
-	}
+  if (subsamp == TJSAMP_GRAY) {
+    dinfo->num_components = dinfo->comps_in_scan = 1;
+    dinfo->jpeg_color_space = JCS_GRAYSCALE;
+  } else {
+    dinfo->num_components = dinfo->comps_in_scan = 3;
+    dinfo->jpeg_color_space = JCS_YCbCr;
+  }
 
-	dinfo->comp_info=(jpeg_component_info *)
-		(*dinfo->mem->alloc_small)((j_common_ptr)dinfo, JPOOL_IMAGE,
-			dinfo->num_components*sizeof(jpeg_component_info));
+  dinfo->comp_info = (jpeg_component_info *)
+    (*dinfo->mem->alloc_small) ((j_common_ptr)dinfo, JPOOL_IMAGE,
+                                dinfo->num_components *
+                                sizeof(jpeg_component_info));
 
-	for(i=0; i<dinfo->num_components; i++)
-	{
-		jpeg_component_info *compptr=&dinfo->comp_info[i];
-		compptr->h_samp_factor=(i==0)? tjMCUWidth[subsamp]/8:1;
-		compptr->v_samp_factor=(i==0)? tjMCUHeight[subsamp]/8:1;
-		compptr->component_index=i;
-		compptr->component_id=i+1;
-		compptr->quant_tbl_no=compptr->dc_tbl_no=compptr->ac_tbl_no=
-			(i==0)? 0:1;
-		dinfo->cur_comp_info[i]=compptr;
-	}
-	dinfo->data_precision=8;
-	for(i=0; i<2; i++)
-	{
-		if(dinfo->quant_tbl_ptrs[i]==NULL)
-			dinfo->quant_tbl_ptrs[i]=jpeg_alloc_quant_table((j_common_ptr)dinfo);
-	}
+  for (i = 0; i < dinfo->num_components; i++) {
+    jpeg_component_info *compptr = &dinfo->comp_info[i];
 
-	return 0;
+    compptr->h_samp_factor = (i == 0) ? tjMCUWidth[subsamp] / 8 : 1;
+    compptr->v_samp_factor = (i == 0) ? tjMCUHeight[subsamp] / 8 : 1;
+    compptr->component_index = i;
+    compptr->component_id = i + 1;
+    compptr->quant_tbl_no = compptr->dc_tbl_no =
+      compptr->ac_tbl_no = (i == 0) ? 0 : 1;
+    dinfo->cur_comp_info[i] = compptr;
+  }
+  dinfo->data_precision = 8;
+  for (i = 0; i < 2; i++) {
+    if (dinfo->quant_tbl_ptrs[i] == NULL)
+      dinfo->quant_tbl_ptrs[i] = jpeg_alloc_quant_table((j_common_ptr)dinfo);
+  }
+
+  return 0;
 }
 
 
 int my_read_markers(j_decompress_ptr dinfo)
 {
-	return JPEG_REACHED_SOS;
+  return JPEG_REACHED_SOS;
 }
 
 void my_reset_marker_reader(j_decompress_ptr dinfo)
 {
 }
 
-DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
-	const unsigned char **srcPlanes, const int *strides, int subsamp,
-	unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
-	int flags)
+DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
+                                const unsigned char **srcPlanes,
+                                const int *strides, int subsamp,
+                                unsigned char *dstBuf, int width, int pitch,
+                                int height, int pixelFormat, int flags)
 {
-	int i, retval=0;  JSAMPROW *row_pointer=NULL;
-	JSAMPLE *_tmpbuf[MAX_COMPONENTS];
-	JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
-	int row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
-	JSAMPLE *ptr;
-	jpeg_component_info *compptr;
-	#ifndef JCS_EXTENSIONS
-	unsigned char *rgbBuf=NULL;
-	unsigned char *_dstBuf=NULL;  int _pitch=0;
-	#endif
-	int (*old_read_markers)(j_decompress_ptr);
-	void (*old_reset_marker_reader)(j_decompress_ptr);
+  JSAMPROW *row_pointer = NULL;
+  JSAMPLE *_tmpbuf[MAX_COMPONENTS];
+  JSAMPROW *tmpbuf[MAX_COMPONENTS], *inbuf[MAX_COMPONENTS];
+  int i, retval = 0, row, pw0, ph0, pw[MAX_COMPONENTS], ph[MAX_COMPONENTS];
+  JSAMPLE *ptr;
+  jpeg_component_info *compptr;
+  int (*old_read_markers) (j_decompress_ptr);
+  void (*old_reset_marker_reader) (j_decompress_ptr);
 
-	getdinstance(handle);
+  getdinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		tmpbuf[i]=NULL;  _tmpbuf[i]=NULL;  inbuf[i]=NULL;
-	}
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  _tmpbuf[i] = NULL;  inbuf[i] = NULL;
+  }
 
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
+  if ((this->init & DECOMPRESS) == 0)
+    _throw("tjDecodeYUVPlanes(): Instance has not been initialized for decompression");
 
-	if(!srcPlanes || !srcPlanes[0] || subsamp<0 || subsamp>=NUMSUBOPT
-		|| dstBuf==NULL || width<=0 || pitch<0 || height<=0 || pixelFormat<0
-		|| pixelFormat>=TJ_NUMPF)
-		_throw("tjDecodeYUVPlanes(): Invalid argument");
-	if(subsamp!=TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
-		_throw("tjDecodeYUVPlanes(): Invalid argument");
+  if (!srcPlanes || !srcPlanes[0] || subsamp < 0 || subsamp >= NUMSUBOPT ||
+      dstBuf == NULL || width <= 0 || pitch < 0 || height <= 0 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    _throw("tjDecodeYUVPlanes(): Invalid argument");
+  if (subsamp != TJSAMP_GRAY && (!srcPlanes[1] || !srcPlanes[2]))
+    _throw("tjDecodeYUVPlanes(): Invalid argument");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(pixelFormat==TJPF_CMYK)
-		_throw("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
+  if (pixelFormat == TJPF_CMYK)
+    _throw("tjDecodeYUVPlanes(): Cannot decode YUV images into CMYK pixels.");
 
-	if(pitch==0) pitch=width*tjPixelSize[pixelFormat];
-	dinfo->image_width=width;
-	dinfo->image_height=height;
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+  dinfo->image_width = width;
+  dinfo->image_height = height;
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(setDecodeDefaults(dinfo, pixelFormat, subsamp, flags)==-1)
-	{
-		retval=-1;  goto bailout;
-	}
-	old_read_markers=dinfo->marker->read_markers;
-	dinfo->marker->read_markers=my_read_markers;
-	old_reset_marker_reader=dinfo->marker->reset_marker_reader;
-	dinfo->marker->reset_marker_reader=my_reset_marker_reader;
-	jpeg_read_header(dinfo, TRUE);
-	dinfo->marker->read_markers=old_read_markers;
-	dinfo->marker->reset_marker_reader=old_reset_marker_reader;
+  if (setDecodeDefaults(dinfo, pixelFormat, subsamp, flags) == -1) {
+    retval = -1;  goto bailout;
+  }
+  old_read_markers = dinfo->marker->read_markers;
+  dinfo->marker->read_markers = my_read_markers;
+  old_reset_marker_reader = dinfo->marker->reset_marker_reader;
+  dinfo->marker->reset_marker_reader = my_reset_marker_reader;
+  jpeg_read_header(dinfo, TRUE);
+  dinfo->marker->read_markers = old_read_markers;
+  dinfo->marker->reset_marker_reader = old_reset_marker_reader;
 
-	if(setDecompDefaults(dinfo, pixelFormat, flags)==-1)
-	{
-		retval=-1;  goto bailout;
-	}
-	dinfo->do_fancy_upsampling=FALSE;
-	dinfo->Se=DCTSIZE2-1;
-	jinit_master_decompress(dinfo);
-	(*dinfo->upsample->start_pass)(dinfo);
+  this->dinfo.out_color_space = pf2cs[pixelFormat];
+  if (flags & TJFLAG_FASTDCT) this->dinfo.dct_method = JDCT_FASTEST;
+  dinfo->do_fancy_upsampling = FALSE;
+  dinfo->Se = DCTSIZE2 - 1;
+  jinit_master_decompress(dinfo);
+  (*dinfo->upsample->start_pass) (dinfo);
 
-	pw0=PAD(width, dinfo->max_h_samp_factor);
-	ph0=PAD(height, dinfo->max_v_samp_factor);
+  pw0 = PAD(width, dinfo->max_h_samp_factor);
+  ph0 = PAD(height, dinfo->max_v_samp_factor);
 
-	if(pitch==0) pitch=dinfo->output_width*tjPixelSize[pixelFormat];
+  if (pitch == 0) pitch = dinfo->output_width * tjPixelSize[pixelFormat];
 
-	#ifndef JCS_EXTENSIONS
-	if(pixelFormat!=TJPF_GRAY && pixelFormat!=TJPF_CMYK &&
-		(RGB_RED!=tjRedOffset[pixelFormat] ||
-			RGB_GREEN!=tjGreenOffset[pixelFormat] ||
-			RGB_BLUE!=tjBlueOffset[pixelFormat] ||
-			RGB_PIXELSIZE!=tjPixelSize[pixelFormat]))
-	{
-		rgbBuf=(unsigned char *)malloc(width*height*3);
-		if(!rgbBuf) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		_pitch=pitch;  pitch=width*3;
-		_dstBuf=dstBuf;  dstBuf=rgbBuf;
-	}
-	#endif
+  if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph0)) == NULL)
+    _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+  for (i = 0; i < height; i++) {
+    if (flags & TJFLAG_BOTTOMUP)
+      row_pointer[i] = &dstBuf[(height - i - 1) * pitch];
+    else
+      row_pointer[i] = &dstBuf[i * pitch];
+  }
+  if (height < ph0)
+    for (i = height; i < ph0; i++) row_pointer[i] = row_pointer[height - 1];
 
-	if((row_pointer=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph0))==NULL)
-		_throw("tjDecodeYUVPlanes(): Memory allocation failure");
-	for(i=0; i<height; i++)
-	{
-		if(flags&TJFLAG_BOTTOMUP) row_pointer[i]=&dstBuf[(height-i-1)*pitch];
-		else row_pointer[i]=&dstBuf[i*pitch];
-	}
-	if(height<ph0)
-		for(i=height; i<ph0; i++) row_pointer[i]=row_pointer[height-1];
+  for (i = 0; i < dinfo->num_components; i++) {
+    compptr = &dinfo->comp_info[i];
+    _tmpbuf[i] =
+      (JSAMPLE *)malloc(PAD(compptr->width_in_blocks * DCTSIZE, 32) *
+                        compptr->v_samp_factor + 32);
+    if (!_tmpbuf[i])
+      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+    tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * compptr->v_samp_factor);
+    if (!tmpbuf[i])
+      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+    for (row = 0; row < compptr->v_samp_factor; row++) {
+      unsigned char *_tmpbuf_aligned =
+        (unsigned char *)PAD((size_t)_tmpbuf[i], 32);
 
-	for(i=0; i<dinfo->num_components; i++)
-	{
-		compptr=&dinfo->comp_info[i];
-		_tmpbuf[i]=(JSAMPLE *)malloc(PAD(compptr->width_in_blocks*DCTSIZE, 16)
-			* compptr->v_samp_factor + 16);
-		if(!_tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*compptr->v_samp_factor);
-		if(!tmpbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		for(row=0; row<compptr->v_samp_factor; row++)
-		{
-			unsigned char *_tmpbuf_aligned=
-				(unsigned char *)PAD((size_t)_tmpbuf[i], 16);
-			tmpbuf[i][row]=&_tmpbuf_aligned[
-				PAD(compptr->width_in_blocks*DCTSIZE, 16) * row];
-		}
-		pw[i]=pw0*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ph[i]=ph0*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-		inbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]);
-		if(!inbuf[i]) _throw("tjDecodeYUVPlanes(): Memory allocation failure");
-		ptr=(JSAMPLE *)srcPlanes[i];
-		for(row=0; row<ph[i]; row++)
-		{
-			inbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
+      tmpbuf[i][row] =
+        &_tmpbuf_aligned[PAD(compptr->width_in_blocks * DCTSIZE, 32) * row];
+    }
+    pw[i] = pw0 * compptr->h_samp_factor / dinfo->max_h_samp_factor;
+    ph[i] = ph0 * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+    inbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i]);
+    if (!inbuf[i])
+      _throw("tjDecodeYUVPlanes(): Memory allocation failure");
+    ptr = (JSAMPLE *)srcPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      inbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	for(row=0; row<ph0; row+=dinfo->max_v_samp_factor)
-	{
-		JDIMENSION inrow=0, outrow=0;
-		for(i=0, compptr=dinfo->comp_info; i<dinfo->num_components; i++, compptr++)
-			jcopy_sample_rows(inbuf[i],
-				row*compptr->v_samp_factor/dinfo->max_v_samp_factor, tmpbuf[i], 0,
-				compptr->v_samp_factor, pw[i]);
-		(dinfo->upsample->upsample)(dinfo, tmpbuf, &inrow,
-			dinfo->max_v_samp_factor, &row_pointer[row], &outrow,
-			dinfo->max_v_samp_factor);
-	}
-	jpeg_abort_decompress(dinfo);
+  for (row = 0; row < ph0; row += dinfo->max_v_samp_factor) {
+    JDIMENSION inrow = 0, outrow = 0;
 
-	#ifndef JCS_EXTENSIONS
-	fromRGB(rgbBuf, _dstBuf, width, _pitch, height, pixelFormat);
-	#endif
+    for (i = 0, compptr = dinfo->comp_info; i < dinfo->num_components;
+         i++, compptr++)
+      jcopy_sample_rows(inbuf[i],
+        row * compptr->v_samp_factor / dinfo->max_v_samp_factor, tmpbuf[i], 0,
+        compptr->v_samp_factor, pw[i]);
+    (dinfo->upsample->upsample) (dinfo, tmpbuf, &inrow,
+                                 dinfo->max_v_samp_factor, &row_pointer[row],
+                                 &outrow, dinfo->max_v_samp_factor);
+  }
+  jpeg_abort_decompress(dinfo);
 
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	#ifndef JCS_EXTENSIONS
-	if(rgbBuf) free(rgbBuf);
-	#endif
-	if(row_pointer) free(row_pointer);
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		if(tmpbuf[i]!=NULL) free(tmpbuf[i]);
-		if(_tmpbuf[i]!=NULL) free(_tmpbuf[i]);
-		if(inbuf[i]!=NULL) free(inbuf[i]);
-	}
-	if(this->jerr.warning) retval=-1;
-	return retval;
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (row_pointer) free(row_pointer);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    if (tmpbuf[i] != NULL) free(tmpbuf[i]);
+    if (_tmpbuf[i] != NULL) free(_tmpbuf[i]);
+    if (inbuf[i] != NULL) free(inbuf[i]);
+  }
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
-	int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
-	int height, int pixelFormat, int flags)
+DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+                          int pad, int subsamp, unsigned char *dstBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags)
 {
-	const unsigned char *srcPlanes[3];
-	int pw0, ph0, strides[3], retval=-1;
+  const unsigned char *srcPlanes[3];
+  int pw0, ph0, strides[3], retval = -1;
+  tjinstance *this = (tjinstance *)handle;
 
-	if(srcBuf==NULL || pad<0 || !isPow2(pad) || subsamp<0 || subsamp>=NUMSUBOPT
-		|| width<=0 || height<=0)
-		_throw("tjDecodeYUV(): Invalid argument");
+  if (!this) _throwg("tjDecodeYUV(): Invalid handle");
+  this->isInstanceError = FALSE;
 
-	pw0=tjPlaneWidth(0, width, subsamp);
-	ph0=tjPlaneHeight(0, height, subsamp);
-	srcPlanes[0]=srcBuf;
-	strides[0]=PAD(pw0, pad);
-	if(subsamp==TJSAMP_GRAY)
-	{
-		strides[1]=strides[2]=0;
-		srcPlanes[1]=srcPlanes[2]=NULL;
-	}
-	else
-	{
-		int pw1=tjPlaneWidth(1, width, subsamp);
-		int ph1=tjPlaneHeight(1, height, subsamp);
-		strides[1]=strides[2]=PAD(pw1, pad);
-		srcPlanes[1]=srcPlanes[0]+strides[0]*ph0;
-		srcPlanes[2]=srcPlanes[1]+strides[1]*ph1;
-	}
+  if (srcBuf == NULL || pad < 0 || !isPow2(pad) || subsamp < 0 ||
+      subsamp >= NUMSUBOPT || width <= 0 || height <= 0)
+    _throw("tjDecodeYUV(): Invalid argument");
 
-	return tjDecodeYUVPlanes(handle, srcPlanes, strides, subsamp, dstBuf, width,
-		pitch, height, pixelFormat, flags);
+  pw0 = tjPlaneWidth(0, width, subsamp);
+  ph0 = tjPlaneHeight(0, height, subsamp);
+  srcPlanes[0] = srcBuf;
+  strides[0] = PAD(pw0, pad);
+  if (subsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    srcPlanes[1] = srcPlanes[2] = NULL;
+  } else {
+    int pw1 = tjPlaneWidth(1, width, subsamp);
+    int ph1 = tjPlaneHeight(1, height, subsamp);
 
-	bailout:
-	return retval;
+    strides[1] = strides[2] = PAD(pw1, pad);
+    srcPlanes[1] = srcPlanes[0] + strides[0] * ph0;
+    srcPlanes[2] = srcPlanes[1] + strides[1] * ph1;
+  }
+
+  return tjDecodeYUVPlanes(handle, srcPlanes, strides, subsamp, dstBuf, width,
+                           pitch, height, pixelFormat, flags);
+
+bailout:
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
-	const unsigned char *jpegBuf, unsigned long jpegSize,
-	unsigned char **dstPlanes, int width, int *strides, int height, int flags)
+DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
+                                      const unsigned char *jpegBuf,
+                                      unsigned long jpegSize,
+                                      unsigned char **dstPlanes, int width,
+                                      int *strides, int height, int flags)
 {
-	int i, sfi, row, retval=0;  JSAMPROW *outbuf[MAX_COMPONENTS];
-	int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
-	int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
-		tmpbufsize=0, usetmpbuf=0, th[MAX_COMPONENTS];
-	JSAMPLE *_tmpbuf=NULL, *ptr;  JSAMPROW *tmpbuf[MAX_COMPONENTS];
-	int dctsize;
+  int i, sfi, row, retval = 0;
+  int jpegwidth, jpegheight, jpegSubsamp, scaledw, scaledh;
+  int pw[MAX_COMPONENTS], ph[MAX_COMPONENTS], iw[MAX_COMPONENTS],
+    tmpbufsize = 0, usetmpbuf = 0, th[MAX_COMPONENTS];
+  JSAMPLE *_tmpbuf = NULL, *ptr;
+  JSAMPROW *outbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
+  int dctsize;
 
-	getdinstance(handle);
+  getdinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		tmpbuf[i]=NULL;  outbuf[i]=NULL;
-	}
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    tmpbuf[i] = NULL;  outbuf[i] = NULL;
+  }
 
-	if((this->init&DECOMPRESS)==0)
-		_throw("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
+  if ((this->init & DECOMPRESS) == 0)
+    _throw("tjDecompressToYUVPlanes(): Instance has not been initialized for decompression");
 
-	if(jpegBuf==NULL || jpegSize<=0 || !dstPlanes || !dstPlanes[0] || width<0
-		|| height<0)
-		_throw("tjDecompressToYUVPlanes(): Invalid argument");
+  if (jpegBuf == NULL || jpegSize <= 0 || !dstPlanes || !dstPlanes[0] ||
+      width < 0 || height < 0)
+    _throw("tjDecompressToYUVPlanes(): Invalid argument");
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	if(!this->headerRead)
-	{
-		jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-		jpeg_read_header(dinfo, TRUE);
-	}
-	this->headerRead=0;
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
+  if (!this->headerRead) {
+    jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+    jpeg_read_header(dinfo, TRUE);
+  }
+  this->headerRead = 0;
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    _throw("tjDecompressToYUVPlanes(): Could not determine subsampling type for JPEG image");
 
-	if(jpegSubsamp!=TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
-		_throw("tjDecompressToYUVPlanes(): Invalid argument");
+  if (jpegSubsamp != TJSAMP_GRAY && (!dstPlanes[1] || !dstPlanes[2]))
+    _throw("tjDecompressToYUVPlanes(): Invalid argument");
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
-	for(i=0; i<NUMSF; i++)
-	{
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
-	if(dinfo->num_components>3)
-		_throw("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    _throw("tjDecompressToYUVPlanes(): Could not scale down to desired image dimensions");
+  if (dinfo->num_components > 3)
+    _throw("tjDecompressToYUVPlanes(): JPEG image must have 3 or fewer components");
 
-	width=scaledw;  height=scaledh;
-	dinfo->scale_num=sf[i].num;
-	dinfo->scale_denom=sf[i].denom;
-	sfi=i;
-	jpeg_calc_output_dimensions(dinfo);
+  width = scaledw;  height = scaledh;
+  dinfo->scale_num = sf[i].num;
+  dinfo->scale_denom = sf[i].denom;
+  sfi = i;
+  jpeg_calc_output_dimensions(dinfo);
 
-	dctsize=DCTSIZE*sf[sfi].num/sf[sfi].denom;
+  dctsize = DCTSIZE * sf[sfi].num / sf[sfi].denom;
 
-	for(i=0; i<dinfo->num_components; i++)
-	{
-		jpeg_component_info *compptr=&dinfo->comp_info[i];
-		int ih;
-		iw[i]=compptr->width_in_blocks*dctsize;
-		ih=compptr->height_in_blocks*dctsize;
-		pw[i]=PAD(dinfo->output_width, dinfo->max_h_samp_factor)
-			*compptr->h_samp_factor/dinfo->max_h_samp_factor;
-		ph[i]=PAD(dinfo->output_height, dinfo->max_v_samp_factor)
-			*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-		if(iw[i]!=pw[i] || ih!=ph[i]) usetmpbuf=1;
-		th[i]=compptr->v_samp_factor*dctsize;
-		tmpbufsize+=iw[i]*th[i];
-		if((outbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*ph[i]))==NULL)
-			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
-		ptr=dstPlanes[i];
-		for(row=0; row<ph[i]; row++)
-		{
-			outbuf[i][row]=ptr;
-			ptr+=(strides && strides[i]!=0)? strides[i]:pw[i];
-		}
-	}
-	if(usetmpbuf)
-	{
-		if((_tmpbuf=(JSAMPLE *)malloc(sizeof(JSAMPLE)*tmpbufsize))==NULL)
-			_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
-		ptr=_tmpbuf;
-		for(i=0; i<dinfo->num_components; i++)
-		{
-			if((tmpbuf[i]=(JSAMPROW *)malloc(sizeof(JSAMPROW)*th[i]))==NULL)
-				_throw("tjDecompressToYUVPlanes(): Memory allocation failure");
-			for(row=0; row<th[i]; row++)
-			{
-				tmpbuf[i][row]=ptr;
-				ptr+=iw[i];
-			}
-		}
-	}
+  for (i = 0; i < dinfo->num_components; i++) {
+    jpeg_component_info *compptr = &dinfo->comp_info[i];
+    int ih;
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+    iw[i] = compptr->width_in_blocks * dctsize;
+    ih = compptr->height_in_blocks * dctsize;
+    pw[i] = PAD(dinfo->output_width, dinfo->max_h_samp_factor) *
+            compptr->h_samp_factor / dinfo->max_h_samp_factor;
+    ph[i] = PAD(dinfo->output_height, dinfo->max_v_samp_factor) *
+            compptr->v_samp_factor / dinfo->max_v_samp_factor;
+    if (iw[i] != pw[i] || ih != ph[i]) usetmpbuf = 1;
+    th[i] = compptr->v_samp_factor * dctsize;
+    tmpbufsize += iw[i] * th[i];
+    if ((outbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * ph[i])) == NULL)
+      _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+    ptr = dstPlanes[i];
+    for (row = 0; row < ph[i]; row++) {
+      outbuf[i][row] = ptr;
+      ptr += (strides && strides[i] != 0) ? strides[i] : pw[i];
+    }
+  }
+  if (usetmpbuf) {
+    if ((_tmpbuf = (JSAMPLE *)malloc(sizeof(JSAMPLE) * tmpbufsize)) == NULL)
+      _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+    ptr = _tmpbuf;
+    for (i = 0; i < dinfo->num_components; i++) {
+      if ((tmpbuf[i] = (JSAMPROW *)malloc(sizeof(JSAMPROW) * th[i])) == NULL)
+        _throw("tjDecompressToYUVPlanes(): Memory allocation failure");
+      for (row = 0; row < th[i]; row++) {
+        tmpbuf[i][row] = ptr;
+        ptr += iw[i];
+      }
+    }
+  }
 
-	if(flags&TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling=FALSE;
-	if(flags&TJFLAG_FASTDCT) dinfo->dct_method=JDCT_FASTEST;
-	dinfo->raw_data_out=TRUE;
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	jpeg_start_decompress(dinfo);
-	for(row=0; row<(int)dinfo->output_height;
-		row+=dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size)
-	{
-		JSAMPARRAY yuvptr[MAX_COMPONENTS];
-		int crow[MAX_COMPONENTS];
-		for(i=0; i<dinfo->num_components; i++)
-		{
-			jpeg_component_info *compptr=&dinfo->comp_info[i];
-			if(jpegSubsamp==TJ_420)
-			{
-				/* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
-				   to be clever and use the IDCT to perform upsampling on the U and V
-				   planes.  For instance, if the output image is to be scaled by 1/2
-				   relative to the JPEG image, then the scaling factor and upsampling
-				   effectively cancel each other, so a normal 8x8 IDCT can be used.
-				   However, this is not desirable when using the decompress-to-YUV
-				   functionality in TurboJPEG, since we want to output the U and V
-				   planes in their subsampled form.  Thus, we have to override some
-				   internal libjpeg parameters to force it to use the "scaled" IDCT
-				   functions on the U and V planes. */
-				compptr->_DCT_scaled_size=dctsize;
-				compptr->MCU_sample_width=tjMCUWidth[jpegSubsamp]*
-					sf[sfi].num/sf[sfi].denom*
-					compptr->v_samp_factor/dinfo->max_v_samp_factor;
-				dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
-			}
-			crow[i]=row*compptr->v_samp_factor/dinfo->max_v_samp_factor;
-			if(usetmpbuf) yuvptr[i]=tmpbuf[i];
-			else yuvptr[i]=&outbuf[i][crow[i]];
-		}
-		jpeg_read_raw_data(dinfo, yuvptr,
-			dinfo->max_v_samp_factor*dinfo->_min_DCT_scaled_size);
-		if(usetmpbuf)
-		{
-			int j;
-			for(i=0; i<dinfo->num_components; i++)
-			{
-				for(j=0; j<min(th[i], ph[i]-crow[i]); j++)
-				{
-					memcpy(outbuf[i][crow[i]+j], tmpbuf[i][j], pw[i]);
-				}
-			}
-		}
-	}
-	jpeg_finish_decompress(dinfo);
+  if (flags & TJFLAG_FASTUPSAMPLE) dinfo->do_fancy_upsampling = FALSE;
+  if (flags & TJFLAG_FASTDCT) dinfo->dct_method = JDCT_FASTEST;
+  dinfo->raw_data_out = TRUE;
 
-	bailout:
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	for(i=0; i<MAX_COMPONENTS; i++)
-	{
-		if(tmpbuf[i]) free(tmpbuf[i]);
-		if(outbuf[i]) free(outbuf[i]);
-	}
-	if(_tmpbuf) free(_tmpbuf);
-	if(this->jerr.warning) retval=-1;
-	return retval;
+  jpeg_start_decompress(dinfo);
+  for (row = 0; row < (int)dinfo->output_height;
+       row += dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size) {
+    JSAMPARRAY yuvptr[MAX_COMPONENTS];
+    int crow[MAX_COMPONENTS];
+
+    for (i = 0; i < dinfo->num_components; i++) {
+      jpeg_component_info *compptr = &dinfo->comp_info[i];
+
+      if (jpegSubsamp == TJ_420) {
+        /* When 4:2:0 subsampling is used with IDCT scaling, libjpeg will try
+           to be clever and use the IDCT to perform upsampling on the U and V
+           planes.  For instance, if the output image is to be scaled by 1/2
+           relative to the JPEG image, then the scaling factor and upsampling
+           effectively cancel each other, so a normal 8x8 IDCT can be used.
+           However, this is not desirable when using the decompress-to-YUV
+           functionality in TurboJPEG, since we want to output the U and V
+           planes in their subsampled form.  Thus, we have to override some
+           internal libjpeg parameters to force it to use the "scaled" IDCT
+           functions on the U and V planes. */
+        compptr->_DCT_scaled_size = dctsize;
+        compptr->MCU_sample_width = tjMCUWidth[jpegSubsamp] *
+          sf[sfi].num / sf[sfi].denom *
+          compptr->v_samp_factor / dinfo->max_v_samp_factor;
+        dinfo->idct->inverse_DCT[i] = dinfo->idct->inverse_DCT[0];
+      }
+      crow[i] = row * compptr->v_samp_factor / dinfo->max_v_samp_factor;
+      if (usetmpbuf) yuvptr[i] = tmpbuf[i];
+      else yuvptr[i] = &outbuf[i][crow[i]];
+    }
+    jpeg_read_raw_data(dinfo, yuvptr,
+                       dinfo->max_v_samp_factor * dinfo->_min_DCT_scaled_size);
+    if (usetmpbuf) {
+      int j;
+
+      for (i = 0; i < dinfo->num_components; i++) {
+        for (j = 0; j < MIN(th[i], ph[i] - crow[i]); j++) {
+          memcpy(outbuf[i][crow[i] + j], tmpbuf[i][j], pw[i]);
+        }
+      }
+    }
+  }
+  jpeg_finish_decompress(dinfo);
+
+bailout:
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  for (i = 0; i < MAX_COMPONENTS; i++) {
+    if (tmpbuf[i]) free(tmpbuf[i]);
+    if (outbuf[i]) free(outbuf[i]);
+  }
+  if (_tmpbuf) free(_tmpbuf);
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
-	const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-	int width, int pad, int height, int flags)
+DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
+                                 unsigned long jpegSize, unsigned char *dstBuf,
+                                 int width, int pad, int height, int flags)
 {
-	unsigned char *dstPlanes[3];
-	int pw0, ph0, strides[3], retval=-1, jpegSubsamp=-1;
-	int i, jpegwidth, jpegheight, scaledw, scaledh;
+  unsigned char *dstPlanes[3];
+  int pw0, ph0, strides[3], retval = -1, jpegSubsamp = -1;
+  int i, jpegwidth, jpegheight, scaledw, scaledh;
 
-	getdinstance(handle);
+  getdinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
 
-	if(jpegBuf==NULL || jpegSize<=0 || dstBuf==NULL || width<0 || pad<1
-		|| !isPow2(pad) || height<0)
-		_throw("tjDecompressToYUV2(): Invalid argument");
+  if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
+      pad < 1 || !isPow2(pad) || height < 0)
+    _throw("tjDecompressToYUV2(): Invalid argument");
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		return -1;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    return -1;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
-	jpeg_read_header(dinfo, TRUE);
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_read_header(dinfo, TRUE);
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    _throw("tjDecompressToYUV2(): Could not determine subsampling type for JPEG image");
 
-	jpegwidth=dinfo->image_width;  jpegheight=dinfo->image_height;
-	if(width==0) width=jpegwidth;
-	if(height==0) height=jpegheight;
+  jpegwidth = dinfo->image_width;  jpegheight = dinfo->image_height;
+  if (width == 0) width = jpegwidth;
+  if (height == 0) height = jpegheight;
 
-	for(i=0; i<NUMSF; i++)
-	{
-		scaledw=TJSCALED(jpegwidth, sf[i]);
-		scaledh=TJSCALED(jpegheight, sf[i]);
-		if(scaledw<=width && scaledh<=height)
-			break;
-	}
-	if(i>=NUMSF)
-		_throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
+  for (i = 0; i < NUMSF; i++) {
+    scaledw = TJSCALED(jpegwidth, sf[i]);
+    scaledh = TJSCALED(jpegheight, sf[i]);
+    if (scaledw <= width && scaledh <= height)
+      break;
+  }
+  if (i >= NUMSF)
+    _throw("tjDecompressToYUV2(): Could not scale down to desired image dimensions");
 
-	pw0=tjPlaneWidth(0, width, jpegSubsamp);
-	ph0=tjPlaneHeight(0, height, jpegSubsamp);
-	dstPlanes[0]=dstBuf;
-	strides[0]=PAD(pw0, pad);
-	if(jpegSubsamp==TJSAMP_GRAY)
-	{
-		strides[1]=strides[2]=0;
-		dstPlanes[1]=dstPlanes[2]=NULL;
-	}
-	else
-	{
-		int pw1=tjPlaneWidth(1, width, jpegSubsamp);
-		int ph1=tjPlaneHeight(1, height, jpegSubsamp);
-		strides[1]=strides[2]=PAD(pw1, pad);
-		dstPlanes[1]=dstPlanes[0]+strides[0]*ph0;
-		dstPlanes[2]=dstPlanes[1]+strides[1]*ph1;
-	}
+  pw0 = tjPlaneWidth(0, width, jpegSubsamp);
+  ph0 = tjPlaneHeight(0, height, jpegSubsamp);
+  dstPlanes[0] = dstBuf;
+  strides[0] = PAD(pw0, pad);
+  if (jpegSubsamp == TJSAMP_GRAY) {
+    strides[1] = strides[2] = 0;
+    dstPlanes[1] = dstPlanes[2] = NULL;
+  } else {
+    int pw1 = tjPlaneWidth(1, width, jpegSubsamp);
+    int ph1 = tjPlaneHeight(1, height, jpegSubsamp);
 
-	this->headerRead=1;
-	return tjDecompressToYUVPlanes(handle, jpegBuf, jpegSize, dstPlanes, width,
-		strides, height, flags);
+    strides[1] = strides[2] = PAD(pw1, pad);
+    dstPlanes[1] = dstPlanes[0] + strides[0] * ph0;
+    dstPlanes[2] = dstPlanes[1] + strides[1] * ph1;
+  }
 
-	bailout:
-	return retval;
+  this->headerRead = 1;
+  return tjDecompressToYUVPlanes(handle, jpegBuf, jpegSize, dstPlanes, width,
+                                 strides, height, flags);
 
+bailout:
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
 }
 
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
-	unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-	int flags)
+DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
+                                unsigned long jpegSize, unsigned char *dstBuf,
+                                int flags)
 {
-	return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
+  return tjDecompressToYUV2(handle, jpegBuf, jpegSize, dstBuf, 0, 4, 0, flags);
 }
 
 
 /* Transformer */
 
-DLLEXPORT tjhandle DLLCALL tjInitTransform(void)
+DLLEXPORT tjhandle tjInitTransform(void)
 {
-	tjinstance *this=NULL;  tjhandle handle=NULL;
-	if((this=(tjinstance *)malloc(sizeof(tjinstance)))==NULL)
-	{
-		snprintf(errStr, JMSG_LENGTH_MAX,
-			"tjInitTransform(): Memory allocation failure");
-		return NULL;
-	}
-	MEMZERO(this, sizeof(tjinstance));
-	handle=_tjInitCompress(this);
-	if(!handle) return NULL;
-	handle=_tjInitDecompress(this);
-	return handle;
+  tjinstance *this = NULL;
+  tjhandle handle = NULL;
+
+  if ((this = (tjinstance *)malloc(sizeof(tjinstance))) == NULL) {
+    snprintf(errStr, JMSG_LENGTH_MAX,
+             "tjInitTransform(): Memory allocation failure");
+    return NULL;
+  }
+  MEMZERO(this, sizeof(tjinstance));
+  snprintf(this->errStr, JMSG_LENGTH_MAX, "No error");
+  handle = _tjInitCompress(this);
+  if (!handle) return NULL;
+  handle = _tjInitDecompress(this);
+  return handle;
 }
 
 
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
-	const unsigned char *jpegBuf, unsigned long jpegSize, int n,
-	unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *t, int flags)
+DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
+                          unsigned long jpegSize, int n,
+                          unsigned char **dstBufs, unsigned long *dstSizes,
+                          tjtransform *t, int flags)
 {
-	jpeg_transform_info *xinfo=NULL;
-	jvirt_barray_ptr *srccoefs, *dstcoefs;
-	int retval=0, i, jpegSubsamp;
+  jpeg_transform_info *xinfo = NULL;
+  jvirt_barray_ptr *srccoefs, *dstcoefs;
+  int retval = 0, i, jpegSubsamp, saveMarkers = 0;
 
-	getinstance(handle);
-	if((this->init&COMPRESS)==0 || (this->init&DECOMPRESS)==0)
-		_throw("tjTransform(): Instance has not been initialized for transformation");
+  getinstance(handle);
+  this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
+  if ((this->init & COMPRESS) == 0 || (this->init & DECOMPRESS) == 0)
+    _throw("tjTransform(): Instance has not been initialized for transformation");
 
-	if(jpegBuf==NULL || jpegSize<=0 || n<1 || dstBufs==NULL || dstSizes==NULL
-		|| t==NULL || flags<0)
-		_throw("tjTransform(): Invalid argument");
+  if (jpegBuf == NULL || jpegSize <= 0 || n < 1 || dstBufs == NULL ||
+      dstSizes == NULL || t == NULL || flags < 0)
+    _throw("tjTransform(): Invalid argument");
 
-	if(flags&TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
-	else if(flags&TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
-	else if(flags&TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
+  if (flags & TJFLAG_FORCEMMX) putenv("JSIMD_FORCEMMX=1");
+  else if (flags & TJFLAG_FORCESSE) putenv("JSIMD_FORCESSE=1");
+  else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 
-	if((xinfo=(jpeg_transform_info *)malloc(sizeof(jpeg_transform_info)*n))
-		==NULL)
-		_throw("tjTransform(): Memory allocation failure");
-	MEMZERO(xinfo, sizeof(jpeg_transform_info)*n);
+  if ((xinfo =
+       (jpeg_transform_info *)malloc(sizeof(jpeg_transform_info) * n)) == NULL)
+    _throw("tjTransform(): Memory allocation failure");
+  MEMZERO(xinfo, sizeof(jpeg_transform_info) * n);
 
-	if(setjmp(this->jerr.setjmp_buffer))
-	{
-		/* If we get here, the JPEG code has signaled an error. */
-		retval=-1;  goto bailout;
-	}
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
 
-	jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
+  jpeg_mem_src_tj(dinfo, jpegBuf, jpegSize);
 
-	for(i=0; i<n; i++)
-	{
-		xinfo[i].transform=xformtypes[t[i].op];
-		xinfo[i].perfect=(t[i].options&TJXOPT_PERFECT)? 1:0;
-		xinfo[i].trim=(t[i].options&TJXOPT_TRIM)? 1:0;
-		xinfo[i].force_grayscale=(t[i].options&TJXOPT_GRAY)? 1:0;
-		xinfo[i].crop=(t[i].options&TJXOPT_CROP)? 1:0;
-		if(n!=1 && t[i].op==TJXOP_HFLIP) xinfo[i].slow_hflip=1;
-		else xinfo[i].slow_hflip=0;
+  for (i = 0; i < n; i++) {
+    xinfo[i].transform = xformtypes[t[i].op];
+    xinfo[i].perfect = (t[i].options & TJXOPT_PERFECT) ? 1 : 0;
+    xinfo[i].trim = (t[i].options & TJXOPT_TRIM) ? 1 : 0;
+    xinfo[i].force_grayscale = (t[i].options & TJXOPT_GRAY) ? 1 : 0;
+    xinfo[i].crop = (t[i].options & TJXOPT_CROP) ? 1 : 0;
+    if (n != 1 && t[i].op == TJXOP_HFLIP) xinfo[i].slow_hflip = 1;
+    else xinfo[i].slow_hflip = 0;
 
-		if(xinfo[i].crop)
-		{
-			xinfo[i].crop_xoffset=t[i].r.x;  xinfo[i].crop_xoffset_set=JCROP_POS;
-			xinfo[i].crop_yoffset=t[i].r.y;  xinfo[i].crop_yoffset_set=JCROP_POS;
-			if(t[i].r.w!=0)
-			{
-				xinfo[i].crop_width=t[i].r.w;  xinfo[i].crop_width_set=JCROP_POS;
-			}
-			else xinfo[i].crop_width=JCROP_UNSET;
-			if(t[i].r.h!=0)
-			{
-				xinfo[i].crop_height=t[i].r.h;  xinfo[i].crop_height_set=JCROP_POS;
-			}
-			else xinfo[i].crop_height=JCROP_UNSET;
-		}
-	}
+    if (xinfo[i].crop) {
+      xinfo[i].crop_xoffset = t[i].r.x;  xinfo[i].crop_xoffset_set = JCROP_POS;
+      xinfo[i].crop_yoffset = t[i].r.y;  xinfo[i].crop_yoffset_set = JCROP_POS;
+      if (t[i].r.w != 0) {
+        xinfo[i].crop_width = t[i].r.w;  xinfo[i].crop_width_set = JCROP_POS;
+      } else
+        xinfo[i].crop_width = JCROP_UNSET;
+      if (t[i].r.h != 0) {
+        xinfo[i].crop_height = t[i].r.h;  xinfo[i].crop_height_set = JCROP_POS;
+      } else
+        xinfo[i].crop_height = JCROP_UNSET;
+    }
+    if (!(t[i].options & TJXOPT_COPYNONE)) saveMarkers = 1;
+  }
 
-	jcopy_markers_setup(dinfo, JCOPYOPT_ALL);
-	jpeg_read_header(dinfo, TRUE);
-	jpegSubsamp=getSubsamp(dinfo);
-	if(jpegSubsamp<0)
-		_throw("tjTransform(): Could not determine subsampling type for JPEG image");
+  jcopy_markers_setup(dinfo, saveMarkers ? JCOPYOPT_ALL : JCOPYOPT_NONE);
+  jpeg_read_header(dinfo, TRUE);
+  jpegSubsamp = getSubsamp(dinfo);
+  if (jpegSubsamp < 0)
+    _throw("tjTransform(): Could not determine subsampling type for JPEG image");
 
-	for(i=0; i<n; i++)
-	{
-		if(!jtransform_request_workspace(dinfo, &xinfo[i]))
-			_throw("tjTransform(): Transform is not perfect");
+  for (i = 0; i < n; i++) {
+    if (!jtransform_request_workspace(dinfo, &xinfo[i]))
+      _throw("tjTransform(): Transform is not perfect");
 
-		if(xinfo[i].crop)
-		{
-			if((t[i].r.x%xinfo[i].iMCU_sample_width)!=0
-				|| (t[i].r.y%xinfo[i].iMCU_sample_height)!=0)
-			{
-				snprintf(errStr, JMSG_LENGTH_MAX,
-					"To crop this JPEG image, x must be a multiple of %d\n"
-					"and y must be a multiple of %d.\n",
-					xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
-				retval=-1;  goto bailout;
-			}
-		}
-	}
+    if (xinfo[i].crop) {
+      if ((t[i].r.x % xinfo[i].iMCU_sample_width) != 0 ||
+          (t[i].r.y % xinfo[i].iMCU_sample_height) != 0) {
+        snprintf(errStr, JMSG_LENGTH_MAX,
+                 "To crop this JPEG image, x must be a multiple of %d\n"
+                 "and y must be a multiple of %d.\n",
+                 xinfo[i].iMCU_sample_width, xinfo[i].iMCU_sample_height);
+        retval = -1;  goto bailout;
+      }
+    }
+  }
 
-	srccoefs=jpeg_read_coefficients(dinfo);
+  srccoefs = jpeg_read_coefficients(dinfo);
 
-	for(i=0; i<n; i++)
-	{
-		int w, h, alloc=1;
-		if(!xinfo[i].crop)
-		{
-			w=dinfo->image_width;  h=dinfo->image_height;
-		}
-		else
-		{
-			w=xinfo[i].crop_width;  h=xinfo[i].crop_height;
-		}
-		if(flags&TJFLAG_NOREALLOC)
-		{
-			alloc=0;  dstSizes[i]=tjBufSize(w, h, jpegSubsamp);
-		}
-		if(!(t[i].options&TJXOPT_NOOUTPUT))
-			jpeg_mem_dest_tj(cinfo, &dstBufs[i], &dstSizes[i], alloc);
-		jpeg_copy_critical_parameters(dinfo, cinfo);
-		dstcoefs=jtransform_adjust_parameters(dinfo, cinfo, srccoefs,
-			&xinfo[i]);
-		if(!(t[i].options&TJXOPT_NOOUTPUT))
-		{
-			jpeg_write_coefficients(cinfo, dstcoefs);
-			jcopy_markers_execute(dinfo, cinfo, JCOPYOPT_ALL);
-		}
-		else jinit_c_master_control(cinfo, TRUE);
-		jtransform_execute_transformation(dinfo, cinfo, srccoefs,
-			&xinfo[i]);
-		if(t[i].customFilter)
-		{
-			int ci, y;  JDIMENSION by;
-			for(ci=0; ci<cinfo->num_components; ci++)
-			{
-				jpeg_component_info *compptr=&cinfo->comp_info[ci];
-				tjregion arrayRegion={0, 0, compptr->width_in_blocks*DCTSIZE,
-					DCTSIZE};
-				tjregion planeRegion={0, 0, compptr->width_in_blocks*DCTSIZE,
-					compptr->height_in_blocks*DCTSIZE};
-				for(by=0; by<compptr->height_in_blocks; by+=compptr->v_samp_factor)
-				{
-					JBLOCKARRAY barray=(dinfo->mem->access_virt_barray)
-						((j_common_ptr)dinfo, dstcoefs[ci], by, compptr->v_samp_factor,
-						TRUE);
-					for(y=0; y<compptr->v_samp_factor; y++)
-					{
-						if(t[i].customFilter(barray[y][0], arrayRegion, planeRegion,
-							ci, i, &t[i])==-1)
-							_throw("tjTransform(): Error in custom filter");
-						arrayRegion.y+=DCTSIZE;
-					}
-				}
-			}
-		}
-		if(!(t[i].options&TJXOPT_NOOUTPUT)) jpeg_finish_compress(cinfo);
-	}
+  for (i = 0; i < n; i++) {
+    int w, h, alloc = 1;
 
-	jpeg_finish_decompress(dinfo);
+    if (!xinfo[i].crop) {
+      w = dinfo->image_width;  h = dinfo->image_height;
+    } else {
+      w = xinfo[i].crop_width;  h = xinfo[i].crop_height;
+    }
+    if (flags & TJFLAG_NOREALLOC) {
+      alloc = 0;  dstSizes[i] = tjBufSize(w, h, jpegSubsamp);
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT))
+      jpeg_mem_dest_tj(cinfo, &dstBufs[i], &dstSizes[i], alloc);
+    jpeg_copy_critical_parameters(dinfo, cinfo);
+    dstcoefs = jtransform_adjust_parameters(dinfo, cinfo, srccoefs, &xinfo[i]);
+    if (flags & TJFLAG_PROGRESSIVE || t[i].options & TJXOPT_PROGRESSIVE)
+      jpeg_simple_progression(cinfo);
+    if (!(t[i].options & TJXOPT_NOOUTPUT)) {
+      jpeg_write_coefficients(cinfo, dstcoefs);
+      jcopy_markers_execute(dinfo, cinfo, t[i].options & TJXOPT_COPYNONE ?
+                                          JCOPYOPT_NONE : JCOPYOPT_ALL);
+    } else
+      jinit_c_master_control(cinfo, TRUE);
+    jtransform_execute_transformation(dinfo, cinfo, srccoefs, &xinfo[i]);
+    if (t[i].customFilter) {
+      int ci, y;
+      JDIMENSION by;
 
-	bailout:
-	if(cinfo->global_state>CSTATE_START) jpeg_abort_compress(cinfo);
-	if(dinfo->global_state>DSTATE_START) jpeg_abort_decompress(dinfo);
-	if(xinfo) free(xinfo);
-	if(this->jerr.warning) retval=-1;
-	return retval;
+      for (ci = 0; ci < cinfo->num_components; ci++) {
+        jpeg_component_info *compptr = &cinfo->comp_info[ci];
+        tjregion arrayRegion = {
+          0, 0, compptr->width_in_blocks * DCTSIZE, DCTSIZE
+        };
+        tjregion planeRegion = {
+          0, 0, compptr->width_in_blocks * DCTSIZE,
+          compptr->height_in_blocks * DCTSIZE
+        };
+
+        for (by = 0; by < compptr->height_in_blocks;
+             by += compptr->v_samp_factor) {
+          JBLOCKARRAY barray = (dinfo->mem->access_virt_barray)
+            ((j_common_ptr)dinfo, dstcoefs[ci], by, compptr->v_samp_factor,
+             TRUE);
+
+          for (y = 0; y < compptr->v_samp_factor; y++) {
+            if (t[i].customFilter(barray[y][0], arrayRegion, planeRegion, ci,
+                                  i, &t[i]) == -1)
+              _throw("tjTransform(): Error in custom filter");
+            arrayRegion.y += DCTSIZE;
+          }
+        }
+      }
+    }
+    if (!(t[i].options & TJXOPT_NOOUTPUT)) jpeg_finish_compress(cinfo);
+  }
+
+  jpeg_finish_decompress(dinfo);
+
+bailout:
+  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
+  if (xinfo) free(xinfo);
+  if (this->jerr.warning) retval = -1;
+  this->jerr.stopOnWarning = FALSE;
+  return retval;
+}
+
+
+DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
+                                     int align, int *height, int *pixelFormat,
+                                     int flags)
+{
+  int retval = 0, tempc, pitch;
+  tjhandle handle = NULL;
+  tjinstance *this;
+  j_compress_ptr cinfo = NULL;
+  cjpeg_source_ptr src;
+  unsigned char *dstBuf = NULL;
+  FILE *file = NULL;
+  boolean invert;
+
+  if (!filename || !width || align < 1 || !height || !pixelFormat ||
+      *pixelFormat < TJPF_UNKNOWN || *pixelFormat >= TJ_NUMPF)
+    _throwg("tjLoadImage(): Invalid argument");
+  if ((align & (align - 1)) != 0)
+    _throwg("tjLoadImage(): Alignment must be a power of 2");
+
+  if ((handle = tjInitCompress()) == NULL) return NULL;
+  this = (tjinstance *)handle;
+  cinfo = &this->cinfo;
+
+  if ((file = fopen(filename, "rb")) == NULL)
+    _throwunix("tjLoadImage(): Cannot open input file");
+
+  if ((tempc = getc(file)) < 0 || ungetc(tempc, file) == EOF)
+    _throwunix("tjLoadImage(): Could not read input file")
+  else if (tempc == EOF)
+    _throwg("tjLoadImage(): Input file contains no data");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  if (*pixelFormat == TJPF_UNKNOWN) cinfo->in_color_space = JCS_UNKNOWN;
+  else cinfo->in_color_space = pf2cs[*pixelFormat];
+  if (tempc == 'B') {
+    if ((src = jinit_read_bmp(cinfo, FALSE)) == NULL)
+      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+    invert = (flags & TJFLAG_BOTTOMUP) == 0;
+  } else if (tempc == 'P') {
+    if ((src = jinit_read_ppm(cinfo)) == NULL)
+      _throwg("tjLoadImage(): Could not initialize bitmap loader");
+    invert = (flags & TJFLAG_BOTTOMUP) != 0;
+  } else
+    _throwg("tjLoadImage(): Unsupported file type");
+
+  src->input_file = file;
+  (*src->start_input) (cinfo, src);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
+
+  *width = cinfo->image_width;  *height = cinfo->image_height;
+  *pixelFormat = cs2pf[cinfo->in_color_space];
+
+  pitch = PAD((*width) * tjPixelSize[*pixelFormat], align);
+  if ((dstBuf = (unsigned char *)malloc(pitch * (*height))) == NULL)
+    _throwg("tjLoadImage(): Memory allocation failure");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  while (cinfo->next_scanline < cinfo->image_height) {
+    int i, nlines = (*src->get_pixel_rows) (cinfo, src);
+
+    for (i = 0; i < nlines; i++) {
+      unsigned char *dstptr;
+      int row;
+
+      row = cinfo->next_scanline + i;
+      if (invert) dstptr = &dstBuf[((*height) - row - 1) * pitch];
+      else dstptr = &dstBuf[row * pitch];
+      memcpy(dstptr, src->buffer[i], (*width) * tjPixelSize[*pixelFormat]);
+    }
+    cinfo->next_scanline += nlines;
+  }
+
+  (*src->finish_input) (cinfo, src);
+
+bailout:
+  if (handle) tjDestroy(handle);
+  if (file) fclose(file);
+  if (retval < 0 && dstBuf) { free(dstBuf);  dstBuf = NULL; }
+  return dstBuf;
+}
+
+
+DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags)
+{
+  int retval = 0;
+  tjhandle handle = NULL;
+  tjinstance *this;
+  j_decompress_ptr dinfo = NULL;
+  djpeg_dest_ptr dst;
+  FILE *file = NULL;
+  char *ptr = NULL;
+  boolean invert;
+
+  if (!filename || !buffer || width < 1 || pitch < 0 || height < 1 ||
+      pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+    _throwg("tjSaveImage(): Invalid argument");
+
+  if ((handle = tjInitDecompress()) == NULL)
+    return -1;
+  this = (tjinstance *)handle;
+  dinfo = &this->dinfo;
+
+  if ((file = fopen(filename, "wb")) == NULL)
+    _throwunix("tjSaveImage(): Cannot open output file");
+
+  if (setjmp(this->jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error. */
+    retval = -1;  goto bailout;
+  }
+
+  this->dinfo.out_color_space = pf2cs[pixelFormat];
+  dinfo->image_width = width;  dinfo->image_height = height;
+  dinfo->global_state = DSTATE_READY;
+  dinfo->scale_num = dinfo->scale_denom = 1;
+
+  ptr = strrchr(filename, '.');
+  if (ptr && !strcasecmp(ptr, ".bmp")) {
+    if ((dst = jinit_write_bmp(dinfo, FALSE, FALSE)) == NULL)
+      _throwg("tjSaveImage(): Could not initialize bitmap writer");
+    invert = (flags & TJFLAG_BOTTOMUP) == 0;
+  } else {
+    if ((dst = jinit_write_ppm(dinfo)) == NULL)
+      _throwg("tjSaveImage(): Could not initialize PPM writer");
+    invert = (flags & TJFLAG_BOTTOMUP) != 0;
+  }
+
+  dst->output_file = file;
+  (*dst->start_output) (dinfo, dst);
+  (*dinfo->mem->realize_virt_arrays) ((j_common_ptr)dinfo);
+
+  if (pitch == 0) pitch = width * tjPixelSize[pixelFormat];
+
+  while (dinfo->output_scanline < dinfo->output_height) {
+    unsigned char *rowptr;
+
+    if (invert)
+      rowptr = &buffer[(height - dinfo->output_scanline - 1) * pitch];
+    else
+      rowptr = &buffer[dinfo->output_scanline * pitch];
+    memcpy(dst->buffer[0], rowptr, width * tjPixelSize[pixelFormat]);
+    (*dst->put_pixel_rows) (dinfo, dst, 1);
+    dinfo->output_scanline++;
+  }
+
+  (*dst->finish_output) (dinfo, dst);
+
+bailout:
+  if (handle) tjDestroy(handle);
+  if (file) fclose(file);
+  return retval;
 }
diff --git a/turbojpeg.h b/turbojpeg.h
index 307dc6f..ed59ac3 100644
--- a/turbojpeg.h
+++ b/turbojpeg.h
@@ -89,14 +89,13 @@
  * (the human eye is more sensitive to small changes in brightness than to
  * small changes in color.)  This is called "chrominance subsampling".
  */
-enum TJSAMP
-{
+enum TJSAMP {
   /**
    * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG or
    * YUV image will contain one chrominance component for every pixel in the
    * source image.
    */
-  TJSAMP_444=0,
+  TJSAMP_444 = 0,
   /**
    * 4:2:2 chrominance subsampling.  The JPEG or YUV image will contain one
    * chrominance component for every 2x1 block of pixels in the source image.
@@ -141,7 +140,7 @@
  * - 16x16 for 4:2:0
  * - 32x8 for 4:1:1
  */
-static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8, 32};
+static const int tjMCUWidth[TJ_NUMSAMP]  = { 8, 16, 16, 8, 8, 32 };
 
 /**
  * MCU block height (in pixels) for a given level of chrominance subsampling.
@@ -152,7 +151,7 @@
  * - 16x16 for 4:2:0
  * - 32x8 for 4:1:1
  */
-static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16, 8};
+static const int tjMCUHeight[TJ_NUMSAMP] = { 8, 8, 16, 8, 16, 8 };
 
 
 /**
@@ -163,14 +162,13 @@
 /**
  * Pixel formats
  */
-enum TJPF
-{
+enum TJPF {
   /**
    * RGB pixel format.  The red, green, and blue components in the image are
    * stored in 3-byte pixels in the order R, G, B from lowest to highest byte
    * address within each pixel.
    */
-  TJPF_RGB=0,
+  TJPF_RGB = 0,
   /**
    * BGR pixel format.  The red, green, and blue components in the image are
    * stored in 3-byte pixels in the order B, G, R from lowest to highest byte
@@ -249,36 +247,61 @@
    * CMYK pixels into a YCCK JPEG image (see #TJCS_YCCK) and decompressing YCCK
    * JPEG images into CMYK pixels.
    */
-  TJPF_CMYK
+  TJPF_CMYK,
+  /**
+   * Unknown pixel format.  Currently this is only used by #tjLoadImage().
+   */
+  TJPF_UNKNOWN = -1
 };
 
-
 /**
  * Red offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the red component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
- * then the red component will be <tt>pixel[tjRedOffset[TJ_BGRX]]</tt>.
+ * then the red component will be <tt>pixel[tjRedOffset[TJ_BGRX]]</tt>.  This
+ * will be -1 if the pixel format does not have a red component.
  */
-static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1, -1};
+static const int tjRedOffset[TJ_NUMPF] = {
+  0, 2, 0, 2, 3, 1, -1, 0, 2, 3, 1, -1
+};
 /**
  * Green offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the green component is offset from the start of the pixel.
  * For instance, if a pixel of format TJ_BGRX is stored in
  * <tt>char pixel[]</tt>, then the green component will be
- * <tt>pixel[tjGreenOffset[TJ_BGRX]]</tt>.
+ * <tt>pixel[tjGreenOffset[TJ_BGRX]]</tt>.  This will be -1 if the pixel format
+ * does not have a green component.
  */
-static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2, -1};
+static const int tjGreenOffset[TJ_NUMPF] = {
+  1, 1, 1, 1, 2, 2, -1, 1, 1, 2, 2, -1
+};
 /**
  * Blue offset (in bytes) for a given pixel format.  This specifies the number
  * of bytes that the Blue component is offset from the start of the pixel.  For
  * instance, if a pixel of format TJ_BGRX is stored in <tt>char pixel[]</tt>,
- * then the blue component will be <tt>pixel[tjBlueOffset[TJ_BGRX]]</tt>.
+ * then the blue component will be <tt>pixel[tjBlueOffset[TJ_BGRX]]</tt>.  This
+ * will be -1 if the pixel format does not have a blue component.
  */
-static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3, -1};
+static const int tjBlueOffset[TJ_NUMPF] = {
+  2, 0, 2, 0, 1, 3, -1, 2, 0, 1, 3, -1
+};
 /**
- * Pixel size (in bytes) for a given pixel format.
+ * Alpha offset (in bytes) for a given pixel format.  This specifies the number
+ * of bytes that the alpha component is offset from the start of the pixel.
+ * For instance, if a pixel of format TJPF_BGRA is stored in
+ * <tt>char pixel[]</tt>, then the alpha component will be
+ * <tt>pixel[tjAlphaOffset[TJPF_BGRA]]</tt>.  This will be -1 if the pixel
+ * format does not have an alpha component.
  */
-static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4};
+static const int tjAlphaOffset[TJ_NUMPF] = {
+  -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
+/**
+ * Pixel size (in bytes) for a given pixel format.
+ */
+static const int tjPixelSize[TJ_NUMPF] = {
+  3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4
+};
 
 
 /**
@@ -289,8 +312,7 @@
 /**
  * JPEG colorspaces
  */
-enum TJCS
-{
+enum TJCS {
   /**
    * RGB colorspace.  When compressing the JPEG image, the R, G, and B
    * components in the source image are reordered into image planes, but no
@@ -298,7 +320,7 @@
    * decompressed to any of the extended RGB pixel formats or grayscale, but
    * they cannot be decompressed to YUV images.
    */
-  TJCS_RGB=0,
+  TJCS_RGB = 0,
   /**
    * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
    * mathematical transformation of RGB designed solely for storage and
@@ -382,6 +404,41 @@
  * when decompressing, because this has been shown to have a larger effect.
  */
 #define TJFLAG_ACCURATEDCT   4096
+/**
+ * Immediately discontinue the current compression/decompression/transform
+ * operation if the underlying codec throws a warning (non-fatal error).  The
+ * default behavior is to allow the operation to complete unless a fatal error
+ * is encountered.
+ */
+#define TJFLAG_STOPONWARNING 8192
+/**
+ * Use progressive entropy coding in JPEG images generated by the compression
+ * and transform functions.  Progressive entropy coding will generally improve
+ * compression relative to baseline entropy coding (the default), but it will
+ * reduce compression and decompression performance considerably.
+ */
+#define TJFLAG_PROGRESSIVE   16384
+
+
+/**
+ * The number of error codes
+ */
+#define TJ_NUMERR 2
+
+/**
+ * Error codes
+ */
+enum TJERR {
+  /**
+   * The error was non-fatal and recoverable, but the image may still be
+   * corrupt.
+   */
+  TJERR_WARNING = 0,
+  /**
+   * The error was fatal and non-recoverable.
+   */
+  TJERR_FATAL
+};
 
 
 /**
@@ -392,12 +449,11 @@
 /**
  * Transform operations for #tjTransform()
  */
-enum TJXOP
-{
+enum TJXOP {
   /**
    * Do not transform the position of the image pixels
    */
-  TJXOP_NONE=0,
+  TJXOP_NONE = 0,
   /**
    * Flip (mirror) image horizontally.  This transform is imperfect if there
    * are any partial MCU blocks on the right edge (see #TJXOPT_PERFECT.)
@@ -451,36 +507,49 @@
  * that cannot be transformed will be left in place, which will create
  * odd-looking strips on the right or bottom edge of the image.
  */
-#define TJXOPT_PERFECT  1
+#define TJXOPT_PERFECT     1
 /**
  * This option will cause #tjTransform() to discard any partial MCU blocks that
  * cannot be transformed.
  */
-#define TJXOPT_TRIM     2
+#define TJXOPT_TRIM        2
 /**
  * This option will enable lossless cropping.  See #tjTransform() for more
  * information.
  */
-#define TJXOPT_CROP     4
+#define TJXOPT_CROP        4
 /**
  * This option will discard the color data in the input image and produce
  * a grayscale output image.
  */
-#define TJXOPT_GRAY     8
+#define TJXOPT_GRAY        8
 /**
  * This option will prevent #tjTransform() from outputting a JPEG image for
  * this particular transform (this can be used in conjunction with a custom
  * filter to capture the transformed DCT coefficients without transcoding
  * them.)
  */
-#define TJXOPT_NOOUTPUT 16
+#define TJXOPT_NOOUTPUT    16
+/**
+ * This option will enable progressive entropy coding in the output image
+ * generated by this particular transform.  Progressive entropy coding will
+ * generally improve compression relative to baseline entropy coding (the
+ * default), but it will reduce compression and decompression performance
+ * considerably.
+ */
+#define TJXOPT_PROGRESSIVE 32
+/**
+ * This option will prevent #tjTransform() from copying any extra markers
+ * (including EXIF and ICC profile data) from the source image to the output
+ * image.
+ */
+#define TJXOPT_COPYNONE    64
 
 
 /**
  * Scaling factor
  */
-typedef struct
-{
+typedef struct {
   /**
    * Numerator
    */
@@ -494,8 +563,7 @@
 /**
  * Cropping region
  */
-typedef struct
-{
+typedef struct {
   /**
    * The left boundary of the cropping region.  This must be evenly divisible
    * by the MCU block width (see #tjMCUWidth.)
@@ -521,8 +589,7 @@
 /**
  * Lossless transform
  */
-typedef struct tjtransform
-{
+typedef struct tjtransform {
   /**
    * Cropping region
    */
@@ -573,29 +640,30 @@
    *
    * @return 0 if the callback was successful, or -1 if an error occurred.
    */
-  int (*customFilter)(short *coeffs, tjregion arrayRegion,
-    tjregion planeRegion, int componentIndex, int transformIndex,
-    struct tjtransform *transform);
+  int (*customFilter) (short *coeffs, tjregion arrayRegion,
+                       tjregion planeRegion, int componentIndex,
+                       int transformIndex, struct tjtransform *transform);
 } tjtransform;
 
 /**
  * TurboJPEG instance handle
  */
-typedef void* tjhandle;
+typedef void *tjhandle;
 
 
 /**
  * Pad the given width to the nearest 32-bit boundary
  */
-#define TJPAD(width) (((width)+3)&(~3))
+#define TJPAD(width) (((width) + 3) & (~3))
 
 /**
  * Compute the scaled value of <tt>dimension</tt> using the given scaling
  * factor.  This macro performs the integer equivalent of <tt>ceil(dimension *
  * scalingFactor)</tt>.
  */
-#define TJSCALED(dimension, scalingFactor) ((dimension * scalingFactor.num \
-  + scalingFactor.denom - 1) / scalingFactor.denom)
+#define TJSCALED(dimension, scalingFactor) \
+  ((dimension * scalingFactor.num + scalingFactor.denom - 1) / \
+   scalingFactor.denom)
 
 
 #ifdef __cplusplus
@@ -607,9 +675,9 @@
  * Create a TurboJPEG compressor instance.
  *
  * @return a handle to the newly-created instance, or NULL if an error
- * occurred (see #tjGetErrorStr().)
+ * occurred (see #tjGetErrorStr2().)
  */
-DLLEXPORT tjhandle DLLCALL tjInitCompress(void);
+DLLEXPORT tjhandle tjInitCompress(void);
 
 
 /**
@@ -669,11 +737,13 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, const unsigned char *srcBuf,
-  int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
-  unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags);
+DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          unsigned char **jpegBuf, unsigned long *jpegSize,
+                          int jpegSubsamp, int jpegQual, int flags);
 
 
 /**
@@ -733,11 +803,14 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjCompressFromYUV(tjhandle handle,
-  const unsigned char *srcBuf, int width, int pad, int height, int subsamp,
-  unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual, int flags);
+DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pad, int height, int subsamp,
+                                unsigned char **jpegBuf,
+                                unsigned long *jpegSize, int jpegQual,
+                                int flags);
 
 
 /**
@@ -803,12 +876,16 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjCompressFromYUVPlanes(tjhandle handle,
-  const unsigned char **srcPlanes, int width, const int *strides, int height,
-  int subsamp, unsigned char **jpegBuf, unsigned long *jpegSize, int jpegQual,
-  int flags);
+DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
+                                      const unsigned char **srcPlanes,
+                                      int width, const int *strides,
+                                      int height, int subsamp,
+                                      unsigned char **jpegBuf,
+                                      unsigned long *jpegSize, int jpegQual,
+                                      int flags);
 
 
 /**
@@ -833,8 +910,7 @@
  * @return the maximum size of the buffer (in bytes) required to hold the
  * image, or -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjBufSize(int width, int height,
-  int jpegSubsamp);
+DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp);
 
 
 /**
@@ -854,8 +930,8 @@
  * @return the size of the buffer (in bytes) required to hold the image, or
  * -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV2(int width, int pad, int height,
-  int subsamp);
+DLLEXPORT unsigned long tjBufSizeYUV2(int width, int pad, int height,
+                                      int subsamp);
 
 
 /**
@@ -879,8 +955,8 @@
  * @return the size of the buffer (in bytes) required to hold the YUV image
  * plane, or -1 if the arguments are out of bounds.
  */
-DLLEXPORT unsigned long DLLCALL tjPlaneSizeYUV(int componentID, int width,
-  int stride, int height, int subsamp);
+DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
+                                       int height, int subsamp);
 
 
 /**
@@ -963,11 +1039,13 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjEncodeYUV3(tjhandle handle,
-  const unsigned char *srcBuf, int width, int pitch, int height,
-  int pixelFormat, unsigned char *dstBuf, int pad, int subsamp, int flags);
+DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int pad, int subsamp,
+                           int flags);
 
 
 /**
@@ -1021,21 +1099,22 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjEncodeYUVPlanes(tjhandle handle,
-  const unsigned char *srcBuf, int width, int pitch, int height,
-  int pixelFormat, unsigned char **dstPlanes, int *strides, int subsamp,
-  int flags);
+DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pitch, int height,
+                                int pixelFormat, unsigned char **dstPlanes,
+                                int *strides, int subsamp, int flags);
 
 
 /**
  * Create a TurboJPEG decompressor instance.
  *
  * @return a handle to the newly-created instance, or NULL if an error
- * occurred (see #tjGetErrorStr().)
+ * occurred (see #tjGetErrorStr2().)
 */
-DLLEXPORT tjhandle DLLCALL tjInitDecompress(void);
+DLLEXPORT tjhandle tjInitDecompress(void);
 
 
 /**
@@ -1061,11 +1140,14 @@
  * of the JPEG colorspace constants, indicating the colorspace of the JPEG
  * image (see @ref TJCS "JPEG colorspaces".)
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
 */
-DLLEXPORT int DLLCALL tjDecompressHeader3(tjhandle handle,
-  const unsigned char *jpegBuf, unsigned long jpegSize, int *width,
-  int *height, int *jpegSubsamp, int *jpegColorspace);
+DLLEXPORT int tjDecompressHeader3(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp,
+                                  int *jpegColorspace);
 
 
 /**
@@ -1076,9 +1158,9 @@
  * the number of elements in the list
  *
  * @return a pointer to a list of fractional scaling factors, or NULL if an
- * error is encountered (see #tjGetErrorStr().)
+ * error is encountered (see #tjGetErrorStr2().)
 */
-DLLEXPORT tjscalingfactor* DLLCALL tjGetScalingFactors(int *numscalingfactors);
+DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numscalingfactors);
 
 
 /**
@@ -1128,11 +1210,13 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
-  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int width, int pitch, int height, int pixelFormat, int flags);
+DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags);
 
 
 /**
@@ -1178,11 +1262,12 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjDecompressToYUV2(tjhandle handle,
-  const unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int width, int pad, int height, int flags);
+DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
+                                 unsigned long jpegSize, unsigned char *dstBuf,
+                                 int width, int pad, int height, int flags);
 
 
 /**
@@ -1234,11 +1319,14 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjDecompressToYUVPlanes(tjhandle handle,
-  const unsigned char *jpegBuf, unsigned long jpegSize,
-  unsigned char **dstPlanes, int width, int *strides, int height, int flags);
+DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
+                                      const unsigned char *jpegBuf,
+                                      unsigned long jpegSize,
+                                      unsigned char **dstPlanes, int width,
+                                      int *strides, int height, int flags);
 
 
 /**
@@ -1286,11 +1374,13 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
-  int pad, int subsamp, unsigned char *dstBuf, int width, int pitch,
-  int height, int pixelFormat, int flags);
+DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+                          int pad, int subsamp, unsigned char *dstBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
 
 
 /**
@@ -1343,21 +1433,23 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjDecodeYUVPlanes(tjhandle handle,
-  const unsigned char **srcPlanes, const int *strides, int subsamp,
-  unsigned char *dstBuf, int width, int pitch, int height, int pixelFormat,
-  int flags);
+DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
+                                const unsigned char **srcPlanes,
+                                const int *strides, int subsamp,
+                                unsigned char *dstBuf, int width, int pitch,
+                                int height, int pixelFormat, int flags);
 
 
 /**
  * Create a new TurboJPEG transformer instance.
  *
  * @return a handle to the newly-created instance, or NULL if an error
- * occurred (see #tjGetErrorStr().)
+ * occurred (see #tjGetErrorStr2().)
  */
-DLLEXPORT tjhandle DLLCALL tjInitTransform(void);
+DLLEXPORT tjhandle tjInitTransform(void);
 
 
 /**
@@ -1417,12 +1509,13 @@
  * @param flags the bitwise OR of one or more of the @ref TJFLAG_ACCURATEDCT
  * "flags"
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2()
+ * and #tjGetErrorCode().)
  */
-DLLEXPORT int DLLCALL tjTransform(tjhandle handle,
-  const unsigned char *jpegBuf, unsigned long jpegSize, int n,
-  unsigned char **dstBufs, unsigned long *dstSizes, tjtransform *transforms,
-  int flags);
+DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
+                          unsigned long jpegSize, int n,
+                          unsigned char **dstBufs, unsigned long *dstSizes,
+                          tjtransform *transforms, int flags);
 
 
 /**
@@ -1431,9 +1524,9 @@
  * @param handle a handle to a TurboJPEG compressor, decompressor or
  * transformer instance
  *
- * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2().)
  */
-DLLEXPORT int DLLCALL tjDestroy(tjhandle handle);
+DLLEXPORT int tjDestroy(tjhandle handle);
 
 
 /**
@@ -1449,7 +1542,92 @@
  *
  * @sa tjFree()
  */
-DLLEXPORT unsigned char* DLLCALL tjAlloc(int bytes);
+DLLEXPORT unsigned char *tjAlloc(int bytes);
+
+
+/**
+ * Load an uncompressed image from disk into memory.
+ *
+ * @param filename name of a file containing an uncompressed image in Windows
+ * BMP or PBMPLUS (PPM/PGM) format
+ *
+ * @param width pointer to an integer variable that will receive the width (in
+ * pixels) of the uncompressed image
+ *
+ * @param align row alignment of the image buffer to be returned (must be a
+ * power of 2.)  For instance, setting this parameter to 4 will cause all rows
+ * in the image buffer to be padded to the nearest 32-bit boundary, and setting
+ * this parameter to 1 will cause all rows in the image buffer to be unpadded.
+ *
+ * @param height pointer to an integer variable that will receive the height
+ * (in pixels) of the uncompressed image
+ *
+ * @param pixelFormat pointer to an integer variable that specifies or will
+ * receive the pixel format of the uncompressed image buffer.  The behavior of
+ * #tjLoadImage() will vary depending on the value of <tt>*pixelFormat</tt>
+ * passed to the function:
+ * - @ref TJPF_UNKNOWN : The uncompressed image buffer returned by the function
+ * will use the optimal pixel format for the file type, and
+ * <tt>*pixelFormat</tt> will contain the ID of this pixel format upon
+ * successful return from the function.
+ * - @ref TJPF_GRAY : Only PGM files and 8-bit BMP files with a grayscale
+ * colormap can be loaded.
+ * - @ref TJPF_CMYK : The RGB or grayscale pixels stored in the file will be
+ * converted using a quick & dirty algorithm that is suitable only for testing
+ * purposes (proper conversion between CMYK and other formats requires a color
+ * management system.)
+ * - Other @ref TJPF "pixel formats" : The uncompressed image buffer will use
+ * the specified pixel format, and pixel format conversion will be performed if
+ * necessary.
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags".
+ *
+ * @return a pointer to a newly-allocated buffer containing the uncompressed
+ * image, converted to the chosen pixel format and with the chosen row
+ * alignment, or NULL if an error occurred (see #tjGetErrorStr2().)  This
+ * buffer should be freed using #tjFree().
+ */
+DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
+                                     int align, int *height, int *pixelFormat,
+                                     int flags);
+
+
+/**
+ * Save an uncompressed image from memory to disk.
+ *
+ * @param filename name of a file to which to save the uncompressed image.
+ * The image will be stored in Windows BMP or PBMPLUS (PPM/PGM) format,
+ * depending on the file extension.
+ *
+ * @param buffer pointer to an image buffer containing RGB, grayscale, or
+ * CMYK pixels to be saved
+ *
+ * @param width width (in pixels) of the uncompressed image
+ *
+ * @param pitch bytes per line in the image buffer.  Setting this parameter to
+ * 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the uncompressed image
+ *
+ * @param pixelFormat pixel format of the image buffer (see @ref TJPF
+ * "Pixel formats".)  If this parameter is set to @ref TJPF_GRAY, then the
+ * image will be stored in PGM or 8-bit (indexed color) BMP format.  Otherwise,
+ * the image will be stored in PPM or 24-bit BMP format.  If this parameter
+ * is set to @ref TJPF_CMYK, then the CMYK pixels will be converted to RGB
+ * using a quick & dirty algorithm that is suitable only for testing (proper
+ * conversion between CMYK and other formats requires a color management
+ * system.)
+ *
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ * "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr2().)
+ */
+DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
 
 
 /**
@@ -1462,15 +1640,33 @@
  *
  * @sa tjAlloc()
  */
-DLLEXPORT void DLLCALL tjFree(unsigned char *buffer);
+DLLEXPORT void tjFree(unsigned char *buffer);
 
 
 /**
  * Returns a descriptive error message explaining why the last command failed.
  *
+ * @param handle a handle to a TurboJPEG compressor, decompressor, or
+ * transformer instance, or NULL if the error was generated by a global
+ * function (but note that retrieving the error message for a global function
+ * is not thread-safe.)
+ *
  * @return a descriptive error message explaining why the last command failed.
  */
-DLLEXPORT char* DLLCALL tjGetErrorStr(void);
+DLLEXPORT char *tjGetErrorStr2(tjhandle handle);
+
+
+/**
+ * Returns a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ *
+ * @param handle a handle to a TurboJPEG compressor, decompressor or
+ * transformer instance
+ *
+ * @return a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ */
+DLLEXPORT int tjGetErrorCode(tjhandle handle);
 
 
 /* Deprecated functions and macros */
@@ -1498,40 +1694,43 @@
 #define TJ_FASTUPSAMPLE TJFLAG_FASTUPSAMPLE
 #define TJ_YUV 512
 
-DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height);
+DLLEXPORT unsigned long TJBUFSIZE(int width, int height);
 
-DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
-  int jpegSubsamp);
+DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int jpegSubsamp);
 
-DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
-  int subsamp);
+DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp);
 
-DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
-  int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
-  unsigned long *compressedSize, int jpegSubsamp, int jpegQual, int flags);
+DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
+                         int pitch, int height, int pixelSize,
+                         unsigned char *dstBuf, unsigned long *compressedSize,
+                         int jpegSubsamp, int jpegQual, int flags);
 
-DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle,
-  unsigned char *srcBuf, int width, int pitch, int height, int pixelSize,
-  unsigned char *dstBuf, int subsamp, int flags);
+DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
+                          int pitch, int height, int pixelSize,
+                          unsigned char *dstBuf, int subsamp, int flags);
 
-DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
-  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
-  unsigned char *dstBuf, int subsamp, int flags);
+DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
+                           int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int subsamp, int flags);
 
-DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height);
+DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
+                                 unsigned long jpegSize, int *width,
+                                 int *height);
 
-DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
-  int *jpegSubsamp);
+DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp);
 
-DLLEXPORT int DLLCALL tjDecompress(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int width, int pitch, int height, int pixelSize, int flags);
+DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
+                           unsigned long jpegSize, unsigned char *dstBuf,
+                           int width, int pitch, int height, int pixelSize,
+                           int flags);
 
-DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
-  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
-  int flags);
+DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
+                                unsigned long jpegSize, unsigned char *dstBuf,
+                                int flags);
+
+DLLEXPORT char *tjGetErrorStr(void);
 
 
 /**
diff --git a/win/jconfig.h.in b/win/jconfig.h.in
index 9d35121..e3c3d1c 100644
--- a/win/jconfig.h.in
+++ b/win/jconfig.h.in
@@ -1,35 +1,24 @@
-/* jconfig.vc --- jconfig.h for Microsoft Visual C++ on Windows 95 or NT. */
-/* see jconfig.txt for explanations */
-
 #define JPEG_LIB_VERSION @JPEG_LIB_VERSION@
 #define LIBJPEG_TURBO_VERSION @VERSION@
 #define LIBJPEG_TURBO_VERSION_NUMBER @LIBJPEG_TURBO_VERSION_NUMBER@
+
 #cmakedefine C_ARITH_CODING_SUPPORTED
 #cmakedefine D_ARITH_CODING_SUPPORTED
 #cmakedefine MEM_SRCDST_SUPPORTED
-
-/*
- * Define BITS_IN_JSAMPLE as either
- *   8   for 8-bit sample values (the usual setting)
- *   12  for 12-bit sample values
- * Only 8 and 12 are legal data precisions for lossy JPEG according to the
- * JPEG standard, and the IJG code does not support anything else!
- * We do not support run-time selection of data precision, sorry.
- */
+#cmakedefine WITH_SIMD
 
 #define BITS_IN_JSAMPLE  @BITS_IN_JSAMPLE@      /* use 8 or 12 */
 
-#define HAVE_UNSIGNED_CHAR
-#define HAVE_UNSIGNED_SHORT
-/* #define void char */
-/* #define const */
-#undef __CHAR_UNSIGNED__
 #define HAVE_STDDEF_H
 #define HAVE_STDLIB_H
-#undef NEED_BSD_STRINGS
 #undef NEED_SYS_TYPES_H
-#undef NEED_FAR_POINTERS	/* we presume a 32-bit flat memory model */
+#undef NEED_BSD_STRINGS
+
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
 #undef INCOMPLETE_TYPES_BROKEN
+#undef RIGHT_SHIFT_IS_UNSIGNED
+#undef __CHAR_UNSIGNED__
 
 /* Define "boolean" as unsigned char, not int, per Windows custom */
 #ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
@@ -43,9 +32,3 @@
 typedef signed int INT32;
 #endif
 #define XMD_H                   /* prevent jmorecfg.h from redefining it */
-
-#ifdef JPEG_INTERNALS
-
-#undef RIGHT_SHIFT_IS_UNSIGNED
-
-#endif /* JPEG_INTERNALS */
diff --git a/win/jconfigint.h.in b/win/jconfigint.h.in
deleted file mode 100644
index 2131bf5..0000000
--- a/win/jconfigint.h.in
+++ /dev/null
@@ -1,13 +0,0 @@
-#define VERSION "@VERSION@"
-#define BUILD "@BUILD@"
-#define PACKAGE_NAME "@CMAKE_PROJECT_NAME@"
-
-#ifndef INLINE
-#if defined(__GNUC__)
-#define INLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER)
-#define INLINE __forceinline
-#else
-#define INLINE
-#endif
-#endif
diff --git a/win/jpeg62-memsrcdst.def b/win/jpeg62-memsrcdst.def
index 6499316..4d24a14 100755
--- a/win/jpeg62-memsrcdst.def
+++ b/win/jpeg62-memsrcdst.def
@@ -1,106 +1,108 @@
 EXPORTS
-	jcopy_block_row @ 1 ; 
-	jcopy_sample_rows @ 2 ; 
-	jdiv_round_up @ 3 ; 
-	jinit_1pass_quantizer @ 4 ; 
-	jinit_2pass_quantizer @ 5 ; 
-	jinit_c_coef_controller @ 6 ; 
-	jinit_c_main_controller @ 7 ; 
-	jinit_c_master_control @ 8 ; 
-	jinit_c_prep_controller @ 9 ; 
-	jinit_color_converter @ 10 ; 
-	jinit_color_deconverter @ 11 ; 
-	jinit_compress_master @ 12 ; 
-	jinit_d_coef_controller @ 13 ; 
-	jinit_d_main_controller @ 14 ; 
-	jinit_d_post_controller @ 15 ; 
-	jinit_downsampler @ 16 ; 
-	jinit_forward_dct @ 17 ; 
-	jinit_huff_decoder @ 18 ; 
-	jinit_huff_encoder @ 19 ; 
-	jinit_input_controller @ 20 ; 
-	jinit_inverse_dct @ 21 ; 
-	jinit_marker_reader @ 22 ; 
-	jinit_marker_writer @ 23 ; 
-	jinit_master_decompress @ 24 ; 
-	jinit_memory_mgr @ 25 ; 
-	jinit_merged_upsampler @ 26 ; 
-	jinit_phuff_decoder @ 27 ; 
-	jinit_phuff_encoder @ 28 ; 
-	jinit_upsampler @ 29 ; 
-	jpeg_CreateCompress @ 30 ; 
-	jpeg_CreateDecompress @ 31 ; 
-	jpeg_abort @ 32 ; 
-	jpeg_abort_compress @ 33 ; 
-	jpeg_abort_decompress @ 34 ; 
-	jpeg_add_quant_table @ 35 ; 
-	jpeg_alloc_huff_table @ 36 ; 
-	jpeg_alloc_quant_table @ 37 ; 
-	jpeg_calc_output_dimensions @ 38 ; 
-	jpeg_consume_input @ 39 ; 
-	jpeg_copy_critical_parameters @ 40 ; 
-	jpeg_default_colorspace @ 41 ; 
-	jpeg_destroy @ 42 ; 
-	jpeg_destroy_compress @ 43 ; 
-	jpeg_destroy_decompress @ 44 ; 
-	jpeg_fdct_float @ 45 ; 
-	jpeg_fdct_ifast @ 46 ; 
-	jpeg_fdct_islow @ 47 ; 
-	jpeg_fill_bit_buffer @ 48 ; 
-	jpeg_finish_compress @ 49 ; 
-	jpeg_finish_decompress @ 50 ; 
-	jpeg_finish_output @ 51 ; 
-	jpeg_free_large @ 52 ; 
-	jpeg_free_small @ 53 ; 
-	jpeg_gen_optimal_table @ 54 ; 
-	jpeg_get_large @ 55 ; 
-	jpeg_get_small @ 56 ; 
-	jpeg_has_multiple_scans @ 57 ; 
-	jpeg_huff_decode @ 58 ; 
-	jpeg_idct_1x1 @ 59 ; 
-	jpeg_idct_2x2 @ 60 ; 
-	jpeg_idct_4x4 @ 61 ; 
-	jpeg_idct_float @ 62 ; 
-	jpeg_idct_ifast @ 63 ; 
-	jpeg_idct_islow @ 64 ; 
-	jpeg_input_complete @ 65 ; 
-	jpeg_make_c_derived_tbl @ 66 ; 
-	jpeg_make_d_derived_tbl @ 67 ; 
-	jpeg_mem_available @ 68 ; 
-	jpeg_mem_init @ 69 ; 
-	jpeg_mem_term @ 70 ; 
-	jpeg_new_colormap @ 71 ; 
-	jpeg_open_backing_store @ 72 ; 
-	jpeg_quality_scaling @ 73 ; 
-	jpeg_read_coefficients @ 74 ; 
-	jpeg_read_header @ 75 ; 
-	jpeg_read_raw_data @ 76 ; 
-	jpeg_read_scanlines @ 77 ; 
-	jpeg_resync_to_restart @ 78 ; 
-	jpeg_save_markers @ 79 ; 
-	jpeg_set_colorspace @ 80 ; 
-	jpeg_set_defaults @ 81 ; 
-	jpeg_set_linear_quality @ 82 ; 
-	jpeg_set_marker_processor @ 83 ; 
-	jpeg_set_quality @ 84 ; 
-	jpeg_simple_progression @ 85 ; 
-	jpeg_start_compress @ 86 ; 
-	jpeg_start_decompress @ 87 ; 
-	jpeg_start_output @ 88 ; 
-	jpeg_std_error @ 89 ; 
-	jpeg_stdio_dest @ 90 ; 
-	jpeg_stdio_src @ 91 ; 
-	jpeg_suppress_tables @ 92 ; 
-	jpeg_write_coefficients @ 93 ; 
-	jpeg_write_m_byte @ 94 ; 
-	jpeg_write_m_header @ 95 ; 
-	jpeg_write_marker @ 96 ; 
-	jpeg_write_raw_data @ 97 ; 
-	jpeg_write_scanlines @ 98 ; 
-	jpeg_write_tables @ 99 ; 
-	jround_up @ 100 ; 
-	jzero_far @ 101 ; 
-	jpeg_mem_dest @ 102 ; 
-	jpeg_mem_src @ 103 ; 
-	jpeg_skip_scanlines @ 104 ; 
-	jpeg_crop_scanline @ 105 ; 
+  jcopy_block_row @ 1 ;
+  jcopy_sample_rows @ 2 ;
+  jdiv_round_up @ 3 ;
+  jinit_1pass_quantizer @ 4 ;
+  jinit_2pass_quantizer @ 5 ;
+  jinit_c_coef_controller @ 6 ;
+  jinit_c_main_controller @ 7 ;
+  jinit_c_master_control @ 8 ;
+  jinit_c_prep_controller @ 9 ;
+  jinit_color_converter @ 10 ;
+  jinit_color_deconverter @ 11 ;
+  jinit_compress_master @ 12 ;
+  jinit_d_coef_controller @ 13 ;
+  jinit_d_main_controller @ 14 ;
+  jinit_d_post_controller @ 15 ;
+  jinit_downsampler @ 16 ;
+  jinit_forward_dct @ 17 ;
+  jinit_huff_decoder @ 18 ;
+  jinit_huff_encoder @ 19 ;
+  jinit_input_controller @ 20 ;
+  jinit_inverse_dct @ 21 ;
+  jinit_marker_reader @ 22 ;
+  jinit_marker_writer @ 23 ;
+  jinit_master_decompress @ 24 ;
+  jinit_memory_mgr @ 25 ;
+  jinit_merged_upsampler @ 26 ;
+  jinit_phuff_decoder @ 27 ;
+  jinit_phuff_encoder @ 28 ;
+  jinit_upsampler @ 29 ;
+  jpeg_CreateCompress @ 30 ;
+  jpeg_CreateDecompress @ 31 ;
+  jpeg_abort @ 32 ;
+  jpeg_abort_compress @ 33 ;
+  jpeg_abort_decompress @ 34 ;
+  jpeg_add_quant_table @ 35 ;
+  jpeg_alloc_huff_table @ 36 ;
+  jpeg_alloc_quant_table @ 37 ;
+  jpeg_calc_output_dimensions @ 38 ;
+  jpeg_consume_input @ 39 ;
+  jpeg_copy_critical_parameters @ 40 ;
+  jpeg_default_colorspace @ 41 ;
+  jpeg_destroy @ 42 ;
+  jpeg_destroy_compress @ 43 ;
+  jpeg_destroy_decompress @ 44 ;
+  jpeg_fdct_float @ 45 ;
+  jpeg_fdct_ifast @ 46 ;
+  jpeg_fdct_islow @ 47 ;
+  jpeg_fill_bit_buffer @ 48 ;
+  jpeg_finish_compress @ 49 ;
+  jpeg_finish_decompress @ 50 ;
+  jpeg_finish_output @ 51 ;
+  jpeg_free_large @ 52 ;
+  jpeg_free_small @ 53 ;
+  jpeg_gen_optimal_table @ 54 ;
+  jpeg_get_large @ 55 ;
+  jpeg_get_small @ 56 ;
+  jpeg_has_multiple_scans @ 57 ;
+  jpeg_huff_decode @ 58 ;
+  jpeg_idct_1x1 @ 59 ;
+  jpeg_idct_2x2 @ 60 ;
+  jpeg_idct_4x4 @ 61 ;
+  jpeg_idct_float @ 62 ;
+  jpeg_idct_ifast @ 63 ;
+  jpeg_idct_islow @ 64 ;
+  jpeg_input_complete @ 65 ;
+  jpeg_make_c_derived_tbl @ 66 ;
+  jpeg_make_d_derived_tbl @ 67 ;
+  jpeg_mem_available @ 68 ;
+  jpeg_mem_init @ 69 ;
+  jpeg_mem_term @ 70 ;
+  jpeg_new_colormap @ 71 ;
+  jpeg_open_backing_store @ 72 ;
+  jpeg_quality_scaling @ 73 ;
+  jpeg_read_coefficients @ 74 ;
+  jpeg_read_header @ 75 ;
+  jpeg_read_raw_data @ 76 ;
+  jpeg_read_scanlines @ 77 ;
+  jpeg_resync_to_restart @ 78 ;
+  jpeg_save_markers @ 79 ;
+  jpeg_set_colorspace @ 80 ;
+  jpeg_set_defaults @ 81 ;
+  jpeg_set_linear_quality @ 82 ;
+  jpeg_set_marker_processor @ 83 ;
+  jpeg_set_quality @ 84 ;
+  jpeg_simple_progression @ 85 ;
+  jpeg_start_compress @ 86 ;
+  jpeg_start_decompress @ 87 ;
+  jpeg_start_output @ 88 ;
+  jpeg_std_error @ 89 ;
+  jpeg_stdio_dest @ 90 ;
+  jpeg_stdio_src @ 91 ;
+  jpeg_suppress_tables @ 92 ;
+  jpeg_write_coefficients @ 93 ;
+  jpeg_write_m_byte @ 94 ;
+  jpeg_write_m_header @ 95 ;
+  jpeg_write_marker @ 96 ;
+  jpeg_write_raw_data @ 97 ;
+  jpeg_write_scanlines @ 98 ;
+  jpeg_write_tables @ 99 ;
+  jround_up @ 100 ;
+  jzero_far @ 101 ;
+  jpeg_mem_dest @ 102 ;
+  jpeg_mem_src @ 103 ;
+  jpeg_skip_scanlines @ 104 ;
+  jpeg_crop_scanline @ 105 ;
+  jpeg_read_icc_profile @ 106 ;
+  jpeg_write_icc_profile @ 107 ;
diff --git a/win/jpeg62.def b/win/jpeg62.def
index 9f30b1a..f3c69b2 100755
--- a/win/jpeg62.def
+++ b/win/jpeg62.def
@@ -1,104 +1,106 @@
 EXPORTS
-	jcopy_block_row @ 1 ; 
-	jcopy_sample_rows @ 2 ; 
-	jdiv_round_up @ 3 ; 
-	jinit_1pass_quantizer @ 4 ; 
-	jinit_2pass_quantizer @ 5 ; 
-	jinit_c_coef_controller @ 6 ; 
-	jinit_c_main_controller @ 7 ; 
-	jinit_c_master_control @ 8 ; 
-	jinit_c_prep_controller @ 9 ; 
-	jinit_color_converter @ 10 ; 
-	jinit_color_deconverter @ 11 ; 
-	jinit_compress_master @ 12 ; 
-	jinit_d_coef_controller @ 13 ; 
-	jinit_d_main_controller @ 14 ; 
-	jinit_d_post_controller @ 15 ; 
-	jinit_downsampler @ 16 ; 
-	jinit_forward_dct @ 17 ; 
-	jinit_huff_decoder @ 18 ; 
-	jinit_huff_encoder @ 19 ; 
-	jinit_input_controller @ 20 ; 
-	jinit_inverse_dct @ 21 ; 
-	jinit_marker_reader @ 22 ; 
-	jinit_marker_writer @ 23 ; 
-	jinit_master_decompress @ 24 ; 
-	jinit_memory_mgr @ 25 ; 
-	jinit_merged_upsampler @ 26 ; 
-	jinit_phuff_decoder @ 27 ; 
-	jinit_phuff_encoder @ 28 ; 
-	jinit_upsampler @ 29 ; 
-	jpeg_CreateCompress @ 30 ; 
-	jpeg_CreateDecompress @ 31 ; 
-	jpeg_abort @ 32 ; 
-	jpeg_abort_compress @ 33 ; 
-	jpeg_abort_decompress @ 34 ; 
-	jpeg_add_quant_table @ 35 ; 
-	jpeg_alloc_huff_table @ 36 ; 
-	jpeg_alloc_quant_table @ 37 ; 
-	jpeg_calc_output_dimensions @ 38 ; 
-	jpeg_consume_input @ 39 ; 
-	jpeg_copy_critical_parameters @ 40 ; 
-	jpeg_default_colorspace @ 41 ; 
-	jpeg_destroy @ 42 ; 
-	jpeg_destroy_compress @ 43 ; 
-	jpeg_destroy_decompress @ 44 ; 
-	jpeg_fdct_float @ 45 ; 
-	jpeg_fdct_ifast @ 46 ; 
-	jpeg_fdct_islow @ 47 ; 
-	jpeg_fill_bit_buffer @ 48 ; 
-	jpeg_finish_compress @ 49 ; 
-	jpeg_finish_decompress @ 50 ; 
-	jpeg_finish_output @ 51 ; 
-	jpeg_free_large @ 52 ; 
-	jpeg_free_small @ 53 ; 
-	jpeg_gen_optimal_table @ 54 ; 
-	jpeg_get_large @ 55 ; 
-	jpeg_get_small @ 56 ; 
-	jpeg_has_multiple_scans @ 57 ; 
-	jpeg_huff_decode @ 58 ; 
-	jpeg_idct_1x1 @ 59 ; 
-	jpeg_idct_2x2 @ 60 ; 
-	jpeg_idct_4x4 @ 61 ; 
-	jpeg_idct_float @ 62 ; 
-	jpeg_idct_ifast @ 63 ; 
-	jpeg_idct_islow @ 64 ; 
-	jpeg_input_complete @ 65 ; 
-	jpeg_make_c_derived_tbl @ 66 ; 
-	jpeg_make_d_derived_tbl @ 67 ; 
-	jpeg_mem_available @ 68 ; 
-	jpeg_mem_init @ 69 ; 
-	jpeg_mem_term @ 70 ; 
-	jpeg_new_colormap @ 71 ; 
-	jpeg_open_backing_store @ 72 ; 
-	jpeg_quality_scaling @ 73 ; 
-	jpeg_read_coefficients @ 74 ; 
-	jpeg_read_header @ 75 ; 
-	jpeg_read_raw_data @ 76 ; 
-	jpeg_read_scanlines @ 77 ; 
-	jpeg_resync_to_restart @ 78 ; 
-	jpeg_save_markers @ 79 ; 
-	jpeg_set_colorspace @ 80 ; 
-	jpeg_set_defaults @ 81 ; 
-	jpeg_set_linear_quality @ 82 ; 
-	jpeg_set_marker_processor @ 83 ; 
-	jpeg_set_quality @ 84 ; 
-	jpeg_simple_progression @ 85 ; 
-	jpeg_start_compress @ 86 ; 
-	jpeg_start_decompress @ 87 ; 
-	jpeg_start_output @ 88 ; 
-	jpeg_std_error @ 89 ; 
-	jpeg_stdio_dest @ 90 ; 
-	jpeg_stdio_src @ 91 ; 
-	jpeg_suppress_tables @ 92 ; 
-	jpeg_write_coefficients @ 93 ; 
-	jpeg_write_m_byte @ 94 ; 
-	jpeg_write_m_header @ 95 ; 
-	jpeg_write_marker @ 96 ; 
-	jpeg_write_raw_data @ 97 ; 
-	jpeg_write_scanlines @ 98 ; 
-	jpeg_write_tables @ 99 ; 
-	jround_up @ 100 ; 
-	jzero_far @ 101 ; 
-	jpeg_skip_scanlines @ 102 ; 
-	jpeg_crop_scanline @ 103 ; 
+  jcopy_block_row @ 1 ;
+  jcopy_sample_rows @ 2 ;
+  jdiv_round_up @ 3 ;
+  jinit_1pass_quantizer @ 4 ;
+  jinit_2pass_quantizer @ 5 ;
+  jinit_c_coef_controller @ 6 ;
+  jinit_c_main_controller @ 7 ;
+  jinit_c_master_control @ 8 ;
+  jinit_c_prep_controller @ 9 ;
+  jinit_color_converter @ 10 ;
+  jinit_color_deconverter @ 11 ;
+  jinit_compress_master @ 12 ;
+  jinit_d_coef_controller @ 13 ;
+  jinit_d_main_controller @ 14 ;
+  jinit_d_post_controller @ 15 ;
+  jinit_downsampler @ 16 ;
+  jinit_forward_dct @ 17 ;
+  jinit_huff_decoder @ 18 ;
+  jinit_huff_encoder @ 19 ;
+  jinit_input_controller @ 20 ;
+  jinit_inverse_dct @ 21 ;
+  jinit_marker_reader @ 22 ;
+  jinit_marker_writer @ 23 ;
+  jinit_master_decompress @ 24 ;
+  jinit_memory_mgr @ 25 ;
+  jinit_merged_upsampler @ 26 ;
+  jinit_phuff_decoder @ 27 ;
+  jinit_phuff_encoder @ 28 ;
+  jinit_upsampler @ 29 ;
+  jpeg_CreateCompress @ 30 ;
+  jpeg_CreateDecompress @ 31 ;
+  jpeg_abort @ 32 ;
+  jpeg_abort_compress @ 33 ;
+  jpeg_abort_decompress @ 34 ;
+  jpeg_add_quant_table @ 35 ;
+  jpeg_alloc_huff_table @ 36 ;
+  jpeg_alloc_quant_table @ 37 ;
+  jpeg_calc_output_dimensions @ 38 ;
+  jpeg_consume_input @ 39 ;
+  jpeg_copy_critical_parameters @ 40 ;
+  jpeg_default_colorspace @ 41 ;
+  jpeg_destroy @ 42 ;
+  jpeg_destroy_compress @ 43 ;
+  jpeg_destroy_decompress @ 44 ;
+  jpeg_fdct_float @ 45 ;
+  jpeg_fdct_ifast @ 46 ;
+  jpeg_fdct_islow @ 47 ;
+  jpeg_fill_bit_buffer @ 48 ;
+  jpeg_finish_compress @ 49 ;
+  jpeg_finish_decompress @ 50 ;
+  jpeg_finish_output @ 51 ;
+  jpeg_free_large @ 52 ;
+  jpeg_free_small @ 53 ;
+  jpeg_gen_optimal_table @ 54 ;
+  jpeg_get_large @ 55 ;
+  jpeg_get_small @ 56 ;
+  jpeg_has_multiple_scans @ 57 ;
+  jpeg_huff_decode @ 58 ;
+  jpeg_idct_1x1 @ 59 ;
+  jpeg_idct_2x2 @ 60 ;
+  jpeg_idct_4x4 @ 61 ;
+  jpeg_idct_float @ 62 ;
+  jpeg_idct_ifast @ 63 ;
+  jpeg_idct_islow @ 64 ;
+  jpeg_input_complete @ 65 ;
+  jpeg_make_c_derived_tbl @ 66 ;
+  jpeg_make_d_derived_tbl @ 67 ;
+  jpeg_mem_available @ 68 ;
+  jpeg_mem_init @ 69 ;
+  jpeg_mem_term @ 70 ;
+  jpeg_new_colormap @ 71 ;
+  jpeg_open_backing_store @ 72 ;
+  jpeg_quality_scaling @ 73 ;
+  jpeg_read_coefficients @ 74 ;
+  jpeg_read_header @ 75 ;
+  jpeg_read_raw_data @ 76 ;
+  jpeg_read_scanlines @ 77 ;
+  jpeg_resync_to_restart @ 78 ;
+  jpeg_save_markers @ 79 ;
+  jpeg_set_colorspace @ 80 ;
+  jpeg_set_defaults @ 81 ;
+  jpeg_set_linear_quality @ 82 ;
+  jpeg_set_marker_processor @ 83 ;
+  jpeg_set_quality @ 84 ;
+  jpeg_simple_progression @ 85 ;
+  jpeg_start_compress @ 86 ;
+  jpeg_start_decompress @ 87 ;
+  jpeg_start_output @ 88 ;
+  jpeg_std_error @ 89 ;
+  jpeg_stdio_dest @ 90 ;
+  jpeg_stdio_src @ 91 ;
+  jpeg_suppress_tables @ 92 ;
+  jpeg_write_coefficients @ 93 ;
+  jpeg_write_m_byte @ 94 ;
+  jpeg_write_m_header @ 95 ;
+  jpeg_write_marker @ 96 ;
+  jpeg_write_raw_data @ 97 ;
+  jpeg_write_scanlines @ 98 ;
+  jpeg_write_tables @ 99 ;
+  jround_up @ 100 ;
+  jzero_far @ 101 ;
+  jpeg_skip_scanlines @ 102 ;
+  jpeg_crop_scanline @ 103 ;
+  jpeg_read_icc_profile @ 104 ;
+  jpeg_write_icc_profile @ 105 ;
diff --git a/win/jpeg7-memsrcdst.def b/win/jpeg7-memsrcdst.def
index 37a4777..a005aff 100644
--- a/win/jpeg7-memsrcdst.def
+++ b/win/jpeg7-memsrcdst.def
@@ -1,108 +1,110 @@
 EXPORTS
-	jcopy_block_row @ 1 ; 
-	jcopy_sample_rows @ 2 ; 
-	jdiv_round_up @ 3 ; 
-	jinit_1pass_quantizer @ 4 ; 
-	jinit_2pass_quantizer @ 5 ; 
-	jinit_c_coef_controller @ 6 ; 
-	jinit_c_main_controller @ 7 ; 
-	jinit_c_master_control @ 8 ; 
-	jinit_c_prep_controller @ 9 ; 
-	jinit_color_converter @ 10 ; 
-	jinit_color_deconverter @ 11 ; 
-	jinit_compress_master @ 12 ; 
-	jinit_d_coef_controller @ 13 ; 
-	jinit_d_main_controller @ 14 ; 
-	jinit_d_post_controller @ 15 ; 
-	jinit_downsampler @ 16 ; 
-	jinit_forward_dct @ 17 ; 
-	jinit_huff_decoder @ 18 ; 
-	jinit_huff_encoder @ 19 ; 
-	jinit_input_controller @ 20 ; 
-	jinit_inverse_dct @ 21 ; 
-	jinit_marker_reader @ 22 ; 
-	jinit_marker_writer @ 23 ; 
-	jinit_master_decompress @ 24 ; 
-	jinit_memory_mgr @ 25 ; 
-	jinit_merged_upsampler @ 26 ; 
-	jinit_phuff_decoder @ 27 ; 
-	jinit_phuff_encoder @ 28 ; 
-	jinit_upsampler @ 29 ; 
-	jpeg_CreateCompress @ 30 ; 
-	jpeg_CreateDecompress @ 31 ; 
-	jpeg_abort @ 32 ; 
-	jpeg_abort_compress @ 33 ; 
-	jpeg_abort_decompress @ 34 ; 
-	jpeg_add_quant_table @ 35 ; 
-	jpeg_alloc_huff_table @ 36 ; 
-	jpeg_alloc_quant_table @ 37 ; 
-	jpeg_calc_jpeg_dimensions @ 38 ; 
-	jpeg_calc_output_dimensions @ 39 ; 
-	jpeg_consume_input @ 40 ; 
-	jpeg_copy_critical_parameters @ 41 ; 
-	jpeg_default_colorspace @ 42 ; 
-	jpeg_default_qtables @ 43 ;
-	jpeg_destroy @ 44 ; 
-	jpeg_destroy_compress @ 45 ; 
-	jpeg_destroy_decompress @ 46 ; 
-	jpeg_fdct_float @ 47 ; 
-	jpeg_fdct_ifast @ 48 ; 
-	jpeg_fdct_islow @ 49 ; 
-	jpeg_fill_bit_buffer @ 50 ; 
-	jpeg_finish_compress @ 51 ; 
-	jpeg_finish_decompress @ 52 ; 
-	jpeg_finish_output @ 53 ; 
-	jpeg_free_large @ 54 ; 
-	jpeg_free_small @ 55 ; 
-	jpeg_gen_optimal_table @ 56 ; 
-	jpeg_get_large @ 57 ; 
-	jpeg_get_small @ 58 ; 
-	jpeg_has_multiple_scans @ 59 ; 
-	jpeg_huff_decode @ 60 ; 
-	jpeg_idct_1x1 @ 61 ; 
-	jpeg_idct_2x2 @ 62 ; 
-	jpeg_idct_4x4 @ 63 ; 
-	jpeg_idct_float @ 64 ; 
-	jpeg_idct_ifast @ 65 ; 
-	jpeg_idct_islow @ 66 ; 
-	jpeg_input_complete @ 67 ; 
-	jpeg_make_c_derived_tbl @ 68 ; 
-	jpeg_make_d_derived_tbl @ 69 ; 
-	jpeg_mem_available @ 70 ; 
-	jpeg_mem_init @ 71 ; 
-	jpeg_mem_term @ 72 ; 
-	jpeg_new_colormap @ 73 ; 
-	jpeg_open_backing_store @ 74 ; 
-	jpeg_quality_scaling @ 75 ; 
-	jpeg_read_coefficients @ 76 ; 
-	jpeg_read_header @ 77 ; 
-	jpeg_read_raw_data @ 78 ; 
-	jpeg_read_scanlines @ 79 ; 
-	jpeg_resync_to_restart @ 80 ; 
-	jpeg_save_markers @ 81 ; 
-	jpeg_set_colorspace @ 82 ; 
-	jpeg_set_defaults @ 83 ; 
-	jpeg_set_linear_quality @ 84 ; 
-	jpeg_set_marker_processor @ 85 ; 
-	jpeg_set_quality @ 86 ; 
-	jpeg_simple_progression @ 87 ; 
-	jpeg_start_compress @ 88 ; 
-	jpeg_start_decompress @ 89 ; 
-	jpeg_start_output @ 90 ; 
-	jpeg_std_error @ 91 ; 
-	jpeg_stdio_dest @ 92 ; 
-	jpeg_stdio_src @ 93 ; 
-	jpeg_suppress_tables @ 94 ; 
-	jpeg_write_coefficients @ 95 ; 
-	jpeg_write_m_byte @ 96 ; 
-	jpeg_write_m_header @ 97 ; 
-	jpeg_write_marker @ 98 ; 
-	jpeg_write_raw_data @ 99 ; 
-	jpeg_write_scanlines @ 100 ; 
-	jpeg_write_tables @ 101 ; 
-	jround_up @ 102 ; 
-	jzero_far @ 103 ; 
-	jpeg_mem_dest @ 104 ; 
-	jpeg_mem_src @ 105 ; 
-	jpeg_skip_scanlines @ 106 ; 
-	jpeg_crop_scanline @ 107 ; 
+  jcopy_block_row @ 1 ;
+  jcopy_sample_rows @ 2 ;
+  jdiv_round_up @ 3 ;
+  jinit_1pass_quantizer @ 4 ;
+  jinit_2pass_quantizer @ 5 ;
+  jinit_c_coef_controller @ 6 ;
+  jinit_c_main_controller @ 7 ;
+  jinit_c_master_control @ 8 ;
+  jinit_c_prep_controller @ 9 ;
+  jinit_color_converter @ 10 ;
+  jinit_color_deconverter @ 11 ;
+  jinit_compress_master @ 12 ;
+  jinit_d_coef_controller @ 13 ;
+  jinit_d_main_controller @ 14 ;
+  jinit_d_post_controller @ 15 ;
+  jinit_downsampler @ 16 ;
+  jinit_forward_dct @ 17 ;
+  jinit_huff_decoder @ 18 ;
+  jinit_huff_encoder @ 19 ;
+  jinit_input_controller @ 20 ;
+  jinit_inverse_dct @ 21 ;
+  jinit_marker_reader @ 22 ;
+  jinit_marker_writer @ 23 ;
+  jinit_master_decompress @ 24 ;
+  jinit_memory_mgr @ 25 ;
+  jinit_merged_upsampler @ 26 ;
+  jinit_phuff_decoder @ 27 ;
+  jinit_phuff_encoder @ 28 ;
+  jinit_upsampler @ 29 ;
+  jpeg_CreateCompress @ 30 ;
+  jpeg_CreateDecompress @ 31 ;
+  jpeg_abort @ 32 ;
+  jpeg_abort_compress @ 33 ;
+  jpeg_abort_decompress @ 34 ;
+  jpeg_add_quant_table @ 35 ;
+  jpeg_alloc_huff_table @ 36 ;
+  jpeg_alloc_quant_table @ 37 ;
+  jpeg_calc_jpeg_dimensions @ 38 ;
+  jpeg_calc_output_dimensions @ 39 ;
+  jpeg_consume_input @ 40 ;
+  jpeg_copy_critical_parameters @ 41 ;
+  jpeg_default_colorspace @ 42 ;
+  jpeg_default_qtables @ 43 ;
+  jpeg_destroy @ 44 ;
+  jpeg_destroy_compress @ 45 ;
+  jpeg_destroy_decompress @ 46 ;
+  jpeg_fdct_float @ 47 ;
+  jpeg_fdct_ifast @ 48 ;
+  jpeg_fdct_islow @ 49 ;
+  jpeg_fill_bit_buffer @ 50 ;
+  jpeg_finish_compress @ 51 ;
+  jpeg_finish_decompress @ 52 ;
+  jpeg_finish_output @ 53 ;
+  jpeg_free_large @ 54 ;
+  jpeg_free_small @ 55 ;
+  jpeg_gen_optimal_table @ 56 ;
+  jpeg_get_large @ 57 ;
+  jpeg_get_small @ 58 ;
+  jpeg_has_multiple_scans @ 59 ;
+  jpeg_huff_decode @ 60 ;
+  jpeg_idct_1x1 @ 61 ;
+  jpeg_idct_2x2 @ 62 ;
+  jpeg_idct_4x4 @ 63 ;
+  jpeg_idct_float @ 64 ;
+  jpeg_idct_ifast @ 65 ;
+  jpeg_idct_islow @ 66 ;
+  jpeg_input_complete @ 67 ;
+  jpeg_make_c_derived_tbl @ 68 ;
+  jpeg_make_d_derived_tbl @ 69 ;
+  jpeg_mem_available @ 70 ;
+  jpeg_mem_init @ 71 ;
+  jpeg_mem_term @ 72 ;
+  jpeg_new_colormap @ 73 ;
+  jpeg_open_backing_store @ 74 ;
+  jpeg_quality_scaling @ 75 ;
+  jpeg_read_coefficients @ 76 ;
+  jpeg_read_header @ 77 ;
+  jpeg_read_raw_data @ 78 ;
+  jpeg_read_scanlines @ 79 ;
+  jpeg_resync_to_restart @ 80 ;
+  jpeg_save_markers @ 81 ;
+  jpeg_set_colorspace @ 82 ;
+  jpeg_set_defaults @ 83 ;
+  jpeg_set_linear_quality @ 84 ;
+  jpeg_set_marker_processor @ 85 ;
+  jpeg_set_quality @ 86 ;
+  jpeg_simple_progression @ 87 ;
+  jpeg_start_compress @ 88 ;
+  jpeg_start_decompress @ 89 ;
+  jpeg_start_output @ 90 ;
+  jpeg_std_error @ 91 ;
+  jpeg_stdio_dest @ 92 ;
+  jpeg_stdio_src @ 93 ;
+  jpeg_suppress_tables @ 94 ;
+  jpeg_write_coefficients @ 95 ;
+  jpeg_write_m_byte @ 96 ;
+  jpeg_write_m_header @ 97 ;
+  jpeg_write_marker @ 98 ;
+  jpeg_write_raw_data @ 99 ;
+  jpeg_write_scanlines @ 100 ;
+  jpeg_write_tables @ 101 ;
+  jround_up @ 102 ;
+  jzero_far @ 103 ;
+  jpeg_mem_dest @ 104 ;
+  jpeg_mem_src @ 105 ;
+  jpeg_skip_scanlines @ 106 ;
+  jpeg_crop_scanline @ 107 ;
+  jpeg_read_icc_profile @ 108 ;
+  jpeg_write_icc_profile @ 109 ;
diff --git a/win/jpeg7.def b/win/jpeg7.def
index 92463c5..49f4c02 100644
--- a/win/jpeg7.def
+++ b/win/jpeg7.def
@@ -1,106 +1,108 @@
 EXPORTS
-	jcopy_block_row @ 1 ; 
-	jcopy_sample_rows @ 2 ; 
-	jdiv_round_up @ 3 ; 
-	jinit_1pass_quantizer @ 4 ; 
-	jinit_2pass_quantizer @ 5 ; 
-	jinit_c_coef_controller @ 6 ; 
-	jinit_c_main_controller @ 7 ; 
-	jinit_c_master_control @ 8 ; 
-	jinit_c_prep_controller @ 9 ; 
-	jinit_color_converter @ 10 ; 
-	jinit_color_deconverter @ 11 ; 
-	jinit_compress_master @ 12 ; 
-	jinit_d_coef_controller @ 13 ; 
-	jinit_d_main_controller @ 14 ; 
-	jinit_d_post_controller @ 15 ; 
-	jinit_downsampler @ 16 ; 
-	jinit_forward_dct @ 17 ; 
-	jinit_huff_decoder @ 18 ; 
-	jinit_huff_encoder @ 19 ; 
-	jinit_input_controller @ 20 ; 
-	jinit_inverse_dct @ 21 ; 
-	jinit_marker_reader @ 22 ; 
-	jinit_marker_writer @ 23 ; 
-	jinit_master_decompress @ 24 ; 
-	jinit_memory_mgr @ 25 ; 
-	jinit_merged_upsampler @ 26 ; 
-	jinit_phuff_decoder @ 27 ; 
-	jinit_phuff_encoder @ 28 ; 
-	jinit_upsampler @ 29 ; 
-	jpeg_CreateCompress @ 30 ; 
-	jpeg_CreateDecompress @ 31 ; 
-	jpeg_abort @ 32 ; 
-	jpeg_abort_compress @ 33 ; 
-	jpeg_abort_decompress @ 34 ; 
-	jpeg_add_quant_table @ 35 ; 
-	jpeg_alloc_huff_table @ 36 ; 
-	jpeg_alloc_quant_table @ 37 ; 
-	jpeg_calc_jpeg_dimensions @ 38 ; 
-	jpeg_calc_output_dimensions @ 39 ; 
-	jpeg_consume_input @ 40 ; 
-	jpeg_copy_critical_parameters @ 41 ; 
-	jpeg_default_colorspace @ 42 ; 
-	jpeg_default_qtables @ 43 ;
-	jpeg_destroy @ 44 ; 
-	jpeg_destroy_compress @ 45 ; 
-	jpeg_destroy_decompress @ 46 ; 
-	jpeg_fdct_float @ 47 ; 
-	jpeg_fdct_ifast @ 48 ; 
-	jpeg_fdct_islow @ 49 ; 
-	jpeg_fill_bit_buffer @ 50 ; 
-	jpeg_finish_compress @ 51 ; 
-	jpeg_finish_decompress @ 52 ; 
-	jpeg_finish_output @ 53 ; 
-	jpeg_free_large @ 54 ; 
-	jpeg_free_small @ 55 ; 
-	jpeg_gen_optimal_table @ 56 ; 
-	jpeg_get_large @ 57 ; 
-	jpeg_get_small @ 58 ; 
-	jpeg_has_multiple_scans @ 59 ; 
-	jpeg_huff_decode @ 60 ; 
-	jpeg_idct_1x1 @ 61 ; 
-	jpeg_idct_2x2 @ 62 ; 
-	jpeg_idct_4x4 @ 63 ; 
-	jpeg_idct_float @ 64 ; 
-	jpeg_idct_ifast @ 65 ; 
-	jpeg_idct_islow @ 66 ; 
-	jpeg_input_complete @ 67 ; 
-	jpeg_make_c_derived_tbl @ 68 ; 
-	jpeg_make_d_derived_tbl @ 69 ; 
-	jpeg_mem_available @ 70 ; 
-	jpeg_mem_init @ 71 ; 
-	jpeg_mem_term @ 72 ; 
-	jpeg_new_colormap @ 73 ; 
-	jpeg_open_backing_store @ 74 ; 
-	jpeg_quality_scaling @ 75 ; 
-	jpeg_read_coefficients @ 76 ; 
-	jpeg_read_header @ 77 ; 
-	jpeg_read_raw_data @ 78 ; 
-	jpeg_read_scanlines @ 79 ; 
-	jpeg_resync_to_restart @ 80 ; 
-	jpeg_save_markers @ 81 ; 
-	jpeg_set_colorspace @ 82 ; 
-	jpeg_set_defaults @ 83 ; 
-	jpeg_set_linear_quality @ 84 ; 
-	jpeg_set_marker_processor @ 85 ; 
-	jpeg_set_quality @ 86 ; 
-	jpeg_simple_progression @ 87 ; 
-	jpeg_start_compress @ 88 ; 
-	jpeg_start_decompress @ 89 ; 
-	jpeg_start_output @ 90 ; 
-	jpeg_std_error @ 91 ; 
-	jpeg_stdio_dest @ 92 ; 
-	jpeg_stdio_src @ 93 ; 
-	jpeg_suppress_tables @ 94 ; 
-	jpeg_write_coefficients @ 95 ; 
-	jpeg_write_m_byte @ 96 ; 
-	jpeg_write_m_header @ 97 ; 
-	jpeg_write_marker @ 98 ; 
-	jpeg_write_raw_data @ 99 ; 
-	jpeg_write_scanlines @ 100 ; 
-	jpeg_write_tables @ 101 ; 
-	jround_up @ 102 ; 
-	jzero_far @ 103 ; 
-	jpeg_skip_scanlines @ 104 ; 
-	jpeg_crop_scanline @ 105 ; 
+  jcopy_block_row @ 1 ;
+  jcopy_sample_rows @ 2 ;
+  jdiv_round_up @ 3 ;
+  jinit_1pass_quantizer @ 4 ;
+  jinit_2pass_quantizer @ 5 ;
+  jinit_c_coef_controller @ 6 ;
+  jinit_c_main_controller @ 7 ;
+  jinit_c_master_control @ 8 ;
+  jinit_c_prep_controller @ 9 ;
+  jinit_color_converter @ 10 ;
+  jinit_color_deconverter @ 11 ;
+  jinit_compress_master @ 12 ;
+  jinit_d_coef_controller @ 13 ;
+  jinit_d_main_controller @ 14 ;
+  jinit_d_post_controller @ 15 ;
+  jinit_downsampler @ 16 ;
+  jinit_forward_dct @ 17 ;
+  jinit_huff_decoder @ 18 ;
+  jinit_huff_encoder @ 19 ;
+  jinit_input_controller @ 20 ;
+  jinit_inverse_dct @ 21 ;
+  jinit_marker_reader @ 22 ;
+  jinit_marker_writer @ 23 ;
+  jinit_master_decompress @ 24 ;
+  jinit_memory_mgr @ 25 ;
+  jinit_merged_upsampler @ 26 ;
+  jinit_phuff_decoder @ 27 ;
+  jinit_phuff_encoder @ 28 ;
+  jinit_upsampler @ 29 ;
+  jpeg_CreateCompress @ 30 ;
+  jpeg_CreateDecompress @ 31 ;
+  jpeg_abort @ 32 ;
+  jpeg_abort_compress @ 33 ;
+  jpeg_abort_decompress @ 34 ;
+  jpeg_add_quant_table @ 35 ;
+  jpeg_alloc_huff_table @ 36 ;
+  jpeg_alloc_quant_table @ 37 ;
+  jpeg_calc_jpeg_dimensions @ 38 ;
+  jpeg_calc_output_dimensions @ 39 ;
+  jpeg_consume_input @ 40 ;
+  jpeg_copy_critical_parameters @ 41 ;
+  jpeg_default_colorspace @ 42 ;
+  jpeg_default_qtables @ 43 ;
+  jpeg_destroy @ 44 ;
+  jpeg_destroy_compress @ 45 ;
+  jpeg_destroy_decompress @ 46 ;
+  jpeg_fdct_float @ 47 ;
+  jpeg_fdct_ifast @ 48 ;
+  jpeg_fdct_islow @ 49 ;
+  jpeg_fill_bit_buffer @ 50 ;
+  jpeg_finish_compress @ 51 ;
+  jpeg_finish_decompress @ 52 ;
+  jpeg_finish_output @ 53 ;
+  jpeg_free_large @ 54 ;
+  jpeg_free_small @ 55 ;
+  jpeg_gen_optimal_table @ 56 ;
+  jpeg_get_large @ 57 ;
+  jpeg_get_small @ 58 ;
+  jpeg_has_multiple_scans @ 59 ;
+  jpeg_huff_decode @ 60 ;
+  jpeg_idct_1x1 @ 61 ;
+  jpeg_idct_2x2 @ 62 ;
+  jpeg_idct_4x4 @ 63 ;
+  jpeg_idct_float @ 64 ;
+  jpeg_idct_ifast @ 65 ;
+  jpeg_idct_islow @ 66 ;
+  jpeg_input_complete @ 67 ;
+  jpeg_make_c_derived_tbl @ 68 ;
+  jpeg_make_d_derived_tbl @ 69 ;
+  jpeg_mem_available @ 70 ;
+  jpeg_mem_init @ 71 ;
+  jpeg_mem_term @ 72 ;
+  jpeg_new_colormap @ 73 ;
+  jpeg_open_backing_store @ 74 ;
+  jpeg_quality_scaling @ 75 ;
+  jpeg_read_coefficients @ 76 ;
+  jpeg_read_header @ 77 ;
+  jpeg_read_raw_data @ 78 ;
+  jpeg_read_scanlines @ 79 ;
+  jpeg_resync_to_restart @ 80 ;
+  jpeg_save_markers @ 81 ;
+  jpeg_set_colorspace @ 82 ;
+  jpeg_set_defaults @ 83 ;
+  jpeg_set_linear_quality @ 84 ;
+  jpeg_set_marker_processor @ 85 ;
+  jpeg_set_quality @ 86 ;
+  jpeg_simple_progression @ 87 ;
+  jpeg_start_compress @ 88 ;
+  jpeg_start_decompress @ 89 ;
+  jpeg_start_output @ 90 ;
+  jpeg_std_error @ 91 ;
+  jpeg_stdio_dest @ 92 ;
+  jpeg_stdio_src @ 93 ;
+  jpeg_suppress_tables @ 94 ;
+  jpeg_write_coefficients @ 95 ;
+  jpeg_write_m_byte @ 96 ;
+  jpeg_write_m_header @ 97 ;
+  jpeg_write_marker @ 98 ;
+  jpeg_write_raw_data @ 99 ;
+  jpeg_write_scanlines @ 100 ;
+  jpeg_write_tables @ 101 ;
+  jround_up @ 102 ;
+  jzero_far @ 103 ;
+  jpeg_skip_scanlines @ 104 ;
+  jpeg_crop_scanline @ 105 ;
+  jpeg_read_icc_profile @ 106 ;
+  jpeg_write_icc_profile @ 107 ;
diff --git a/win/jpeg8.def b/win/jpeg8.def
index 19246ac..0a53125 100644
--- a/win/jpeg8.def
+++ b/win/jpeg8.def
@@ -1,109 +1,111 @@
 EXPORTS
-	jcopy_block_row @ 1 ; 
-	jcopy_sample_rows @ 2 ; 
-	jdiv_round_up @ 3 ; 
-	jinit_1pass_quantizer @ 4 ; 
-	jinit_2pass_quantizer @ 5 ; 
-	jinit_c_coef_controller @ 6 ; 
-	jinit_c_main_controller @ 7 ; 
-	jinit_c_master_control @ 8 ; 
-	jinit_c_prep_controller @ 9 ; 
-	jinit_color_converter @ 10 ; 
-	jinit_color_deconverter @ 11 ; 
-	jinit_compress_master @ 12 ; 
-	jinit_d_coef_controller @ 13 ; 
-	jinit_d_main_controller @ 14 ; 
-	jinit_d_post_controller @ 15 ; 
-	jinit_downsampler @ 16 ; 
-	jinit_forward_dct @ 17 ; 
-	jinit_huff_decoder @ 18 ; 
-	jinit_huff_encoder @ 19 ; 
-	jinit_input_controller @ 20 ; 
-	jinit_inverse_dct @ 21 ; 
-	jinit_marker_reader @ 22 ; 
-	jinit_marker_writer @ 23 ; 
-	jinit_master_decompress @ 24 ; 
-	jinit_memory_mgr @ 25 ; 
-	jinit_merged_upsampler @ 26 ; 
-	jinit_phuff_decoder @ 27 ; 
-	jinit_phuff_encoder @ 28 ; 
-	jinit_upsampler @ 29 ; 
-	jpeg_CreateCompress @ 30 ; 
-	jpeg_CreateDecompress @ 31 ; 
-	jpeg_abort @ 32 ; 
-	jpeg_abort_compress @ 33 ; 
-	jpeg_abort_decompress @ 34 ; 
-	jpeg_add_quant_table @ 35 ; 
-	jpeg_alloc_huff_table @ 36 ; 
-	jpeg_alloc_quant_table @ 37 ; 
-	jpeg_calc_jpeg_dimensions @ 38 ; 
-	jpeg_calc_output_dimensions @ 39 ; 
-	jpeg_consume_input @ 40 ; 
-	jpeg_copy_critical_parameters @ 41 ; 
-	jpeg_core_output_dimensions @ 42 ; 
-	jpeg_default_colorspace @ 43 ; 
-	jpeg_default_qtables @ 44 ;
-	jpeg_destroy @ 45 ; 
-	jpeg_destroy_compress @ 46 ; 
-	jpeg_destroy_decompress @ 47 ; 
-	jpeg_fdct_float @ 48 ; 
-	jpeg_fdct_ifast @ 49 ; 
-	jpeg_fdct_islow @ 50 ; 
-	jpeg_fill_bit_buffer @ 51 ; 
-	jpeg_finish_compress @ 52 ; 
-	jpeg_finish_decompress @ 53 ; 
-	jpeg_finish_output @ 54 ; 
-	jpeg_free_large @ 55 ; 
-	jpeg_free_small @ 56 ; 
-	jpeg_gen_optimal_table @ 57 ; 
-	jpeg_get_large @ 58 ; 
-	jpeg_get_small @ 59 ; 
-	jpeg_has_multiple_scans @ 60 ; 
-	jpeg_huff_decode @ 61 ; 
-	jpeg_idct_1x1 @ 62 ; 
-	jpeg_idct_2x2 @ 63 ; 
-	jpeg_idct_4x4 @ 64 ; 
-	jpeg_idct_float @ 65 ; 
-	jpeg_idct_ifast @ 66 ; 
-	jpeg_idct_islow @ 67 ; 
-	jpeg_input_complete @ 68 ; 
-	jpeg_make_c_derived_tbl @ 69 ; 
-	jpeg_make_d_derived_tbl @ 70 ; 
-	jpeg_mem_available @ 71 ; 
-	jpeg_mem_dest @ 72 ;
-	jpeg_mem_init @ 73 ; 
-	jpeg_mem_src @ 74 ;
-	jpeg_mem_term @ 75 ; 
-	jpeg_new_colormap @ 76 ; 
-	jpeg_open_backing_store @ 77 ; 
-	jpeg_quality_scaling @ 78 ; 
-	jpeg_read_coefficients @ 79 ; 
-	jpeg_read_header @ 80 ; 
-	jpeg_read_raw_data @ 81 ; 
-	jpeg_read_scanlines @ 82 ; 
-	jpeg_resync_to_restart @ 83 ; 
-	jpeg_save_markers @ 84 ; 
-	jpeg_set_colorspace @ 85 ; 
-	jpeg_set_defaults @ 86 ; 
-	jpeg_set_linear_quality @ 87 ; 
-	jpeg_set_marker_processor @ 88 ; 
-	jpeg_set_quality @ 89 ; 
-	jpeg_simple_progression @ 90 ; 
-	jpeg_start_compress @ 91 ; 
-	jpeg_start_decompress @ 92 ; 
-	jpeg_start_output @ 93 ; 
-	jpeg_std_error @ 94 ; 
-	jpeg_stdio_dest @ 95 ; 
-	jpeg_stdio_src @ 96 ; 
-	jpeg_suppress_tables @ 97 ; 
-	jpeg_write_coefficients @ 98 ; 
-	jpeg_write_m_byte @ 99 ; 
-	jpeg_write_m_header @ 100 ; 
-	jpeg_write_marker @ 101 ; 
-	jpeg_write_raw_data @ 102 ; 
-	jpeg_write_scanlines @ 103 ; 
-	jpeg_write_tables @ 104 ; 
-	jround_up @ 105 ; 
-	jzero_far @ 106 ; 
-	jpeg_skip_scanlines @ 107 ; 
-	jpeg_crop_scanline @ 108 ; 
+  jcopy_block_row @ 1 ;
+  jcopy_sample_rows @ 2 ;
+  jdiv_round_up @ 3 ;
+  jinit_1pass_quantizer @ 4 ;
+  jinit_2pass_quantizer @ 5 ;
+  jinit_c_coef_controller @ 6 ;
+  jinit_c_main_controller @ 7 ;
+  jinit_c_master_control @ 8 ;
+  jinit_c_prep_controller @ 9 ;
+  jinit_color_converter @ 10 ;
+  jinit_color_deconverter @ 11 ;
+  jinit_compress_master @ 12 ;
+  jinit_d_coef_controller @ 13 ;
+  jinit_d_main_controller @ 14 ;
+  jinit_d_post_controller @ 15 ;
+  jinit_downsampler @ 16 ;
+  jinit_forward_dct @ 17 ;
+  jinit_huff_decoder @ 18 ;
+  jinit_huff_encoder @ 19 ;
+  jinit_input_controller @ 20 ;
+  jinit_inverse_dct @ 21 ;
+  jinit_marker_reader @ 22 ;
+  jinit_marker_writer @ 23 ;
+  jinit_master_decompress @ 24 ;
+  jinit_memory_mgr @ 25 ;
+  jinit_merged_upsampler @ 26 ;
+  jinit_phuff_decoder @ 27 ;
+  jinit_phuff_encoder @ 28 ;
+  jinit_upsampler @ 29 ;
+  jpeg_CreateCompress @ 30 ;
+  jpeg_CreateDecompress @ 31 ;
+  jpeg_abort @ 32 ;
+  jpeg_abort_compress @ 33 ;
+  jpeg_abort_decompress @ 34 ;
+  jpeg_add_quant_table @ 35 ;
+  jpeg_alloc_huff_table @ 36 ;
+  jpeg_alloc_quant_table @ 37 ;
+  jpeg_calc_jpeg_dimensions @ 38 ;
+  jpeg_calc_output_dimensions @ 39 ;
+  jpeg_consume_input @ 40 ;
+  jpeg_copy_critical_parameters @ 41 ;
+  jpeg_core_output_dimensions @ 42 ;
+  jpeg_default_colorspace @ 43 ;
+  jpeg_default_qtables @ 44 ;
+  jpeg_destroy @ 45 ;
+  jpeg_destroy_compress @ 46 ;
+  jpeg_destroy_decompress @ 47 ;
+  jpeg_fdct_float @ 48 ;
+  jpeg_fdct_ifast @ 49 ;
+  jpeg_fdct_islow @ 50 ;
+  jpeg_fill_bit_buffer @ 51 ;
+  jpeg_finish_compress @ 52 ;
+  jpeg_finish_decompress @ 53 ;
+  jpeg_finish_output @ 54 ;
+  jpeg_free_large @ 55 ;
+  jpeg_free_small @ 56 ;
+  jpeg_gen_optimal_table @ 57 ;
+  jpeg_get_large @ 58 ;
+  jpeg_get_small @ 59 ;
+  jpeg_has_multiple_scans @ 60 ;
+  jpeg_huff_decode @ 61 ;
+  jpeg_idct_1x1 @ 62 ;
+  jpeg_idct_2x2 @ 63 ;
+  jpeg_idct_4x4 @ 64 ;
+  jpeg_idct_float @ 65 ;
+  jpeg_idct_ifast @ 66 ;
+  jpeg_idct_islow @ 67 ;
+  jpeg_input_complete @ 68 ;
+  jpeg_make_c_derived_tbl @ 69 ;
+  jpeg_make_d_derived_tbl @ 70 ;
+  jpeg_mem_available @ 71 ;
+  jpeg_mem_dest @ 72 ;
+  jpeg_mem_init @ 73 ;
+  jpeg_mem_src @ 74 ;
+  jpeg_mem_term @ 75 ;
+  jpeg_new_colormap @ 76 ;
+  jpeg_open_backing_store @ 77 ;
+  jpeg_quality_scaling @ 78 ;
+  jpeg_read_coefficients @ 79 ;
+  jpeg_read_header @ 80 ;
+  jpeg_read_raw_data @ 81 ;
+  jpeg_read_scanlines @ 82 ;
+  jpeg_resync_to_restart @ 83 ;
+  jpeg_save_markers @ 84 ;
+  jpeg_set_colorspace @ 85 ;
+  jpeg_set_defaults @ 86 ;
+  jpeg_set_linear_quality @ 87 ;
+  jpeg_set_marker_processor @ 88 ;
+  jpeg_set_quality @ 89 ;
+  jpeg_simple_progression @ 90 ;
+  jpeg_start_compress @ 91 ;
+  jpeg_start_decompress @ 92 ;
+  jpeg_start_output @ 93 ;
+  jpeg_std_error @ 94 ;
+  jpeg_stdio_dest @ 95 ;
+  jpeg_stdio_src @ 96 ;
+  jpeg_suppress_tables @ 97 ;
+  jpeg_write_coefficients @ 98 ;
+  jpeg_write_m_byte @ 99 ;
+  jpeg_write_m_header @ 100 ;
+  jpeg_write_marker @ 101 ;
+  jpeg_write_raw_data @ 102 ;
+  jpeg_write_scanlines @ 103 ;
+  jpeg_write_tables @ 104 ;
+  jround_up @ 105 ;
+  jzero_far @ 106 ;
+  jpeg_skip_scanlines @ 107 ;
+  jpeg_crop_scanline @ 108 ;
+  jpeg_read_icc_profile @ 109 ;
+  jpeg_write_icc_profile @ 110 ;
diff --git a/win/jsimdcfg.inc b/win/jsimdcfg.inc
index 9d4aede..667024a 100755
--- a/win/jsimdcfg.inc
+++ b/win/jsimdcfg.inc
@@ -90,5 +90,4 @@
 %define JSIMD_3DNOW 0x02
 %define JSIMD_SSE 0x04
 %define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
+%define JSIMD_AVX2 0x80
diff --git a/wrbmp.c b/wrbmp.c
index 728bbad..38a64e8 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -21,6 +21,7 @@
  * This code contributed by James Arthur Boucher.
  */
 
+#include "cmyk.h"
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 #include "jconfigint.h"
 
@@ -56,15 +57,26 @@
   JDIMENSION row_width;         /* physical width of one row in the BMP file */
   int pad_bytes;                /* number of padding bytes needed per row */
   JDIMENSION cur_output_row;    /* next row# to write to virtual array */
+
+  boolean use_inversion_array;  /* TRUE = buffer the whole image, which is
+                                   stored to disk in bottom-up order, and
+                                   receive rows from the calling program in
+                                   top-down order
+
+                                   FALSE = the calling program will maintain
+                                   its own image buffer and write the rows in
+                                   bottom-up order */
+
+  JSAMPLE *iobuffer;            /* I/O buffer (used to buffer a single row to
+                                   disk if use_inversion_array == FALSE) */
 } bmp_dest_struct;
 
 typedef bmp_dest_struct *bmp_dest_ptr;
 
 
 /* Forward declarations */
-LOCAL(void) write_colormap
-        (j_decompress_ptr cinfo, bmp_dest_ptr dest, int map_colors,
-         int map_entry_size);
+LOCAL(void) write_colormap(j_decompress_ptr cinfo, bmp_dest_ptr dest,
+                           int map_colors, int map_entry_size);
 
 
 static INLINE boolean is_big_endian(void)
@@ -82,29 +94,36 @@
  */
 
 METHODDEF(void)
-put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                JDIMENSION rows_supplied)
+put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+               JDIMENSION rows_supplied)
 /* This version is for writing 24-bit pixels */
 {
-  bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
+  bmp_dest_ptr dest = (bmp_dest_ptr)dinfo;
   JSAMPARRAY image_ptr;
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
   int pad;
 
-  /* Access next row in virtual array */
-  image_ptr = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, dest->whole_image,
-     dest->cur_output_row, (JDIMENSION) 1, TRUE);
-  dest->cur_output_row++;
+  if (dest->use_inversion_array) {
+    /* Access next row in virtual array */
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, dest->whole_image,
+       dest->cur_output_row, (JDIMENSION)1, TRUE);
+    dest->cur_output_row++;
+    outptr = image_ptr[0];
+  } else {
+    outptr = dest->iobuffer;
+  }
 
   /* Transfer data.  Note destination values must be in BGR order
    * (even though Microsoft's own documents say the opposite).
    */
   inptr = dest->pub.buffer[0];
-  outptr = image_ptr[0];
 
-  if (cinfo->out_color_space == JCS_RGB565) {
+  if (cinfo->out_color_space == JCS_EXT_BGR) {
+    MEMCOPY(outptr, inptr, dest->row_width);
+    outptr += cinfo->output_width * 3;
+  } else if (cinfo->out_color_space == JCS_RGB565) {
     boolean big_endian = is_big_endian();
     unsigned short *inptr2 = (unsigned short *)inptr;
     for (col = cinfo->output_width; col > 0; col--) {
@@ -120,61 +139,70 @@
       outptr += 3;
       inptr2++;
     }
-  } else {
+  } else if (cinfo->out_color_space == JCS_CMYK) {
     for (col = cinfo->output_width; col > 0; col--) {
-      outptr[2] = *inptr++;       /* can omit GETJSAMPLE() safely */
-      outptr[1] = *inptr++;
-      outptr[0] = *inptr++;
+      /* can omit GETJSAMPLE() safely */
+      JSAMPLE c = *inptr++, m = *inptr++, y = *inptr++, k = *inptr++;
+      cmyk_to_rgb(c, m, y, k, outptr + 2, outptr + 1, outptr);
       outptr += 3;
     }
+  } else {
+    register int rindex = rgb_red[cinfo->out_color_space];
+    register int gindex = rgb_green[cinfo->out_color_space];
+    register int bindex = rgb_blue[cinfo->out_color_space];
+    register int ps = rgb_pixelsize[cinfo->out_color_space];
+
+    for (col = cinfo->output_width; col > 0; col--) {
+      /* can omit GETJSAMPLE() safely */
+      outptr[0] = inptr[bindex];
+      outptr[1] = inptr[gindex];
+      outptr[2] = inptr[rindex];
+      outptr += 3;  inptr += ps;
+    }
   }
 
   /* Zero out the pad bytes. */
   pad = dest->pad_bytes;
   while (--pad >= 0)
     *outptr++ = 0;
+
+  if (!dest->use_inversion_array)
+    (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->row_width);
 }
 
 METHODDEF(void)
-put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-               JDIMENSION rows_supplied)
+put_gray_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+              JDIMENSION rows_supplied)
 /* This version is for grayscale OR quantized color output */
 {
-  bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
+  bmp_dest_ptr dest = (bmp_dest_ptr)dinfo;
   JSAMPARRAY image_ptr;
   register JSAMPROW inptr, outptr;
-  register JDIMENSION col;
   int pad;
 
-  /* Access next row in virtual array */
-  image_ptr = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, dest->whole_image,
-     dest->cur_output_row, (JDIMENSION) 1, TRUE);
-  dest->cur_output_row++;
+  if (dest->use_inversion_array) {
+    /* Access next row in virtual array */
+    image_ptr = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, dest->whole_image,
+       dest->cur_output_row, (JDIMENSION)1, TRUE);
+    dest->cur_output_row++;
+    outptr = image_ptr[0];
+  } else {
+    outptr = dest->iobuffer;
+  }
 
   /* Transfer data. */
   inptr = dest->pub.buffer[0];
-  outptr = image_ptr[0];
-  for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = *inptr++;       /* can omit GETJSAMPLE() safely */
-  }
+  MEMCOPY(outptr, inptr, cinfo->output_width);
+  outptr += cinfo->output_width;
 
   /* Zero out the pad bytes. */
   pad = dest->pad_bytes;
   while (--pad >= 0)
     *outptr++ = 0;
-}
 
-
-/*
- * Startup: normally writes the file header.
- * In this module we may as well postpone everything until finish_output.
- */
-
-METHODDEF(void)
-start_output_bmp (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
-{
-  /* no work here */
+  if (!dest->use_inversion_array)
+    (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->row_width);
 }
 
 
@@ -187,24 +215,26 @@
  */
 
 LOCAL(void)
-write_bmp_header (j_decompress_ptr cinfo, bmp_dest_ptr dest)
+write_bmp_header(j_decompress_ptr cinfo, bmp_dest_ptr dest)
 /* Write a Windows-style BMP file header, including colormap if needed */
 {
   char bmpfileheader[14];
   char bmpinfoheader[40];
-#define PUT_2B(array,offset,value)  \
-        (array[offset] = (char) ((value) & 0xFF), \
-         array[offset+1] = (char) (((value) >> 8) & 0xFF))
-#define PUT_4B(array,offset,value)  \
-        (array[offset] = (char) ((value) & 0xFF), \
-         array[offset+1] = (char) (((value) >> 8) & 0xFF), \
-         array[offset+2] = (char) (((value) >> 16) & 0xFF), \
-         array[offset+3] = (char) (((value) >> 24) & 0xFF))
+
+#define PUT_2B(array, offset, value) \
+  (array[offset] = (char)((value) & 0xFF), \
+   array[offset + 1] = (char)(((value) >> 8) & 0xFF))
+#define PUT_4B(array, offset, value) \
+  (array[offset] = (char)((value) & 0xFF), \
+   array[offset + 1] = (char)(((value) >> 8) & 0xFF), \
+   array[offset + 2] = (char)(((value) >> 16) & 0xFF), \
+   array[offset + 3] = (char)(((value) >> 24) & 0xFF))
+
   long headersize, bfSize;
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
-  if (cinfo->out_color_space == JCS_RGB) {
+  if (IsExtRGB(cinfo->out_color_space)) {
     if (cinfo->quantize_colors) {
       /* Colormapped RGB */
       bits_per_pixel = 8;
@@ -214,7 +244,8 @@
       bits_per_pixel = 24;
       cmap_entries = 0;
     }
-  } else if (cinfo->out_color_space == JCS_RGB565) {
+  } else if (cinfo->out_color_space == JCS_RGB565 ||
+             cinfo->out_color_space == JCS_CMYK) {
     bits_per_pixel = 24;
     cmap_entries   = 0;
   } else {
@@ -224,7 +255,7 @@
   }
   /* File size */
   headersize = 14 + 40 + cmap_entries * 4; /* Header and colormap */
-  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
+  bfSize = headersize + (long)dest->row_width * (long)cinfo->output_height;
 
   /* Set unused fields of header to 0 */
   MEMZERO(bmpfileheader, sizeof(bmpfileheader));
@@ -246,15 +277,15 @@
   /* we leave biCompression = 0, for none */
   /* we leave biSizeImage = 0; this is correct for uncompressed data */
   if (cinfo->density_unit == 2) { /* if have density in dots/cm, then */
-    PUT_4B(bmpinfoheader, 24, (long) (cinfo->X_density*100)); /* XPels/M */
-    PUT_4B(bmpinfoheader, 28, (long) (cinfo->Y_density*100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 24, (long)(cinfo->X_density * 100)); /* XPels/M */
+    PUT_4B(bmpinfoheader, 28, (long)(cinfo->Y_density * 100)); /* YPels/M */
   }
   PUT_2B(bmpinfoheader, 32, cmap_entries); /* biClrUsed */
   /* we leave biClrImportant = 0 */
 
-  if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t) 14)
+  if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t)14)
     ERREXIT(cinfo, JERR_FILE_WRITE);
-  if (JFWRITE(dest->pub.output_file, bmpinfoheader, 40) != (size_t) 40)
+  if (JFWRITE(dest->pub.output_file, bmpinfoheader, 40) != (size_t)40)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 
   if (cmap_entries > 0)
@@ -263,7 +294,7 @@
 
 
 LOCAL(void)
-write_os2_header (j_decompress_ptr cinfo, bmp_dest_ptr dest)
+write_os2_header(j_decompress_ptr cinfo, bmp_dest_ptr dest)
 /* Write an OS2-style BMP file header, including colormap if needed */
 {
   char bmpfileheader[14];
@@ -272,7 +303,9 @@
   int bits_per_pixel, cmap_entries;
 
   /* Compute colormap size and total file size */
-  if (cinfo->out_color_space == JCS_RGB) {
+  if (cinfo->out_color_space == JCS_RGB ||
+      (cinfo->out_color_space >= JCS_EXT_RGB &&
+       cinfo->out_color_space <= JCS_EXT_ARGB)) {
     if (cinfo->quantize_colors) {
       /* Colormapped RGB */
       bits_per_pixel = 8;
@@ -282,7 +315,8 @@
       bits_per_pixel = 24;
       cmap_entries = 0;
     }
-  } else if (cinfo->out_color_space == JCS_RGB565) {
+  } else if (cinfo->out_color_space == JCS_RGB565 ||
+             cinfo->out_color_space == JCS_CMYK) {
     bits_per_pixel = 24;
     cmap_entries   = 0;
   } else {
@@ -292,7 +326,7 @@
   }
   /* File size */
   headersize = 14 + 12 + cmap_entries * 3; /* Header and colormap */
-  bfSize = headersize + (long) dest->row_width * (long) cinfo->output_height;
+  bfSize = headersize + (long)dest->row_width * (long)cinfo->output_height;
 
   /* Set unused fields of header to 0 */
   MEMZERO(bmpfileheader, sizeof(bmpfileheader));
@@ -312,9 +346,9 @@
   PUT_2B(bmpcoreheader, 8, 1);  /* bcPlanes - must be 1 */
   PUT_2B(bmpcoreheader, 10, bits_per_pixel); /* bcBitCount */
 
-  if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t) 14)
+  if (JFWRITE(dest->pub.output_file, bmpfileheader, 14) != (size_t)14)
     ERREXIT(cinfo, JERR_FILE_WRITE);
-  if (JFWRITE(dest->pub.output_file, bmpcoreheader, 12) != (size_t) 12)
+  if (JFWRITE(dest->pub.output_file, bmpcoreheader, 12) != (size_t)12)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 
   if (cmap_entries > 0)
@@ -328,8 +362,8 @@
  */
 
 LOCAL(void)
-write_colormap (j_decompress_ptr cinfo, bmp_dest_ptr dest,
-                int map_colors, int map_entry_size)
+write_colormap(j_decompress_ptr cinfo, bmp_dest_ptr dest, int map_colors,
+               int map_entry_size)
 {
   JSAMPARRAY colormap = cinfo->colormap;
   int num_colors = cinfo->actual_number_of_colors;
@@ -379,40 +413,62 @@
 }
 
 
+/*
+ * Startup: write the file header unless the inversion array is being used.
+ */
+
 METHODDEF(void)
-finish_output_bmp (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+start_output_bmp(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
+  bmp_dest_ptr dest = (bmp_dest_ptr)dinfo;
+
+  if (!dest->use_inversion_array) {
+    /* Write the header and colormap */
+    if (dest->is_os2)
+      write_os2_header(cinfo, dest);
+    else
+      write_bmp_header(cinfo, dest);
+  }
+}
+
+
+METHODDEF(void)
+finish_output_bmp(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+{
+  bmp_dest_ptr dest = (bmp_dest_ptr)dinfo;
   register FILE *outfile = dest->pub.output_file;
   JSAMPARRAY image_ptr;
   register JSAMPROW data_ptr;
   JDIMENSION row;
   register JDIMENSION col;
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 
-  /* Write the header and colormap */
-  if (dest->is_os2)
-    write_os2_header(cinfo, dest);
-  else
-    write_bmp_header(cinfo, dest);
+  if (dest->use_inversion_array) {
+    /* Write the header and colormap */
+    if (dest->is_os2)
+      write_os2_header(cinfo, dest);
+    else
+      write_bmp_header(cinfo, dest);
 
-  /* Write the file body from our virtual array */
-  for (row = cinfo->output_height; row > 0; row--) {
-    if (progress != NULL) {
-      progress->pub.pass_counter = (long) (cinfo->output_height - row);
-      progress->pub.pass_limit = (long) cinfo->output_height;
-      (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+    /* Write the file body from our virtual array */
+    for (row = cinfo->output_height; row > 0; row--) {
+      if (progress != NULL) {
+        progress->pub.pass_counter = (long)(cinfo->output_height - row);
+        progress->pub.pass_limit = (long)cinfo->output_height;
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+      }
+      image_ptr = (*cinfo->mem->access_virt_sarray)
+        ((j_common_ptr)cinfo, dest->whole_image, row - 1, (JDIMENSION)1,
+         FALSE);
+      data_ptr = image_ptr[0];
+      for (col = dest->row_width; col > 0; col--) {
+        putc(GETJSAMPLE(*data_ptr), outfile);
+        data_ptr++;
+      }
     }
-    image_ptr = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr) cinfo, dest->whole_image, row-1, (JDIMENSION) 1, FALSE);
-    data_ptr = image_ptr[0];
-    for (col = dest->row_width; col > 0; col--) {
-      putc(GETJSAMPLE(*data_ptr), outfile);
-      data_ptr++;
-    }
+    if (progress != NULL)
+      progress->completed_extra_passes++;
   }
-  if (progress != NULL)
-    progress->completed_extra_passes++;
 
   /* Make sure we wrote the output file OK */
   fflush(outfile);
@@ -426,15 +482,16 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_bmp (j_decompress_ptr cinfo, boolean is_os2)
+jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
+                boolean use_inversion_array)
 {
   bmp_dest_ptr dest;
   JDIMENSION row_width;
 
   /* Create module interface object, fill in method pointers */
   dest = (bmp_dest_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(bmp_dest_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(bmp_dest_struct));
   dest->pub.start_output = start_output_bmp;
   dest->pub.finish_output = finish_output_bmp;
   dest->pub.calc_buffer_dimensions = NULL;
@@ -442,13 +499,16 @@
 
   if (cinfo->out_color_space == JCS_GRAYSCALE) {
     dest->pub.put_pixel_rows = put_gray_rows;
-  } else if (cinfo->out_color_space == JCS_RGB) {
+  } else if (cinfo->out_color_space == JCS_RGB ||
+             (cinfo->out_color_space >= JCS_EXT_RGB &&
+              cinfo->out_color_space <= JCS_EXT_ARGB)) {
     if (cinfo->quantize_colors)
       dest->pub.put_pixel_rows = put_gray_rows;
     else
       dest->pub.put_pixel_rows = put_pixel_rows;
-  } else if (cinfo->out_color_space == JCS_RGB565) {
-      dest->pub.put_pixel_rows = put_pixel_rows;
+  } else if (cinfo->out_color_space == JCS_RGB565 ||
+             cinfo->out_color_space == JCS_CMYK) {
+    dest->pub.put_pixel_rows = put_pixel_rows;
   } else {
     ERREXIT(cinfo, JERR_BMP_COLORSPACE);
   }
@@ -460,35 +520,42 @@
   if (cinfo->out_color_space == JCS_RGB565) {
     row_width = cinfo->output_width * 2;
     dest->row_width = dest->data_width = cinfo->output_width * 3;
+    while ((row_width & 3) != 0) row_width++;
+  } else if (!cinfo->quantize_colors &&
+             (IsExtRGB(cinfo->out_color_space) ||
+              cinfo->out_color_space == JCS_CMYK)) {
+    row_width = cinfo->output_width * cinfo->output_components;
+    dest->row_width = dest->data_width = cinfo->output_width * 3;
   } else {
     row_width = cinfo->output_width * cinfo->output_components;
     dest->row_width = dest->data_width = row_width;
   }
   while ((dest->row_width & 3) != 0) dest->row_width++;
-  dest->pad_bytes = (int) (dest->row_width - dest->data_width);
-  if (cinfo->out_color_space == JCS_RGB565) {
-    while ((row_width & 3) != 0) row_width++;
+  dest->pad_bytes = (int)(dest->row_width - dest->data_width);
+
+
+  if (use_inversion_array) {
+    /* Allocate space for inversion array, prepare for write pass */
+    dest->whole_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       dest->row_width, cinfo->output_height, (JDIMENSION)1);
+    dest->cur_output_row = 0;
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
   } else {
-    row_width = dest->row_width;
+    dest->iobuffer = (JSAMPLE *)(*cinfo->mem->alloc_small)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, dest->row_width);
   }
-
-
-  /* Allocate space for inversion array, prepare for write pass */
-  dest->whole_image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-     dest->row_width, cinfo->output_height, (JDIMENSION) 1);
-  dest->cur_output_row = 0;
-  if (cinfo->progress != NULL) {
-    cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
-    progress->total_extra_passes++; /* count file input as separate pass */
-  }
+  dest->use_inversion_array = use_inversion_array;
 
   /* Create decompressor output buffer. */
   dest->pub.buffer = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, row_width, (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, row_width, (JDIMENSION)1);
   dest->pub.buffer_height = 1;
 
-  return (djpeg_dest_ptr) dest;
+  return (djpeg_dest_ptr)dest;
 }
 
 #endif /* BMP_SUPPORTED */
diff --git a/wrgif.c b/wrgif.c
index 8d2050f..5eed808 100644
--- a/wrgif.c
+++ b/wrgif.c
@@ -81,13 +81,13 @@
  */
 
 LOCAL(void)
-flush_packet (gif_dest_ptr dinfo)
+flush_packet(gif_dest_ptr dinfo)
 /* flush any accumulated data */
 {
   if (dinfo->bytesinpkt > 0) {  /* never write zero-length packet */
-    dinfo->packetbuf[0] = (char) dinfo->bytesinpkt++;
-    if (JFWRITE(dinfo->pub.output_file, dinfo->packetbuf, dinfo->bytesinpkt)
-        != (size_t) dinfo->bytesinpkt)
+    dinfo->packetbuf[0] = (char)dinfo->bytesinpkt++;
+    if (JFWRITE(dinfo->pub.output_file, dinfo->packetbuf, dinfo->bytesinpkt) !=
+        (size_t)dinfo->bytesinpkt)
       ERREXIT(dinfo->cinfo, JERR_FILE_WRITE);
     dinfo->bytesinpkt = 0;
   }
@@ -95,21 +95,21 @@
 
 
 /* Add a character to current packet; flush to disk if necessary */
-#define CHAR_OUT(dinfo,c)  \
-        { (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c);  \
-            if ((dinfo)->bytesinpkt >= 255)  \
-              flush_packet(dinfo);  \
-        }
+#define CHAR_OUT(dinfo, c) { \
+  (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char)(c); \
+  if ((dinfo)->bytesinpkt >= 255) \
+    flush_packet(dinfo); \
+}
 
 
 /* Routine to convert variable-width codes into a byte stream */
 
 LOCAL(void)
-output (gif_dest_ptr dinfo, int code)
+output(gif_dest_ptr dinfo, int code)
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
-  dinfo->cur_accum |= ((long) code) << dinfo->cur_bits;
+  dinfo->cur_accum |= ((long)code) << dinfo->cur_bits;
   dinfo->cur_bits += dinfo->n_bits;
 
   while (dinfo->cur_bits >= 8) {
@@ -143,7 +143,7 @@
  */
 
 LOCAL(void)
-compress_init (gif_dest_ptr dinfo, int i_bits)
+compress_init(gif_dest_ptr dinfo, int i_bits)
 /* Initialize pseudo-compressor */
 {
   /* init all the state variables */
@@ -162,7 +162,7 @@
 
 
 LOCAL(void)
-compress_pixel (gif_dest_ptr dinfo, int c)
+compress_pixel(gif_dest_ptr dinfo, int c)
 /* Accept and "compress" one pixel value.
  * The given value must be less than n_bits wide.
  */
@@ -182,7 +182,7 @@
 
 
 LOCAL(void)
-compress_term (gif_dest_ptr dinfo)
+compress_term(gif_dest_ptr dinfo)
 /* Clean up at end */
 {
   /* Send an EOF code */
@@ -200,7 +200,7 @@
 
 
 LOCAL(void)
-put_word (gif_dest_ptr dinfo, unsigned int w)
+put_word(gif_dest_ptr dinfo, unsigned int w)
 /* Emit a 16-bit word, LSB first */
 {
   putc(w & 0xFF, dinfo->pub.output_file);
@@ -209,7 +209,7 @@
 
 
 LOCAL(void)
-put_3bytes (gif_dest_ptr dinfo, int val)
+put_3bytes(gif_dest_ptr dinfo, int val)
 /* Emit 3 copies of same byte value --- handy subr for colormap construction */
 {
   putc(val, dinfo->pub.output_file);
@@ -219,7 +219,7 @@
 
 
 LOCAL(void)
-emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
+emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
 /* Output the GIF file header, including color map */
 /* If colormap==NULL, synthesize a grayscale colormap */
 {
@@ -249,18 +249,18 @@
   putc('7', dinfo->pub.output_file);
   putc('a', dinfo->pub.output_file);
   /* Write the Logical Screen Descriptor */
-  put_word(dinfo, (unsigned int) dinfo->cinfo->output_width);
-  put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
+  put_word(dinfo, (unsigned int)dinfo->cinfo->output_width);
+  put_word(dinfo, (unsigned int)dinfo->cinfo->output_height);
   FlagByte = 0x80;              /* Yes, there is a global color table */
-  FlagByte |= (BitsPerPixel-1) << 4; /* color resolution */
-  FlagByte |= (BitsPerPixel-1); /* size of global color table */
+  FlagByte |= (BitsPerPixel - 1) << 4; /* color resolution */
+  FlagByte |= (BitsPerPixel - 1); /* size of global color table */
   putc(FlagByte, dinfo->pub.output_file);
   putc(0, dinfo->pub.output_file); /* Background color index */
   putc(0, dinfo->pub.output_file); /* Reserved (aspect ratio in GIF89) */
   /* Write the Global Color Map */
   /* If the color map is more than 8 bits precision, */
   /* we reduce it to 8 bits by shifting */
-  for (i=0; i < ColorMapSize; i++) {
+  for (i = 0; i < ColorMapSize; i++) {
     if (i < num_colors) {
       if (colormap != NULL) {
         if (dinfo->cinfo->out_color_space == JCS_RGB) {
@@ -274,7 +274,7 @@
         }
       } else {
         /* Create a grayscale map of num_colors values, range 0..255 */
-        put_3bytes(dinfo, (i * 255 + (num_colors-1)/2) / (num_colors-1));
+        put_3bytes(dinfo, (i * 255 + (num_colors - 1) / 2) / (num_colors - 1));
       }
     } else {
       /* fill out the map to a power of 2 */
@@ -285,15 +285,15 @@
   putc(',', dinfo->pub.output_file); /* separator */
   put_word(dinfo, 0);           /* left/top offset */
   put_word(dinfo, 0);
-  put_word(dinfo, (unsigned int) dinfo->cinfo->output_width); /* image size */
-  put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
+  put_word(dinfo, (unsigned int)dinfo->cinfo->output_width); /* image size */
+  put_word(dinfo, (unsigned int)dinfo->cinfo->output_height);
   /* flag byte: not interlaced, no local color map */
   putc(0x00, dinfo->pub.output_file);
   /* Write Initial Code Size byte */
   putc(InitCodeSize, dinfo->pub.output_file);
 
   /* Initialize for "compression" of image data */
-  compress_init(dinfo, InitCodeSize+1);
+  compress_init(dinfo, InitCodeSize + 1);
 }
 
 
@@ -302,14 +302,14 @@
  */
 
 METHODDEF(void)
-start_output_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+start_output_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  gif_dest_ptr dest = (gif_dest_ptr) dinfo;
+  gif_dest_ptr dest = (gif_dest_ptr)dinfo;
 
   if (cinfo->quantize_colors)
     emit_header(dest, cinfo->actual_number_of_colors, cinfo->colormap);
   else
-    emit_header(dest, 256, (JSAMPARRAY) NULL);
+    emit_header(dest, 256, (JSAMPARRAY)NULL);
 }
 
 
@@ -319,10 +319,10 @@
  */
 
 METHODDEF(void)
-put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                JDIMENSION rows_supplied)
+put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+               JDIMENSION rows_supplied)
 {
-  gif_dest_ptr dest = (gif_dest_ptr) dinfo;
+  gif_dest_ptr dest = (gif_dest_ptr)dinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
@@ -338,9 +338,9 @@
  */
 
 METHODDEF(void)
-finish_output_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+finish_output_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  gif_dest_ptr dest = (gif_dest_ptr) dinfo;
+  gif_dest_ptr dest = (gif_dest_ptr)dinfo;
 
   /* Flush "compression" mechanism */
   compress_term(dest);
@@ -360,7 +360,7 @@
  */
 
 METHODDEF(void)
-calc_buffer_dimensions_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+calc_buffer_dimensions_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
 }
 
@@ -370,14 +370,14 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_gif (j_decompress_ptr cinfo)
+jinit_write_gif(j_decompress_ptr cinfo)
 {
   gif_dest_ptr dest;
 
   /* Create module interface object, fill in method pointers */
   dest = (gif_dest_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(gif_dest_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(gif_dest_struct));
   dest->cinfo = cinfo;          /* make back link for subroutines */
   dest->pub.start_output = start_output_gif;
   dest->pub.put_pixel_rows = put_pixel_rows;
@@ -404,10 +404,10 @@
 
   /* Create decompressor output buffer. */
   dest->pub.buffer = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION)1);
   dest->pub.buffer_height = 1;
 
-  return (djpeg_dest_ptr) dest;
+  return (djpeg_dest_ptr)dest;
 }
 
 #endif /* GIF_SUPPORTED */
diff --git a/wrjpgcom.1 b/wrjpgcom.1
index d419a99..a255cab 100644
--- a/wrjpgcom.1
+++ b/wrjpgcom.1
@@ -56,7 +56,7 @@
 If you give neither
 .B \-comment
 nor
-.BR \-cfile ,
+.BR \-cfile ,
 then
 .B wrjpgcom
 will read the comment text from standard input.  (In this case an input image
diff --git a/wrjpgcom.c b/wrjpgcom.c
index 531c152..6e80006 100644
--- a/wrjpgcom.c
+++ b/wrjpgcom.c
@@ -18,7 +18,7 @@
 #include "jinclude.h"           /* get auto-config symbols, <stdio.h> */
 
 #ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc() */
-extern void *malloc ();
+extern void *malloc();
 #endif
 #include <ctype.h>              /* to declare isupper(), tolower() */
 #ifdef USE_SETMODE
@@ -83,7 +83,7 @@
 
 /* Read one byte, testing for EOF */
 static int
-read_1_byte (void)
+read_1_byte(void)
 {
   int c;
 
@@ -96,7 +96,7 @@
 /* Read 2 bytes, convert to unsigned int */
 /* All 2-byte quantities in JPEG markers are MSB first */
 static unsigned int
-read_2_bytes (void)
+read_2_bytes(void)
 {
   int c1, c2;
 
@@ -106,34 +106,34 @@
   c2 = NEXTBYTE();
   if (c2 == EOF)
     ERREXIT("Premature EOF in JPEG file");
-  return (((unsigned int) c1) << 8) + ((unsigned int) c2);
+  return (((unsigned int)c1) << 8) + ((unsigned int)c2);
 }
 
 
 /* Routines to write data to output file */
 
 static void
-write_1_byte (int c)
+write_1_byte(int c)
 {
   PUTBYTE(c);
 }
 
 static void
-write_2_bytes (unsigned int val)
+write_2_bytes(unsigned int val)
 {
   PUTBYTE((val >> 8) & 0xFF);
   PUTBYTE(val & 0xFF);
 }
 
 static void
-write_marker (int marker)
+write_marker(int marker)
 {
   PUTBYTE(0xFF);
   PUTBYTE(marker);
 }
 
 static void
-copy_rest_of_file (void)
+copy_rest_of_file(void)
 {
   int c;
 
@@ -178,7 +178,7 @@
  */
 
 static int
-next_marker (void)
+next_marker(void)
 {
   int c;
   int discarded_bytes = 0;
@@ -213,7 +213,7 @@
  */
 
 static int
-first_marker (void)
+first_marker(void)
 {
   int c1, c2;
 
@@ -235,7 +235,7 @@
  */
 
 static void
-copy_variable (void)
+copy_variable(void)
 /* Copy an unknown or uninteresting variable-length marker */
 {
   unsigned int length;
@@ -255,7 +255,7 @@
 }
 
 static void
-skip_variable (void)
+skip_variable(void)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
   unsigned int length;
@@ -268,7 +268,7 @@
   length -= 2;
   /* Skip over the remaining bytes */
   while (length > 0) {
-    (void) read_1_byte();
+    (void)read_1_byte();
     length--;
   }
 }
@@ -280,7 +280,7 @@
  */
 
 static int
-scan_JPEG_header (int keep_COM)
+scan_JPEG_header(int keep_COM)
 {
   int marker;
 
@@ -342,7 +342,7 @@
 
 
 static void
-usage (void)
+usage(void)
 /* complain about bad command line */
 {
   fprintf(stderr, "wrjpgcom inserts a textual comment in a JPEG file.\n");
@@ -364,7 +364,7 @@
   fprintf(stderr, "If you do not give either -comment or -cfile on the command line,\n");
   fprintf(stderr, "then the comment text is read from standard input.\n");
   fprintf(stderr, "It can be multiple lines, up to %u characters total.\n",
-          (unsigned int) MAX_COM_LENGTH);
+          (unsigned int)MAX_COM_LENGTH);
 #ifndef TWO_FILE_COMMANDLINE
   fprintf(stderr, "You must specify an input JPEG file name when supplying\n");
   fprintf(stderr, "comment text from standard input.\n");
@@ -375,7 +375,7 @@
 
 
 static int
-keymatch (char *arg, const char *keyword, int minchars)
+keymatch(char *arg, const char *keyword, int minchars)
 /* Case-insensitive matching of (possibly abbreviated) keyword switches. */
 /* keyword is the constant keyword (must be lower case already), */
 /* minchars is length of minimum legal abbreviation. */
@@ -404,7 +404,7 @@
  */
 
 int
-main (int argc, char **argv)
+main(int argc, char **argv)
 {
   int argn;
   char *arg;
@@ -444,38 +444,38 @@
        * under MS-DOG and must parse out the quoted string ourselves.  Sigh.
        */
       if (comment_arg[0] == '"') {
-        comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
+        comment_arg = (char *)malloc((size_t)MAX_COM_LENGTH);
         if (comment_arg == NULL)
           ERREXIT("Insufficient memory");
-        if (strlen(argv[argn]) + 2 >= (size_t) MAX_COM_LENGTH) {
+        if (strlen(argv[argn]) + 2 >= (size_t)MAX_COM_LENGTH) {
           fprintf(stderr, "Comment text may not exceed %u bytes\n",
-                  (unsigned int) MAX_COM_LENGTH);
+                  (unsigned int)MAX_COM_LENGTH);
           exit(EXIT_FAILURE);
         }
-        strcpy(comment_arg, argv[argn]+1);
+        strcpy(comment_arg, argv[argn] + 1);
         for (;;) {
-          comment_length = (unsigned int) strlen(comment_arg);
-          if (comment_length > 0 && comment_arg[comment_length-1] == '"') {
-            comment_arg[comment_length-1] = '\0'; /* zap terminating quote */
+          comment_length = (unsigned int)strlen(comment_arg);
+          if (comment_length > 0 && comment_arg[comment_length - 1] == '"') {
+            comment_arg[comment_length - 1] = '\0'; /* zap terminating quote */
             break;
           }
           if (++argn >= argc)
             ERREXIT("Missing ending quote mark");
           if (strlen(comment_arg) + strlen(argv[argn]) + 2 >=
-              (size_t) MAX_COM_LENGTH) {
+              (size_t)MAX_COM_LENGTH) {
             fprintf(stderr, "Comment text may not exceed %u bytes\n",
-                    (unsigned int) MAX_COM_LENGTH);
+                    (unsigned int)MAX_COM_LENGTH);
             exit(EXIT_FAILURE);
           }
           strcat(comment_arg, " ");
           strcat(comment_arg, argv[argn]);
         }
-      } else if (strlen(argv[argn]) >= (size_t) MAX_COM_LENGTH) {
+      } else if (strlen(argv[argn]) >= (size_t)MAX_COM_LENGTH) {
         fprintf(stderr, "Comment text may not exceed %u bytes\n",
-                (unsigned int) MAX_COM_LENGTH);
+                (unsigned int)MAX_COM_LENGTH);
         exit(EXIT_FAILURE);
       }
-      comment_length = (unsigned int) strlen(comment_arg);
+      comment_length = (unsigned int)strlen(comment_arg);
     } else
       usage();
   }
@@ -513,18 +513,17 @@
   /* Open the output file. */
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have explicit output file name */
-  if (argn != argc-2) {
-    fprintf(stderr, "%s: must name one input and one output file\n",
-            progname);
+  if (argn != argc - 2) {
+    fprintf(stderr, "%s: must name one input and one output file\n", progname);
     usage();
   }
-  if ((outfile = fopen(argv[argn+1], WRITE_BINARY)) == NULL) {
-    fprintf(stderr, "%s: can't open %s\n", progname, argv[argn+1]);
+  if ((outfile = fopen(argv[argn + 1], WRITE_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, argv[argn + 1]);
     exit(EXIT_FAILURE);
   }
 #else
   /* Unix style: expect zero or one file name */
-  if (argn < argc-1) {
+  if (argn < argc - 1) {
     fprintf(stderr, "%s: only one input file\n", progname);
     usage();
   }
@@ -547,18 +546,18 @@
     FILE *src_file;
     int c;
 
-    comment_arg = (char *) malloc((size_t) MAX_COM_LENGTH);
+    comment_arg = (char *)malloc((size_t)MAX_COM_LENGTH);
     if (comment_arg == NULL)
       ERREXIT("Insufficient memory");
     comment_length = 0;
     src_file = (comment_file != NULL ? comment_file : stdin);
     while ((c = getc(src_file)) != EOF) {
-      if (comment_length >= (unsigned int) MAX_COM_LENGTH) {
+      if (comment_length >= (unsigned int)MAX_COM_LENGTH) {
         fprintf(stderr, "Comment text may not exceed %u bytes\n",
-                (unsigned int) MAX_COM_LENGTH);
+                (unsigned int)MAX_COM_LENGTH);
         exit(EXIT_FAILURE);
       }
-      comment_arg[comment_length++] = (char) c;
+      comment_arg[comment_length++] = (char)c;
     }
     if (comment_file != NULL)
       fclose(comment_file);
diff --git a/wrppm.c b/wrppm.c
index 91cb10b..0382452 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -19,6 +19,7 @@
  * an ordinary stdio stream.
  */
 
+#include "cmyk.h"
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef PPM_SUPPORTED
@@ -35,23 +36,23 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define PUTPPMSAMPLE(ptr,v)  *ptr++ = (char) (v)
+#define PUTPPMSAMPLE(ptr, v)  *ptr++ = (char)(v)
 #define BYTESPERSAMPLE 1
 #define PPM_MAXVAL 255
 #else
 #ifdef PPM_NORAWWORD
-#define PUTPPMSAMPLE(ptr,v)  *ptr++ = (char) ((v) >> (BITS_IN_JSAMPLE-8))
+#define PUTPPMSAMPLE(ptr, v)  *ptr++ = (char)((v) >> (BITS_IN_JSAMPLE - 8))
 #define BYTESPERSAMPLE 1
 #define PPM_MAXVAL 255
 #else
 /* The word-per-sample format always puts the MSB first. */
-#define PUTPPMSAMPLE(ptr,v)                     \
-        { register int val_ = v;                \
-          *ptr++ = (char) ((val_ >> 8) & 0xFF); \
-          *ptr++ = (char) (val_ & 0xFF);        \
-        }
+#define PUTPPMSAMPLE(ptr, v) { \
+  register int val_ = v; \
+  *ptr++ = (char)((val_ >> 8) & 0xFF); \
+  *ptr++ = (char)(val_ & 0xFF); \
+}
 #define BYTESPERSAMPLE 2
-#define PPM_MAXVAL ((1<<BITS_IN_JSAMPLE)-1)
+#define PPM_MAXVAL ((1 << BITS_IN_JSAMPLE) - 1)
 #endif
 #endif
 
@@ -86,12 +87,12 @@
  */
 
 METHODDEF(void)
-put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                JDIMENSION rows_supplied)
+put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+               JDIMENSION rows_supplied)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
 
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
@@ -101,20 +102,80 @@
  */
 
 METHODDEF(void)
-copy_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                 JDIMENSION rows_supplied)
+copy_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                JDIMENSION rows_supplied)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
+  register char *bufferptr;
+  register JSAMPROW ptr;
+#if BITS_IN_JSAMPLE != 8 || (!defined(HAVE_UNSIGNED_CHAR) && !defined(__CHAR_UNSIGNED__))
+  register JDIMENSION col;
+#endif
+
+  ptr = dest->pub.buffer[0];
+  bufferptr = dest->iobuffer;
+#if BITS_IN_JSAMPLE == 8 && (defined(HAVE_UNSIGNED_CHAR) || defined(__CHAR_UNSIGNED__))
+  MEMCOPY(bufferptr, ptr, dest->samples_per_row);
+#else
+  for (col = dest->samples_per_row; col > 0; col--) {
+    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(*ptr++));
+  }
+#endif
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+}
+
+
+/*
+ * Convert extended RGB to RGB.
+ */
+
+METHODDEF(void)
+put_rgb(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, JDIMENSION rows_supplied)
+{
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
+  register char *bufferptr;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  register int rindex = rgb_red[cinfo->out_color_space];
+  register int gindex = rgb_green[cinfo->out_color_space];
+  register int bindex = rgb_blue[cinfo->out_color_space];
+  register int ps = rgb_pixelsize[cinfo->out_color_space];
+
+  ptr = dest->pub.buffer[0];
+  bufferptr = dest->iobuffer;
+  for (col = cinfo->output_width; col > 0; col--) {
+    PUTPPMSAMPLE(bufferptr, ptr[rindex]);
+    PUTPPMSAMPLE(bufferptr, ptr[gindex]);
+    PUTPPMSAMPLE(bufferptr, ptr[bindex]);
+    ptr += ps;
+  }
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+}
+
+
+/*
+ * Convert CMYK to RGB.
+ */
+
+METHODDEF(void)
+put_cmyk(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+         JDIMENSION rows_supplied)
+{
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
   register char *bufferptr;
   register JSAMPROW ptr;
   register JDIMENSION col;
 
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
-  for (col = dest->samples_per_row; col > 0; col--) {
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(*ptr++));
+  for (col = cinfo->output_width; col > 0; col--) {
+    JSAMPLE r, g, b, c = *ptr++, m = *ptr++, y = *ptr++, k = *ptr++;
+    cmyk_to_rgb(c, m, y, k, &r, &g, &b);
+    PUTPPMSAMPLE(bufferptr, r);
+    PUTPPMSAMPLE(bufferptr, g);
+    PUTPPMSAMPLE(bufferptr, b);
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
@@ -124,10 +185,10 @@
  */
 
 METHODDEF(void)
-put_demapped_rgb (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                  JDIMENSION rows_supplied)
+put_demapped_rgb(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                 JDIMENSION rows_supplied)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
   register char *bufferptr;
   register int pixval;
   register JSAMPROW ptr;
@@ -144,15 +205,15 @@
     PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map1[pixval]));
     PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map2[pixval]));
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
 METHODDEF(void)
-put_demapped_gray (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                   JDIMENSION rows_supplied)
+put_demapped_gray(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                  JDIMENSION rows_supplied)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
   register char *bufferptr;
   register JSAMPROW ptr;
   register JSAMPROW color_map = cinfo->colormap[0];
@@ -163,7 +224,7 @@
   for (col = cinfo->output_width; col > 0; col--) {
     PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map[GETJSAMPLE(*ptr++)]));
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
@@ -172,23 +233,32 @@
  */
 
 METHODDEF(void)
-start_output_ppm (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+start_output_ppm(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
 
   /* Emit file header */
   switch (cinfo->out_color_space) {
   case JCS_GRAYSCALE:
     /* emit header for raw PGM format */
     fprintf(dest->pub.output_file, "P5\n%ld %ld\n%d\n",
-            (long) cinfo->output_width, (long) cinfo->output_height,
-            PPM_MAXVAL);
+            (long)cinfo->output_width, (long)cinfo->output_height, PPM_MAXVAL);
     break;
   case JCS_RGB:
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+  case JCS_EXT_RGBA:
+  case JCS_EXT_BGRA:
+  case JCS_EXT_ABGR:
+  case JCS_EXT_ARGB:
+  case JCS_CMYK:
     /* emit header for raw PPM format */
     fprintf(dest->pub.output_file, "P6\n%ld %ld\n%d\n",
-            (long) cinfo->output_width, (long) cinfo->output_height,
-            PPM_MAXVAL);
+            (long)cinfo->output_width, (long)cinfo->output_height, PPM_MAXVAL);
     break;
   default:
     ERREXIT(cinfo, JERR_PPM_COLORSPACE);
@@ -201,7 +271,7 @@
  */
 
 METHODDEF(void)
-finish_output_ppm (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+finish_output_ppm(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   /* Make sure we wrote the output file OK */
   fflush(dinfo->output_file);
@@ -215,11 +285,14 @@
  */
 
 METHODDEF(void)
-calc_buffer_dimensions_ppm (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+calc_buffer_dimensions_ppm(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  ppm_dest_ptr dest = (ppm_dest_ptr) dinfo;
+  ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
 
-  dest->samples_per_row = cinfo->output_width * cinfo->out_color_components;
+  if (cinfo->out_color_space == JCS_GRAYSCALE)
+    dest->samples_per_row = cinfo->output_width * cinfo->out_color_components;
+  else
+    dest->samples_per_row = cinfo->output_width * 3;
   dest->buffer_width = dest->samples_per_row * (BYTESPERSAMPLE * sizeof(char));
 }
 
@@ -229,13 +302,13 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_ppm (j_decompress_ptr cinfo)
+jinit_write_ppm(j_decompress_ptr cinfo)
 {
   ppm_dest_ptr dest;
 
   /* Create module interface object, fill in method pointers */
   dest = (ppm_dest_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(ppm_dest_struct));
   dest->pub.start_output = start_output_ppm;
   dest->pub.finish_output = finish_output_ppm;
@@ -245,21 +318,30 @@
   jpeg_calc_output_dimensions(cinfo);
 
   /* Create physical I/O buffer */
-  dest->pub.calc_buffer_dimensions (cinfo, (djpeg_dest_ptr) dest);
-  dest->iobuffer = (char *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, dest->buffer_width);
+  dest->pub.calc_buffer_dimensions(cinfo, (djpeg_dest_ptr)dest);
+  dest->iobuffer = (char *)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, dest->buffer_width);
 
   if (cinfo->quantize_colors || BITS_IN_JSAMPLE != 8 ||
-      sizeof(JSAMPLE) != sizeof(char)) {
+      sizeof(JSAMPLE) != sizeof(char) ||
+      (cinfo->out_color_space != JCS_EXT_RGB
+#if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
+       && cinfo->out_color_space != JCS_RGB
+#endif
+      )) {
     /* When quantizing, we need an output buffer for colormap indexes
      * that's separate from the physical I/O buffer.  We also need a
      * separate buffer if pixel format translation must take place.
      */
     dest->pub.buffer = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       cinfo->output_width * cinfo->output_components, (JDIMENSION) 1);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       cinfo->output_width * cinfo->output_components, (JDIMENSION)1);
     dest->pub.buffer_height = 1;
-    if (! cinfo->quantize_colors)
+    if (IsExtRGB(cinfo->out_color_space))
+      dest->pub.put_pixel_rows = put_rgb;
+    else if (cinfo->out_color_space == JCS_CMYK)
+      dest->pub.put_pixel_rows = put_cmyk;
+    else if (!cinfo->quantize_colors)
       dest->pub.put_pixel_rows = copy_pixel_rows;
     else if (cinfo->out_color_space == JCS_GRAYSCALE)
       dest->pub.put_pixel_rows = put_demapped_gray;
@@ -268,13 +350,13 @@
   } else {
     /* We will fwrite() directly from decompressor output buffer. */
     /* Synthesize a JSAMPARRAY pointer structure */
-    dest->pixrow = (JSAMPROW) dest->iobuffer;
-    dest->pub.buffer = & dest->pixrow;
+    dest->pixrow = (JSAMPROW)dest->iobuffer;
+    dest->pub.buffer = &dest->pixrow;
     dest->pub.buffer_height = 1;
     dest->pub.put_pixel_rows = put_pixel_rows;
   }
 
-  return (djpeg_dest_ptr) dest;
+  return (djpeg_dest_ptr)dest;
 }
 
 #endif /* PPM_SUPPORTED */
diff --git a/wrrle.c b/wrrle.c
index 880fadf..5c98ec0 100644
--- a/wrrle.c
+++ b/wrrle.c
@@ -51,7 +51,7 @@
  */
 
 #define CMAPBITS        8
-#define CMAPLENGTH      (1<<(CMAPBITS))
+#define CMAPLENGTH      (1 << (CMAPBITS))
 
 typedef struct {
   struct djpeg_dest_struct pub; /* public fields */
@@ -65,9 +65,9 @@
 typedef rle_dest_struct *rle_dest_ptr;
 
 /* Forward declarations */
-METHODDEF(void) rle_put_pixel_rows
-        (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-         JDIMENSION rows_supplied);
+METHODDEF(void) rle_put_pixel_rows(j_decompress_ptr cinfo,
+                                   djpeg_dest_ptr dinfo,
+                                   JDIMENSION rows_supplied);
 
 
 /*
@@ -77,13 +77,13 @@
  */
 
 METHODDEF(void)
-start_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+start_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  rle_dest_ptr dest = (rle_dest_ptr) dinfo;
+  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
   size_t cmapsize;
   int i, ci;
 #ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 #endif
 
   /*
@@ -117,8 +117,8 @@
   if (cinfo->quantize_colors) {
     /* Allocate storage for RLE-style cmap, zero any extra entries */
     cmapsize = cinfo->out_color_components * CMAPLENGTH * sizeof(rle_map);
-    dest->colormap = (rle_map *) (*cinfo->mem->alloc_small)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE, cmapsize);
+    dest->colormap = (rle_map *)(*cinfo->mem->alloc_small)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, cmapsize);
     MEMZERO(dest->colormap, cmapsize);
 
     /* Save away data in RLE format --- note 8-bit left shift! */
@@ -133,7 +133,7 @@
 
   /* Set the output buffer to the first row */
   dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr) cinfo, dest->image, (JDIMENSION) 0, (JDIMENSION) 1, TRUE);
+    ((j_common_ptr)cinfo, dest->image, (JDIMENSION)0, (JDIMENSION)1, TRUE);
   dest->pub.buffer_height = 1;
 
   dest->pub.put_pixel_rows = rle_put_pixel_rows;
@@ -153,15 +153,15 @@
  */
 
 METHODDEF(void)
-rle_put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                    JDIMENSION rows_supplied)
+rle_put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                   JDIMENSION rows_supplied)
 {
-  rle_dest_ptr dest = (rle_dest_ptr) dinfo;
+  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
 
   if (cinfo->output_scanline < cinfo->output_height) {
     dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr) cinfo, dest->image,
-       cinfo->output_scanline, (JDIMENSION) 1, TRUE);
+      ((j_common_ptr)cinfo, dest->image,
+       cinfo->output_scanline, (JDIMENSION)1, TRUE);
   }
 }
 
@@ -172,9 +172,9 @@
  */
 
 METHODDEF(void)
-finish_output_rle (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+finish_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  rle_dest_ptr dest = (rle_dest_ptr) dinfo;
+  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
   rle_hdr header;               /* Output file information */
   rle_pixel **rle_row, *red, *green, *blue;
   JSAMPROW output_row;
@@ -182,7 +182,7 @@
   int row, col;
   int ci;
 #ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 #endif
 
   /* Initialize the header info */
@@ -202,7 +202,8 @@
     header.cmaplen = CMAPBITS;
     header.cmap    = dest->colormap;
     /* Add a comment to the output image with the true colormap length. */
-    sprintf(cmapcomment, "color_map_length=%d", cinfo->actual_number_of_colors);
+    sprintf(cmapcomment, "color_map_length=%d",
+            cinfo->actual_number_of_colors);
     rle_putcom(cmapcomment, &header);
   }
 
@@ -217,29 +218,29 @@
   if (progress != NULL) {
     progress->pub.pass_limit = cinfo->output_height;
     progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
   }
 #endif
 
   if (cinfo->output_components == 1) {
-    for (row = cinfo->output_height-1; row >= 0; row--) {
-      rle_row = (rle_pixel **) (*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, dest->image,
-         (JDIMENSION) row, (JDIMENSION) 1, FALSE);
-      rle_putrow(rle_row, (int) cinfo->output_width, &header);
+    for (row = cinfo->output_height - 1; row >= 0; row--) {
+      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
+        ((j_common_ptr)cinfo, dest->image,
+         (JDIMENSION)row, (JDIMENSION)1, FALSE);
+      rle_putrow(rle_row, (int)cinfo->output_width, &header);
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
         progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
       }
 #endif
     }
   } else {
-    for (row = cinfo->output_height-1; row >= 0; row--) {
-      rle_row = (rle_pixel **) dest->rle_row;
+    for (row = cinfo->output_height - 1; row >= 0; row--) {
+      rle_row = (rle_pixel **)dest->rle_row;
       output_row = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, dest->image,
-         (JDIMENSION) row, (JDIMENSION) 1, FALSE);
+        ((j_common_ptr)cinfo, dest->image,
+         (JDIMENSION)row, (JDIMENSION)1, FALSE);
       red = rle_row[0];
       green = rle_row[1];
       blue = rle_row[2];
@@ -248,11 +249,11 @@
         *green++ = GETJSAMPLE(*output_row++);
         *blue++ = GETJSAMPLE(*output_row++);
       }
-      rle_putrow(rle_row, (int) cinfo->output_width, &header);
+      rle_putrow(rle_row, (int)cinfo->output_width, &header);
 #ifdef PROGRESS_REPORT
       if (progress != NULL) {
         progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
       }
 #endif
     }
@@ -276,14 +277,14 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_rle (j_decompress_ptr cinfo)
+jinit_write_rle(j_decompress_ptr cinfo)
 {
   rle_dest_ptr dest;
 
   /* Create module interface object, fill in method pointers */
   dest = (rle_dest_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(rle_dest_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(rle_dest_struct));
   dest->pub.start_output = start_output_rle;
   dest->pub.finish_output = finish_output_rle;
   dest->pub.calc_buffer_dimensions = NULL;
@@ -293,16 +294,16 @@
 
   /* Allocate a work array for output to the RLE library. */
   dest->rle_row = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     cinfo->output_width, (JDIMENSION) cinfo->output_components);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     cinfo->output_width, (JDIMENSION)cinfo->output_components);
 
   /* Allocate a virtual array to hold the image. */
   dest->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION) (cinfo->output_width * cinfo->output_components),
-     cinfo->output_height, (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+     (JDIMENSION)(cinfo->output_width * cinfo->output_components),
+     cinfo->output_height, (JDIMENSION)1);
 
-  return (djpeg_dest_ptr) dest;
+  return (djpeg_dest_ptr)dest;
 }
 
 #endif /* RLE_SUPPORTED */
diff --git a/wrtarga.c b/wrtarga.c
index 4db9313..9dfa920 100644
--- a/wrtarga.c
+++ b/wrtarga.c
@@ -45,7 +45,7 @@
 
 
 LOCAL(void)
-write_header (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, int num_colors)
+write_header(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo, int num_colors)
 /* Create and write a Targa header */
 {
   char targaheader[18];
@@ -55,15 +55,15 @@
 
   if (num_colors > 0) {
     targaheader[1] = 1;         /* color map type 1 */
-    targaheader[5] = (char) (num_colors & 0xFF);
-    targaheader[6] = (char) (num_colors >> 8);
+    targaheader[5] = (char)(num_colors & 0xFF);
+    targaheader[6] = (char)(num_colors >> 8);
     targaheader[7] = 24;        /* 24 bits per cmap entry */
   }
 
-  targaheader[12] = (char) (cinfo->output_width & 0xFF);
-  targaheader[13] = (char) (cinfo->output_width >> 8);
-  targaheader[14] = (char) (cinfo->output_height & 0xFF);
-  targaheader[15] = (char) (cinfo->output_height >> 8);
+  targaheader[12] = (char)(cinfo->output_width & 0xFF);
+  targaheader[13] = (char)(cinfo->output_width >> 8);
+  targaheader[14] = (char)(cinfo->output_height & 0xFF);
+  targaheader[15] = (char)(cinfo->output_height >> 8);
   targaheader[17] = 0x20;       /* Top-down, non-interlaced */
 
   if (cinfo->out_color_space == JCS_GRAYSCALE) {
@@ -79,7 +79,7 @@
     }
   }
 
-  if (JFWRITE(dinfo->output_file, targaheader, 18) != (size_t) 18)
+  if (JFWRITE(dinfo->output_file, targaheader, 18) != (size_t)18)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 }
 
@@ -90,11 +90,11 @@
  */
 
 METHODDEF(void)
-put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                JDIMENSION rows_supplied)
+put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+               JDIMENSION rows_supplied)
 /* used for unquantized full-color output */
 {
-  tga_dest_ptr dest = (tga_dest_ptr) dinfo;
+  tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   register JSAMPROW inptr;
   register char *outptr;
   register JDIMENSION col;
@@ -102,20 +102,20 @@
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    outptr[0] = (char) GETJSAMPLE(inptr[2]); /* RGB to BGR order */
-    outptr[1] = (char) GETJSAMPLE(inptr[1]);
-    outptr[2] = (char) GETJSAMPLE(inptr[0]);
+    outptr[0] = (char)GETJSAMPLE(inptr[2]); /* RGB to BGR order */
+    outptr[1] = (char)GETJSAMPLE(inptr[1]);
+    outptr[2] = (char)GETJSAMPLE(inptr[0]);
     inptr += 3, outptr += 3;
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 METHODDEF(void)
-put_gray_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-               JDIMENSION rows_supplied)
+put_gray_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+              JDIMENSION rows_supplied)
 /* used for grayscale OR quantized color output */
 {
-  tga_dest_ptr dest = (tga_dest_ptr) dinfo;
+  tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   register JSAMPROW inptr;
   register char *outptr;
   register JDIMENSION col;
@@ -123,9 +123,9 @@
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char) GETJSAMPLE(*inptr++);
+    *outptr++ = (char)GETJSAMPLE(*inptr++);
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
@@ -135,10 +135,10 @@
  */
 
 METHODDEF(void)
-put_demapped_gray (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                   JDIMENSION rows_supplied)
+put_demapped_gray(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                  JDIMENSION rows_supplied)
 {
-  tga_dest_ptr dest = (tga_dest_ptr) dinfo;
+  tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   register JSAMPROW inptr;
   register char *outptr;
   register JSAMPROW color_map0 = cinfo->colormap[0];
@@ -147,9 +147,9 @@
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char) GETJSAMPLE(color_map0[GETJSAMPLE(*inptr++)]);
+    *outptr++ = (char)GETJSAMPLE(color_map0[GETJSAMPLE(*inptr++)]);
   }
-  (void) JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
+  (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
 
@@ -158,9 +158,9 @@
  */
 
 METHODDEF(void)
-start_output_tga (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+start_output_tga(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  tga_dest_ptr dest = (tga_dest_ptr) dinfo;
+  tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   int num_colors, i;
   FILE *outfile;
 
@@ -202,7 +202,7 @@
  */
 
 METHODDEF(void)
-finish_output_tga (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+finish_output_tga(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   /* Make sure we wrote the output file OK */
   fflush(dinfo->output_file);
@@ -216,9 +216,9 @@
  */
 
 METHODDEF(void)
-calc_buffer_dimensions_tga (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+calc_buffer_dimensions_tga(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
-  tga_dest_ptr dest = (tga_dest_ptr) dinfo;
+  tga_dest_ptr dest = (tga_dest_ptr)dinfo;
 
   dest->buffer_width = cinfo->output_width * cinfo->output_components;
 }
@@ -229,14 +229,14 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_targa (j_decompress_ptr cinfo)
+jinit_write_targa(j_decompress_ptr cinfo)
 {
   tga_dest_ptr dest;
 
   /* Create module interface object, fill in method pointers */
   dest = (tga_dest_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(tga_dest_struct));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(tga_dest_struct));
   dest->pub.start_output = start_output_tga;
   dest->pub.finish_output = finish_output_tga;
   dest->pub.calc_buffer_dimensions = calc_buffer_dimensions_tga;
@@ -245,17 +245,17 @@
   jpeg_calc_output_dimensions(cinfo);
 
   /* Create I/O buffer. */
-  dest->pub.calc_buffer_dimensions (cinfo, (djpeg_dest_ptr) dest);
+  dest->pub.calc_buffer_dimensions(cinfo, (djpeg_dest_ptr)dest);
   dest->iobuffer = (char *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (size_t) (dest->buffer_width * sizeof(char)));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (size_t)(dest->buffer_width * sizeof(char)));
 
   /* Create decompressor output buffer. */
   dest->pub.buffer = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, dest->buffer_width, (JDIMENSION) 1);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, dest->buffer_width, (JDIMENSION)1);
   dest->pub.buffer_height = 1;
 
-  return (djpeg_dest_ptr) dest;
+  return (djpeg_dest_ptr)dest;
 }
 
 #endif /* TARGA_SUPPORTED */