Merge branch 'master' into dev
diff --git a/.travis.yml b/.travis.yml
index dfddbd6..6038e7d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,14 +33,19 @@
         CMAKE_FLAGS="-DWITH_12BIT=1"
         CTEST_OUTPUT_ON_FAILURE=1
     - os: linux
+      dist: bionic
       compiler: gcc
-      arch: ppc64le
       env:
-        CMAKE_FLAGS="-DWITH_JPEG7=1"
+        BUILD_X32=1
+        CMAKE_FLAGS="-DWITH_JPEG7=1 -DCMAKE_C_FLAGS=-mx32"
         CTEST_OUTPUT_ON_FAILURE=1
+      addons:
+        apt:
+          packages:
+            - nasm
     - os: linux
       compiler: gcc
-      arch: arm64
+      arch: ppc64le
       env:
         CMAKE_FLAGS="-DWITH_JPEG8=1"
         CTEST_OUTPUT_ON_FAILURE=1
@@ -51,20 +56,28 @@
         CFLAGS_RELWITHDEBINFO="-O3 -g -fsanitize=memory -fPIE"
         CMAKE_FLAGS="-DWITH_SIMD=0"
         CTEST_OUTPUT_ON_FAILURE=1
+    - os: linux
+      compiler: gcc
+      arch: arm64
+      env:
+        CTEST_OUTPUT_ON_FAILURE=1
+    - os: linux
+      compiler: clang
+      arch: arm64
+      env:
+        CTEST_OUTPUT_ON_FAILURE=1
 
 before_install:
+  - if [ "$BUILD_X32" = "1" ]; then
+      sudo apt -y --install-recommends install libc6-dev-x32;
+    fi
   - if [ "$TRAVIS_OS_NAME" = "osx" ]; then
       pushd $HOME/Downloads &&
-      curl -LO https://updates.cdn-apple.com/2019/cert/041-88384-20191011-3d8da658-dca4-4a5b-b67c-26e686876403/JavaForOSX.dmg &&
-      hdid JavaForOSX.dmg &&
-      sudo installer -pkg /Volumes/Java\ for\ macOS\ 2017-001/JavaForOSX.pkg -target / &&
-      hdiutil detach /Volumes/Java\ for\ macOS\ 2017-001 &&
       curl -LO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci &&
       . ./macports-ci install &&
-      sudo /opt/local/bin/port -N install gcc5 yasm md5sha1sum &&
+      sudo /opt/local/bin/port -N install yasm md5sha1sum &&
       popd &&
-      git clone --depth=1 https://github.com/libjpeg-turbo/gas-preprocessor.git ~/src/gas-preprocessor &&
-      ln -fs /Applications/Xcode.app /Applications/Xcode72.app;
+      ln -fs /Applications/Xcode.app /Applications/Xcode83.app;
     fi
   - if [ "${BUILD_OFFICIAL:-}" != "" ]; then
       if [ "$TRAVIS_OS_NAME" = "linux" ]; then
@@ -87,11 +100,11 @@
       if [ "$TRAVIS_OS_NAME" = "linux" ]; then
         mkdir $HOME/rpmkeys &&
         wget --no-check-certificate "http://www.libjpeg-turbo.org/key/LJTPR-GPG-KEY" -O $HOME/rpmkeys/LJTPR-GPG-KEY &&
-        docker run -v $HOME/src/ljt.nightly:/root/src/ljt.nightly -v $HOME/src/buildscripts:/root/src/buildscripts -v $TRAVIS_BUILD_DIR:/root/src/libjpeg-turbo -v $HOME/.gnupg:/root/.gnupg -v $HOME/rpmkeys:/rpmkeys -t dcommander/buildljt:latest bash -c "rpm --import /rpmkeys/LJTPR-GPG-KEY && ~/src/buildscripts/buildljt -d /root/src/libjpeg-turbo -v" &&
+        docker run -v $HOME/src/ljt.nightly:/root/src/ljt.nightly -v $HOME/src/buildscripts:/root/src/buildscripts -v $TRAVIS_BUILD_DIR:/root/src/libjpeg-turbo -v $HOME/.gnupg:/root/.gnupg -v $HOME/rpmkeys:/rpmkeys -t dcommander/buildljt:$TRAVIS_BRANCH bash -c "rpm --import /rpmkeys/LJTPR-GPG-KEY && ~/src/buildscripts/buildljt -d /root/src/libjpeg-turbo -v" &&
         sudo chown -R travis:travis ~/src/ljt.nightly &&
         mv ~/src/ljt.nightly/latest/log-$TRAVIS_OS_NAME.txt ~/src/ljt.nightly/latest/files/;
       else
-        PATH=$PATH:~/src/gas-preprocessor ~/src/buildscripts/buildljt -d $TRAVIS_BUILD_DIR -v &&
+        ~/src/buildscripts/buildljt -d $TRAVIS_BUILD_DIR -v &&
         mv ~/src/ljt.nightly/latest/log-$TRAVIS_OS_NAME.txt ~/src/ljt.nightly/latest/files/;
       fi
     fi
diff --git a/BUILDING.md b/BUILDING.md
index 2aef181..a119daa 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -12,10 +12,7 @@
 
 - [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
   (if building x86 or x86-64 SIMD extensions)
-  * If using NASM, 2.10 or later is required.
-  * If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
-    build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
-    when building macho64 objects.)
+  * If using NASM, 2.13 or later is required.
   * If using YASM, 1.2.0 or later is required.
   * If building on macOS, NASM or YASM can be obtained from
     [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
@@ -49,10 +46,8 @@
 
 - If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
   required.  Most modern Linux distributions, as well as Solaris 10 and later,
-  include JDK or OpenJDK.  On OS X 10.5 and 10.6, it will be necessary to
-  install the Java Developer Package, which can be downloaded from
-  <http://developer.apple.com/downloads> (Apple ID required.)  For other
-  systems, you can obtain the Oracle Java Development Kit from
+  include JDK or OpenJDK.  For other systems, you can obtain the Oracle Java
+  Development Kit from
   <http://www.oracle.com/technetwork/java/javase/downloads>.
 
   * If using JDK 11 or later, CMake 3.10.x or later must also be used.
@@ -62,22 +57,22 @@
 - Microsoft Visual C++ 2005 or later
 
   If you don't already have Visual C++, then the easiest way to get it is by
-  installing the
-  [Windows SDK](http://msdn.microsoft.com/en-us/windows/bb980924.aspx).
-  The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
-  everything necessary to build libjpeg-turbo.
+  installing
+  [Visual Studio Community Edition](https://visualstudio.microsoft.com),
+  which includes everything necessary to build libjpeg-turbo.
 
-  * You can also use Microsoft Visual Studio Express/Community Edition, which
-    is a free download.  (NOTE: versions prior to 2012 can only be used to
-    build 32-bit code.)
+  * You can also download and install the standalone Windows SDK (for Windows 7
+    or later), which includes command-line versions of the 32-bit and 64-bit
+    Visual C++ compilers.
   * If you intend to build libjpeg-turbo from the command line, then add the
     appropriate compiler and SDK directories to the `INCLUDE`, `LIB`, and
     `PATH` environment variables.  This is generally accomplished by
-    executing `vcvars32.bat` or `vcvars64.bat` and `SetEnv.cmd`.
-    `vcvars32.bat` and `vcvars64.bat` are part of Visual C++ and are located in
-    the same directory as the compiler.  `SetEnv.cmd` is part of the Windows
-    SDK.  You can pass optional arguments to `SetEnv.cmd` to specify a 32-bit
-    or 64-bit build environment.
+    executing `vcvars32.bat` or `vcvars64.bat`, which are located in the same
+    directory as the compiler (see the example below this list).
+  * If built with Visual C++ 2015 or later, the libjpeg-turbo static libraries
+    cannot be used with earlier versions of Visual C++, and vice versa.
+  * The libjpeg API DLL (**jpeg{version}.dll**) will depend on the C run-time
+    DLLs corresponding to the version of Visual C++ that was used to build it.
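+
+  As a hedged example (the Visual Studio path below matches VS 2015 and will
+  vary with other versions), a 64-bit command-line build might look like:
+
+      "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvars64.bat"
+      cd {build_directory}
+      cmake -G"NMake Makefiles" -DCMAKE_BUILD_TYPE=Release {source_directory}
+      nmake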
 
    ... OR ...
 
@@ -333,7 +328,7 @@
 -------------
 
 
-### 32-bit Build on 64-bit Linux/Unix/Mac
+### 32-bit Build on 64-bit Linux/Unix
 
 Use export/setenv to set the following environment variables before running
 CMake:
@@ -405,103 +400,9 @@
 for these platforms.
 
 
-### Additional build requirements
-
-- For configurations that require [gas-preprocessor.pl]
-  (https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl),
-  it should be installed in your `PATH`.
-
-
-### Armv7 (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 3GS-4S/iPad 1st-3rd Generation and newer:
-
-#### Xcode 4.2 and earlier (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -miphoneos-version-min=3.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 4.3-4.6 (LLVM-GCC)
-
-Same as above, but replace the first line with:
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-
-#### Xcode 5 and later (Clang)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -arch armv7 -miphoneos-version-min=3.0"
-    export ASMFLAGS="-no-integrated-as"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-
-### Armv7s (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 5/iPad 4th Generation and newer:
-
-#### Xcode 4.5-4.6 (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-Wall -mfloat-abi=softfp -march=armv7s -mcpu=swift -mtune=swift -mfpu=neon -miphoneos-version-min=6.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 5 and later (Clang)
-
-Same as the Armv7 build procedure for Xcode 5 and later, except replace the
-compiler flags as follows:
-
-    export CFLAGS="-Wall -mfloat-abi=softfp -arch armv7s -miphoneos-version-min=6.0"
-
-
 ### Armv8 (64-bit)
 
-**gas-preprocessor.pl required if using Xcode < 6**
+**Xcode 5 or later required, Xcode 6.3.x or later recommended**
 
 The following script demonstrates how to build libjpeg-turbo to run on the
 iPhone 5S/iPad Mini 2/iPad Air and newer.
@@ -523,9 +424,6 @@
       [additional CMake flags] {source_directory}
     make
 
-Once built, lipo can be used to combine the Armv7, v7s, and/or v8 variants into
-a universal library.
-
 
 Building libjpeg-turbo for Android
 ----------------------------------
@@ -536,6 +434,8 @@
 
 ### Armv7 (32-bit)
 
+**NDK r19 or later with Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -561,6 +461,8 @@
 
 ### Armv8 (64-bit)
 
+**Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -735,44 +637,22 @@
     make dmg
 
 Create Mac package/disk image.  This requires pkgbuild and productbuild, which
-are installed by default on OS X 10.7 and later and which can be obtained by
-installing Xcode 3.2.6 (with the "Unix Development" option) on OS X 10.6.
-Packages built in this manner can be installed on OS X 10.5 and later, but they
-must be built on OS X 10.6 or later.
+are installed by default on OS X 10.7 and later.
 
-    make udmg
+In order to create a Mac package/disk image that contains universal
+x86-64/Arm binaries, set the following CMake variable:
 
-This creates a Mac package/disk image that contains universal x86-64/i386/Arm
-binaries.  The following CMake variables control which architectures are
-included in the universal binaries.  Setting any of these variables to an empty
-string excludes that architecture from the package.
-
-* `OSX_32BIT_BUILD`: Directory containing an i386 (32-bit) Mac build of
-  libjpeg-turbo (default: *{source_directory}*/osxx86)
-* `IOS_ARMV7_BUILD`: Directory containing an Armv7 (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7)
-* `IOS_ARMV7S_BUILD`: Directory containing an Armv7s (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7s)
 * `IOS_ARMV8_BUILD`: Directory containing an Armv8 (64-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv8)
+  libjpeg-turbo to include in the universal binaries
 
-You should first use CMake to configure i386, Armv7, Armv7s, and/or Armv8
-sub-builds of libjpeg-turbo (see "Build Recipes" and "Building libjpeg-turbo
-for iOS" above) in build directories that match those specified in the
-aforementioned CMake variables.  Next, configure the primary build of
-libjpeg-turbo as an out-of-tree build, and build it.  Once the primary build
-has been built, run `make udmg` from the build directory.  The packaging system
-will build the sub-builds, use lipo to combine them into a single set of
-universal binaries, then package the universal binaries in the same manner as
-`make dmg`.
-
-
-Cygwin
-------
-
-    make cygwinpkg
-
-Build a Cygwin binary package.
+You should first use CMake to configure an Armv8 sub-build of libjpeg-turbo
+(see "Building libjpeg-turbo for iOS" above) in a build directory that matches
+the one specified in the aforementioned CMake variable.  Next, configure the
+primary (x86-64) build of libjpeg-turbo as an out-of-tree build, specifying the
+aforementioned CMake variable, and build it.  Once the primary build has been
+built, run `make dmg` from the build directory.  The packaging system will
+build the sub-build, use lipo to combine it with the primary build into a
+single set of universal binaries, then package the universal binaries.
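+
+For example (a hedged sketch; the sub-build directory name is arbitrary):
+
+    # Configure (but do not build) the Armv8 sub-build, as described in
+    # "Building libjpeg-turbo for iOS" above
+    mkdir -p {build_directory}/iosarmv8
+
+    # Configure and build the primary (x86-64) build, then package
+    cd {build_directory}
+    cmake -G"Unix Makefiles" \
+      -DIOS_ARMV8_BUILD={build_directory}/iosarmv8 \
+      [additional CMake flags] {source_directory}
+    make
+    make dmg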
 
 
 Windows
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b6ed7b5..50d6490 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 endif()
 
 project(libjpeg-turbo C)
-set(VERSION 2.0.6)
+set(VERSION 2.0.80)
 string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
 list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
 list(GET VERSION_TRIPLET 1 VERSION_MINOR)
@@ -46,7 +46,7 @@
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "i[0-9]86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ia32")
-  if(BITS EQUAL 64)
+  if(BITS EQUAL 64 OR CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
     set(CPU_TYPE x86_64)
   else()
     set(CPU_TYPE i386)
@@ -84,7 +84,9 @@
     set(CMAKE_INSTALL_DEFAULT_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}64")
   endif()
 else()
-  set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  if(NOT CMAKE_INSTALL_DEFAULT_PREFIX)
+    set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  endif()
 endif()
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}" CACHE PATH
@@ -103,6 +105,8 @@
   if(UNIX AND NOT APPLE)
     if(BITS EQUAL 64)
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+    elseif(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+      set(CMAKE_INSTALL_DEFAULT_LIBDIR "libx32")
     else()
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32")
     endif()
@@ -135,9 +139,9 @@
 
 macro(boolean_number var)
   if(${var})
-    set(${var} 1)
+    set(${var} 1 ${ARGN})
   else()
-    set(${var} 0)
+    set(${var} 0 ${ARGN})
   endif()
 endmacro()
 
@@ -153,8 +157,12 @@
 boolean_number(WITH_ARITH_DEC)
 option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
 boolean_number(WITH_ARITH_ENC)
-option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
-boolean_number(WITH_JAVA)
+if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+  set(WITH_JAVA 0)
+else()
+  option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
+  boolean_number(WITH_JAVA)
+endif()
 option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
 boolean_number(WITH_JPEG7)
 option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
@@ -416,13 +424,6 @@
         exit(is_shifting_signed(-0x7F7E80B1L));
       }" RIGHT_SHIFT_IS_UNSIGNED)
   endif()
-
-  if(CMAKE_CROSSCOMPILING)
-    set(__CHAR_UNSIGNED__ 0)
-  else()
-    check_c_source_runs("int main(void) { return ((char) -1 < 0); }"
-      __CHAR_UNSIGNED__)
-  endif()
 endif()
 
 if(MSVC)
@@ -548,6 +549,9 @@
 
 if(WITH_SIMD)
   add_subdirectory(simd)
+  if(NEON_INTRINSICS)
+    add_definitions(-DNEON_INTRINSICS)
+  endif()
 elseif(NOT WITH_12BIT)
   message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
 endif()
@@ -698,6 +702,8 @@
   set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
   set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
   set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
+  set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
+  set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
   set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
   set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
   set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
@@ -746,6 +752,8 @@
   set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
   set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
   set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
+  set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
+  set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
   set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
   set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
   set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
@@ -774,29 +782,7 @@
   set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
   set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
   set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
-  # Since v1.5.1, libjpeg-turbo uses the separate non-fancy upsampling and
-  # YCbCr -> RGB color conversion routines rather than merged upsampling/color
-  # conversion when fancy upsampling is disabled on platforms that have a SIMD
-  # implementation of YCbCr -> RGB color conversion but no SIMD implementation
-  # of merged upsampling/color conversion.  This was intended to improve the
-  # performance of the Arm Neon SIMD extensions, the only SIMD extensions for
-  # which those circumstances currently apply.  The separate non-fancy
-  # upsampling and color conversion routines usually produce bitwise-identical
-  # output to the merged upsampling/color conversion routines, but that is not
-  # the case when skipping scanlines starting at an odd-numbered scanline.  In
-  # libjpeg-turbo 2.0.5 and prior, doing that while using merged h2v2
-  # upsampling caused a segfault, so this test validates the fix for that
-  # segfault.  Unfortunately, however, the test also produces different bitwise
-  # output when using the Neon SIMD extensions, because of the aforementioned
-  # optimization.  The easiest workaround is to use the old test from
-  # libjpeg-turbo 2.0.5 and prior when using the Neon SIMD extensions.  The
-  # aforementioned segfault never would have occurred with the Neon SIMD
-  # extensions anyhow, since merged upsampling is disabled when using them.
-  if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
-    set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
-  else()
-    set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
-  endif()
+  set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
   set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
   set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
   set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
@@ -1079,6 +1065,16 @@
     testout_422_ifast.ppm testout_422_ifast_opt.jpg
     ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
 
+  # CC: RGB->YCC  SAMP: fullsize/h1v2  FDCT: islow  ENT: huff
+  add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
+    testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_440_ISLOW})
+
+  # CC: YCC->RGB  SAMP: fullsize/h1v2 fancy  IDCT: islow  ENT: huff
+  add_bittest(djpeg 440-islow "-dct;int"
+    testout_440_islow.ppm testout_440_islow.jpg
+    ${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
+
   # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
   add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
     testout_422m_ifast.ppm testout_422_ifast_opt.jpg
@@ -1187,17 +1183,9 @@
 
   if(WITH_ARITH_DEC)
     # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
-    if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
-      # Refer to the comment above the definition of MD5_PPM_420M_IFAST_ARI for
-      # an explanation of why this is necessary.
-      add_bittest(djpeg 420m-ifast-ari "-fast;-ppm"
-        testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
-        ${MD5_PPM_420M_IFAST_ARI})
-    else()
-      add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
-        testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
-        ${MD5_PPM_420M_IFAST_ARI})
-    endif()
+    add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
+      testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420M_IFAST_ARI})
 
     add_bittest(jpegtran 420-islow ""
       testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
@@ -1403,10 +1391,13 @@
 
 if(WITH_TURBOJPEG)
   if(ENABLE_SHARED)
-    install(TARGETS turbojpeg tjbench
+    install(TARGETS turbojpeg EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
       RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    install(TARGETS tjbench
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
     if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
       CMAKE_C_LINKER_SUPPORTS_PDB)
       install(FILES "$<TARGET_PDB_FILE:turbojpeg>"
@@ -1414,8 +1405,9 @@
     endif()
   endif()
   if(ENABLE_STATIC)
-    install(TARGETS turbojpeg-static ARCHIVE
-      DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    install(TARGETS turbojpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
     if(NOT ENABLE_SHARED)
       if(MSVC_IDE OR XCODE)
         set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
@@ -1431,7 +1423,9 @@
 endif()
 
 if(ENABLE_STATIC)
-  install(TARGETS jpeg-static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(TARGETS jpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
   if(NOT ENABLE_SHARED)
     if(MSVC_IDE OR XCODE)
       set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
@@ -1471,6 +1465,13 @@
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
   ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}Config.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}ConfigVersion.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
+install(EXPORT ${CMAKE_PROJECT_NAME}Targets
+  NAMESPACE ${CMAKE_PROJECT_NAME}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
 
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
   ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
diff --git a/ChangeLog.md b/ChangeLog.md
index 7b5f875..f830aba 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,136 @@
+2.1 pre-beta
+============
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers.  The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+     Caveats:
+     - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+     - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
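+
+    As a rough sketch (assuming GCC and a Debian/Ubuntu system on which the
+`libc6-dev-x32` package is installed), an x32 build might be configured as
+follows:
+
+        cd {build_directory}
+        cmake -G"Unix Makefiles" -DCMAKE_C_FLAGS=-mx32 {source_directory}
+        make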
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms.  Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+     - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+     - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+     - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+     - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+     - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+    The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds.  32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds.  32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. When decompressing progressive Huffman-encoded JPEG images, the block
+smoothing algorithm that the libjpeg API library optionally applies is now more
+fault-tolerant.  Previously, if a particular scan was incomplete, then the
+smoothing parameters for the incomplete scan would be applied to the entire
+output image, including the parts of the image that were generated by the prior
+(complete) scan.  Visually, this had the effect of removing block smoothing
+from lower-frequency scans if they were followed by an incomplete
+higher-frequency scan.  libjpeg-turbo now applies block smoothing parameters to
+each iMCU row based on which scan generated the pixels in that row, rather than
+always using the block smoothing parameters for the most recent scan.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+     - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+     - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
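+
+    For example (the scan limit shown here is arbitrary), the following
+command aborts if the input image contains more than 100 scans and treats all
+warnings as fatal:
+
+        djpeg -maxscans 100 -strict -outfile output.ppm input.jpg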
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries.  This facilitates using libjpeg-turbo with CMake's
+`find_package()` function.  For example:
+
+        find_package(libjpeg-turbo CONFIG REQUIRED)
+
+        add_executable(libjpeg_program libjpeg_program.c)
+        target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+        add_executable(libjpeg_program_static libjpeg_program.c)
+        target_link_libraries(libjpeg_program_static PUBLIC
+          libjpeg-turbo::jpeg-static)
+
+        add_executable(turbojpeg_program turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program PUBLIC
+          libjpeg-turbo::turbojpeg)
+
+        add_executable(turbojpeg_program_static turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program_static PUBLIC
+          libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
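+
+    For example, cjpeg can now compress a GIF source image directly, and
+djpeg can write either an LZW-compressed (`-gif`) or an uncompressed (`-gif0`)
+GIF file:
+
+        cjpeg -outfile output.jpg input.gif
+        djpeg -gif -outfile output.gif input.jpg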
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option.  Refer to jpegtran.1 or usage.txt for more details.
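+
+    As a hedged example (refer to jpegtran.1 for the exact geometry syntax),
+the canvas of a JPEG image could be losslessly expanded beyond its original
+dimensions with:
+
+        jpegtran -crop 1200x1000+0+0 -outfile expanded.jpg input.jpg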
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms.  This new implementation is
+significantly faster in some cases than the old GAS implementation,
+depending on the algorithms used, the type of CPU core, and the compiler.  GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+     - 32-bit RGB-to-YCbCr color conversion
+     - 32-bit fast and accurate inverse DCT
+     - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+     - 64-bit accurate forward and inverse DCT
+     - 64-bit Huffman encoding
+
+    A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
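+
+    For example, to force the use of the full intrinsics implementation when
+building with GCC:
+
+        cmake -G"Unix Makefiles" -DNEON_INTRINSICS=1 {source_directory}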
+
+    Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+
 2.0.6
 =====
 
diff --git a/README.ijg b/README.ijg
index d681cf1..9453c19 100644
--- a/README.ijg
+++ b/README.ijg
@@ -128,7 +128,7 @@
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -159,19 +159,6 @@
 assumed by the product vendor.
 
 
-The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
-support has been removed altogether, and the GIF writer has been simplified
-to produce "uncompressed GIFs".  This technique does not use the LZW
-algorithm; the resulting GIF files are larger than usual, but are readable
-by all standard GIF decoders.
-
-We are required to state that
-    "The Graphics Interchange Format(c) is the Copyright property of
-    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
-    CompuServe Incorporated."
-
-
 REFERENCES
 ==========
 
diff --git a/README.md b/README.md
index 90a4a43..01e391e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/appveyor.yml b/appveyor.yml
index 36af19b..1e5f557 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -20,16 +20,18 @@
 
       7z x c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z -o"c:\Program Files\mingw-w64\x86_64-6.4.0-posix-seh-rt_v5-rev0" > c:\installers\mingw64.install.log
 
-      set INCLUDE=c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\include
+      set PATH=c:\nasm-2.10.01;c:\Program Files (x86)\NSIS;c:\msys64\usr\bin;%PATH%
 
-      set LIB=c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\lib\x64
+      "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvars64.bat"
 
-      set PATH=c:\nasm-2.10.01;c:\Program Files (x86)\NSIS;c:\msys64\usr\bin;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\amd64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\Common7\IDE;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\bin\x64;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\bin;%PATH%
+      set INCLUDE
+
+      set LIB
+
+      set PATH
 
       set MSYSTEM=MINGW32
 
-      bash -c "pacman --noconfirm -S zip"
-
       mklink /d "%ProgramData%\Oracle\Java32" "c:\Program Files (x86)\Java\jdk1.6.0"
 
       git clone --depth=1 https://github.com/libjpeg-turbo/buildscripts.git -b %APPVEYOR_REPO_BRANCH% c:/buildscripts
@@ -49,8 +51,6 @@
 
       move c:\ljt.nightly\files\*.exe .
 
-      move c:\ljt.nightly\files\*.zip .
-
       move c:\ljt.nightly\log-windows.txt .
 
 artifacts:
@@ -63,9 +63,6 @@
   - path: '*-vc*.exe'
     name: SDK for Visual C++
 
-  - path: '*.zip'
-    name: Windows JNI JARs
-
   - path: 'log-windows.txt'
     name: Build log
 
diff --git a/cderror.h b/cderror.h
index 4f2c7a3..a386b69 100644
--- a/cderror.h
+++ b/cderror.h
@@ -42,7 +42,7 @@
 
 #ifdef BMP_SUPPORTED
 JMESSAGE(JERR_BMP_BADCMAP, "Unsupported BMP colormap format")
-JMESSAGE(JERR_BMP_BADDEPTH, "Only 8- and 24-bit BMP files are supported")
+JMESSAGE(JERR_BMP_BADDEPTH, "Only 8-, 24-, and 32-bit BMP files are supported")
 JMESSAGE(JERR_BMP_BADHEADER, "Invalid BMP file: bad header length")
 JMESSAGE(JERR_BMP_BADPLANES, "Invalid BMP file: biPlanes not equal to 1")
 JMESSAGE(JERR_BMP_COLORSPACE, "BMP output must be grayscale or RGB")
@@ -50,9 +50,9 @@
 JMESSAGE(JERR_BMP_EMPTY, "Empty BMP image")
 JMESSAGE(JERR_BMP_NOT, "Not a BMP file - does not start with BM")
 JMESSAGE(JERR_BMP_OUTOFRANGE, "Numeric value out of range in BMP file")
-JMESSAGE(JTRC_BMP, "%ux%u 24-bit BMP image")
+JMESSAGE(JTRC_BMP, "%ux%u %d-bit BMP image")
 JMESSAGE(JTRC_BMP_MAPPED, "%ux%u 8-bit colormapped BMP image")
-JMESSAGE(JTRC_BMP_OS2, "%ux%u 24-bit OS2 BMP image")
+JMESSAGE(JTRC_BMP_OS2, "%ux%u %d-bit OS2 BMP image")
 JMESSAGE(JTRC_BMP_OS2_MAPPED, "%ux%u 8-bit colormapped OS2 BMP image")
 #endif /* BMP_SUPPORTED */
 
@@ -84,23 +84,6 @@
 JMESSAGE(JTRC_PPM_TEXT, "%ux%u text PPM image")
 #endif /* PPM_SUPPORTED */
 
-#ifdef RLE_SUPPORTED
-JMESSAGE(JERR_RLE_BADERROR, "Bogus error code from RLE library")
-JMESSAGE(JERR_RLE_COLORSPACE, "RLE output must be grayscale or RGB")
-JMESSAGE(JERR_RLE_DIMENSIONS, "Image dimensions (%ux%u) too large for RLE")
-JMESSAGE(JERR_RLE_EMPTY, "Empty RLE file")
-JMESSAGE(JERR_RLE_EOF, "Premature EOF in RLE header")
-JMESSAGE(JERR_RLE_MEM, "Insufficient memory for RLE header")
-JMESSAGE(JERR_RLE_NOT, "Not an RLE file")
-JMESSAGE(JERR_RLE_TOOMANYCHANNELS, "Cannot handle %d output channels for RLE")
-JMESSAGE(JERR_RLE_UNSUPPORTED, "Cannot handle this RLE setup")
-JMESSAGE(JTRC_RLE, "%ux%u full-color RLE file")
-JMESSAGE(JTRC_RLE_FULLMAP, "%ux%u full-color RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_GRAY, "%ux%u grayscale RLE file")
-JMESSAGE(JTRC_RLE_MAPGRAY, "%ux%u grayscale RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_MAPPED, "%ux%u colormapped RLE file with map of length %d")
-#endif /* RLE_SUPPORTED */
-
 #ifdef TARGA_SUPPORTED
 JMESSAGE(JERR_TGA_BADCMAP, "Unsupported Targa colormap format")
 JMESSAGE(JERR_TGA_BADPARMS, "Invalid or unsupported Targa file")
diff --git a/cdjpeg.c b/cdjpeg.c
index e0e382d..5278c1d 100644
--- a/cdjpeg.c
+++ b/cdjpeg.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,26 +25,37 @@
  * Optional progress monitor: display a percent-done figure on stderr.
  */
 
-#ifdef PROGRESS_REPORT
-
 METHODDEF(void)
 progress_monitor(j_common_ptr cinfo)
 {
   cd_progress_ptr prog = (cd_progress_ptr)cinfo->progress;
-  int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done =
-    (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
 
-  if (percent_done != prog->percent_done) {
-    prog->percent_done = percent_done;
-    if (total_passes > 1) {
-      fprintf(stderr, "\rPass %d/%d: %3d%% ",
-              prog->pub.completed_passes + prog->completed_extra_passes + 1,
-              total_passes, percent_done);
-    } else {
-      fprintf(stderr, "\r %3d%% ", percent_done);
+  if (prog->max_scans != 0 && cinfo->is_decompressor) {
+    int scan_no = ((j_decompress_ptr)cinfo)->input_scan_number;
+
+    if (scan_no > (int)prog->max_scans) {
+      fprintf(stderr, "Scan number %d exceeds maximum scans (%d)\n", scan_no,
+              prog->max_scans);
+      exit(EXIT_FAILURE);
     }
-    fflush(stderr);
+  }
+
+  if (prog->report) {
+    int total_passes = prog->pub.total_passes + prog->total_extra_passes;
+    int percent_done =
+      (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
+
+    if (percent_done != prog->percent_done) {
+      prog->percent_done = percent_done;
+      if (total_passes > 1) {
+        fprintf(stderr, "\rPass %d/%d: %3d%% ",
+                prog->pub.completed_passes + prog->completed_extra_passes + 1,
+                total_passes, percent_done);
+      } else {
+        fprintf(stderr, "\r %3d%% ", percent_done);
+      }
+      fflush(stderr);
+    }
   }
 }
 
@@ -57,6 +68,8 @@
     progress->pub.progress_monitor = progress_monitor;
     progress->completed_extra_passes = 0;
     progress->total_extra_passes = 0;
+    progress->max_scans = 0;
+    progress->report = FALSE;
     progress->percent_done = -1;
     cinfo->progress = &progress->pub;
   }
@@ -73,8 +86,6 @@
   }
 }
 
-#endif
-
 
 /*
  * Case-insensitive matching of possibly-abbreviated keyword switches.
diff --git a/cdjpeg.h b/cdjpeg.h
index 8f35714..71b0c4f 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -3,8 +3,9 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -56,9 +57,9 @@
   void (*finish_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
   /* Re-calculate buffer dimensions based on output dimensions (for use with
      partial image decompression.)  If this is NULL, then the output format
-     does not support partial image decompression (BMP and RLE, in particular,
-     cannot support partial decompression because they use an inversion buffer
-     to write the image in bottom-up order.) */
+     does not support partial image decompression (BMP, in particular, cannot
+     support partial decompression because it uses an inversion buffer to write
+     the image in bottom-up order.) */
   void (*calc_buffer_dimensions) (j_decompress_ptr cinfo,
                                   djpeg_dest_ptr dinfo);
 
@@ -87,6 +88,9 @@
   struct jpeg_progress_mgr pub; /* fields known to JPEG library */
   int completed_extra_passes;   /* extra passes completed */
   int total_extra_passes;       /* total extra */
+  JDIMENSION max_scans;         /* abort if the number of scans exceeds this
+                                   value and the value is non-zero */
+  boolean report;               /* whether or not to report progress */
   /* last printed percentage stored here to avoid multiple printouts */
   int percent_done;
 };
@@ -101,11 +105,9 @@
 EXTERN(djpeg_dest_ptr) jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
                                        boolean use_inversion_array);
 EXTERN(cjpeg_source_ptr) jinit_read_gif(j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo, boolean is_lzw);
 EXTERN(cjpeg_source_ptr) jinit_read_ppm(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_ppm(j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_rle(j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_rle(j_decompress_ptr cinfo);
 EXTERN(cjpeg_source_ptr) jinit_read_targa(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_targa(j_decompress_ptr cinfo);
 
diff --git a/change.log b/change.log
index f090d77..e4d0ddc 100644
--- a/change.log
+++ b/change.log
@@ -6,6 +6,25 @@
 CHANGE LOG for Independent JPEG Group's JPEG software
 
 
+Version 9d  12-Jan-2020
+-----------------------
+
+Restore GIF read and write support from libjpeg version 6a.
+Thanks to Wolfgang Werner (W.W.) Heinz for the suggestion.
+
+Add jpegtran -drop option; add options to the crop extension and the
+wipe option to fill the extra area with content from the source image
+region instead of graying it out.
+
+
+Version 9c  14-Jan-2018
+-----------------------
+
+jpegtran: add an option to the -wipe switch to fill the region
+with the average of adjacent blocks instead of graying it out.
+Thanks to Caitlyn Feddock and Maddie Ziegler for the inspiration.
+
+
 Version 9b  17-Jan-2016
 -----------------------
 
@@ -13,6 +32,13 @@
 Thank to Michele Martone for suggestion.
 
 
+Version 9a  19-Jan-2014
+-----------------------
+
+Add jpegtran -wipe option and extension for -crop.
+Thanks to Andrew Senior, David Clunie, and Josef Schmid for the suggestion.
+
+
 Version 9  13-Jan-2013
 ----------------------
 
@@ -138,11 +164,6 @@
 
 Huffman tables are checked for validity much more carefully than before.
 
-To avoid the Unisys LZW patent, djpeg's GIF output capability has been
-changed to produce "uncompressed GIFs", and cjpeg's GIF input capability
-has been removed altogether.  We're not happy about it either, but there
-seems to be no good alternative.
-
 The configure script now supports building libjpeg as a shared library
 on many flavors of Unix (all the ones that GNU libtool knows how to
 build shared libraries for).  Use "./configure --enable-shared" to
diff --git a/cjpeg.1 b/cjpeg.1
index 587ad02..569dc3f 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -16,8 +16,7 @@
 compresses the named image file, or the standard input if no file is
 named, and produces a JPEG/JFIF file on the standard output.
 The currently supported input file formats are: PPM (PBMPLUS color
-format), PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster
-Toolkit format).  (RLE is supported only if the URT library is available.)
+format), PGM (PBMPLUS grayscale format), BMP, GIF, and Targa.
 .SH OPTIONS
 All switch names may be abbreviated; for example,
 .B \-grayscale
@@ -42,10 +41,10 @@
 .TP
 .B \-grayscale
 Create monochrome JPEG file from color input.  Be sure to use this switch when
-compressing a grayscale BMP file, because
+compressing a grayscale BMP or GIF file, because
 .B cjpeg
-isn't bright enough to notice whether a BMP file uses only shades of gray.
-By saying
+isn't bright enough to notice whether a BMP or GIF file uses only shades of
+gray.  By saying
 .BR \-grayscale,
 you'll get a smaller JPEG file that takes less time to process.
 .TP
@@ -224,6 +223,9 @@
 way of testing the in-memory destination manager (jpeg_mem_dest()), but it is
 also useful for benchmarking, since it reduces the I/O overhead.
 .TP
+.BI \-report
+Report compression progress.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
@@ -350,11 +352,6 @@
 relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
 features not present in libjpeg.
 .SH ISSUES
-Support for GIF input files was removed in cjpeg v6b due to concerns over
-the Unisys LZW patent.  Although this patent expired in 2006, cjpeg still
-lacks GIF support, for these historical reasons.  (Conversion of GIF files to
-JPEG is usually a bad idea anyway, since GIF is a 256-color format.)
-.PP
 Not all variants of BMP and Targa file formats are supported.
 .PP
 The
diff --git a/cjpeg.c b/cjpeg.c
index 5678195..e5a9211 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013-2014, 2017, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2013-2014, 2017, 2019-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -69,9 +69,9 @@
  *     2) assume we can push back more than one character (works in
  *        some C implementations, but unportable);
  *     3) provide our own buffering (breaks input readers that want to use
- *        stdio directly, such as the RLE library);
+ *        stdio directly);
  * or  4) don't put back the data, and modify the input_init methods to assume
- *        they start reading after the start of file (also breaks RLE library).
+ *        they start reading after the start of file.
  * #1 is attractive for MS-DOS but is untenable on Unix.
  *
  * The most portable solution for file types that can't be identified by their
@@ -117,10 +117,6 @@
   case 'P':
     return jinit_read_ppm(cinfo);
 #endif
-#ifdef RLE_SUPPORTED
-  case 'R':
-    return jinit_read_rle(cinfo);
-#endif
 #ifdef TARGA_SUPPORTED
   case 0x00:
     return jinit_read_targa(cinfo);
@@ -147,6 +143,7 @@
 static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memdst;                 /* for -memdst switch */
+boolean report;                 /* for -report switch */
 
 
 LOCAL(void)
@@ -200,6 +197,7 @@
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
+  fprintf(stderr, "  -report        Report compression progress\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -244,6 +242,7 @@
   icc_filename = NULL;
   outfilename = NULL;
   memdst = FALSE;
+  report = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -395,6 +394,9 @@
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -505,9 +507,7 @@
 {
   struct jpeg_compress_struct cinfo;
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
-#endif
   int file_index;
   cjpeg_source_ptr src_mgr;
   FILE *input_file;
@@ -628,9 +628,10 @@
     fclose(icc_file);
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
-#endif
+  if (report) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+  }
 
   /* Figure out the input file format, and set up to read it. */
   src_mgr = select_file_type(&cinfo, input_file);
@@ -676,9 +677,8 @@
   if (output_file != stdout && output_file != NULL)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memdst) {
     fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
diff --git a/cmakescripts/BuildPackages.cmake b/cmakescripts/BuildPackages.cmake
index 277c72f..367b78a 100644
--- a/cmakescripts/BuildPackages.cmake
+++ b/cmakescripts/BuildPackages.cmake
@@ -78,12 +78,14 @@
 
 if(MSVC)
   set(INST_PLATFORM "Visual C++")
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
+  set(INST_ID vc)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-${INST_ID})
   set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
 elseif(MINGW)
   set(INST_PLATFORM GCC)
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
+  set(INST_ID gcc)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-${INST_ID})
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-${INST_ID})
   set(INST_DEFS -DGCC)
 endif()
 
@@ -107,6 +109,12 @@
 string(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
 
 configure_file(release/installer.nsi.in installer.nsi @ONLY)
+# TODO: It would be nice to eventually switch to CPack and eliminate this mess,
+# but not today.
+configure_file(win/projectTargets.cmake.in
+  win/${CMAKE_PROJECT_NAME}Targets.cmake @ONLY)
+configure_file(win/${INST_ID}/projectTargets-release.cmake.in
+  win/${CMAKE_PROJECT_NAME}Targets-release.cmake @ONLY)
 
 if(WITH_JAVA)
   set(JAVA_DEPEND turbojpeg-java)
@@ -121,36 +129,13 @@
 
 
 ###############################################################################
-# Cygwin Package
-###############################################################################
-
-if(CYGWIN)
-
-configure_file(release/makecygwinpkg.in pkgscripts/makecygwinpkg)
-
-add_custom_target(cygwinpkg pkgscripts/makecygwinpkg)
-
-endif() # CYGWIN
-
-
-###############################################################################
 # Mac DMG
 ###############################################################################
 
 if(APPLE)
 
-set(DEFAULT_OSX_32BIT_BUILD ${CMAKE_SOURCE_DIR}/osxx86)
-set(OSX_32BIT_BUILD ${DEFAULT_OSX_32BIT_BUILD} CACHE PATH
-  "Directory containing 32-bit (i386) Mac build to include in universal binaries (default: ${DEFAULT_OSX_32BIT_BUILD})")
-set(DEFAULT_IOS_ARMV7_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7)
-set(IOS_ARMV7_BUILD ${DEFAULT_IOS_ARMV7_BUILD} CACHE PATH
-  "Directory containing Armv7 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7_BUILD})")
-set(DEFAULT_IOS_ARMV7S_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7s)
-set(IOS_ARMV7S_BUILD ${DEFAULT_IOS_ARMV7S_BUILD} CACHE PATH
-  "Directory containing Armv7s iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7S_BUILD})")
-set(DEFAULT_IOS_ARMV8_BUILD ${CMAKE_SOURCE_DIR}/iosarmv8)
-set(IOS_ARMV8_BUILD ${DEFAULT_IOS_ARMV8_BUILD} CACHE PATH
-  "Directory containing Armv8 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV8_BUILD})")
+set(IOS_ARMV8_BUILD "" CACHE PATH
+  "Directory containing Armv8 iOS build to include in universal binaries")
 
 set(OSX_APP_CERT_NAME "" CACHE STRING
   "Name of the Developer ID Application certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo DMG.  Leave this blank to generate an unsigned DMG.")
@@ -159,14 +144,12 @@
 
 configure_file(release/makemacpkg.in pkgscripts/makemacpkg)
 configure_file(release/Distribution.xml.in pkgscripts/Distribution.xml)
+configure_file(release/Welcome.rtf.in pkgscripts/Welcome.rtf)
 configure_file(release/uninstall.in pkgscripts/uninstall)
 
 add_custom_target(dmg pkgscripts/makemacpkg
   SOURCES pkgscripts/makemacpkg)
 
-add_custom_target(udmg pkgscripts/makemacpkg universal
-  SOURCES pkgscripts/makemacpkg)
-
 endif() # APPLE
 
 
@@ -187,3 +170,12 @@
 configure_file(release/libjpeg.pc.in pkgscripts/libjpeg.pc @ONLY)
 
 configure_file(release/libturbojpeg.pc.in pkgscripts/libturbojpeg.pc @ONLY)
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  pkgscripts/${CMAKE_PROJECT_NAME}ConfigVersion.cmake
+  VERSION ${VERSION} COMPATIBILITY AnyNewerVersion)
+
+configure_package_config_file(release/Config.cmake.in
+  pkgscripts/${CMAKE_PROJECT_NAME}Config.cmake
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
diff --git a/cmakescripts/GNUInstallDirs.cmake b/cmakescripts/GNUInstallDirs.cmake
index 7c41196..6408fca 100644
--- a/cmakescripts/GNUInstallDirs.cmake
+++ b/cmakescripts/GNUInstallDirs.cmake
@@ -118,6 +118,7 @@
 #   absolute paths where necessary, using the same logic.
 
 #=============================================================================
+# Copyright 2018 Matthias Räncker
 # Copyright 2016, 2019 D. R. Commander
 # Copyright 2016 Dmitry Marakasov
 # Copyright 2016 Roger Leigh
@@ -259,6 +260,8 @@
       else()
         if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
           set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+        elseif(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+          set(CMAKE_INSTALL_DEFAULT_LIBDIR "libx32")
         endif()
       endif()
     endif()
diff --git a/djpeg.1 b/djpeg.1
index 80cf860..31431b9 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -15,8 +15,7 @@
 .B djpeg
 decompresses the named JPEG file, or the standard input if no file is named,
 and produces an image file on the standard output.  PBMPLUS (PPM/PGM), BMP,
-GIF, Targa, or RLE (Utah Raster Toolkit) output format can be selected.
-(RLE is supported only if the URT library is available.)
+GIF, or Targa output format can be selected.
 .SH OPTIONS
 All switch names may be abbreviated; for example,
 .B \-grayscale
@@ -81,9 +80,20 @@
 format is emitted.
 .TP
 .B \-gif
-Select GIF output format.  Since GIF does not support more than 256 colors,
+Select GIF output format (LZW-compressed).  Since GIF does not support more
+than 256 colors,
 .B \-colors 256
-is assumed (unless you specify a smaller number of colors).
+is assumed (unless you specify a smaller number of colors).  If you specify
+.BR \-fast,
+the default number of colors is 216.
+.TP
+.B \-gif0
+Select GIF output format (uncompressed).  Since GIF does not support more than
+256 colors,
+.B \-colors 256
+is assumed (unless you specify a smaller number of colors).  If you specify
+.BR \-fast,
+the default number of colors is 216.
 .TP
 .B \-os2
 Select BMP output format (OS/2 1.x flavor).  8-bit colormapped format is
@@ -100,9 +110,6 @@
 .B \-grayscale
 is specified; otherwise PPM is emitted.
 .TP
-.B \-rle
-Select RLE output format.  (Requires URT library.)
-.TP
 .B \-targa
 Select Targa output format.  Grayscale format is emitted if the JPEG file is
 grayscale or if
@@ -198,6 +205,19 @@
 .B \-max 4m
 selects 4000000 bytes.  If more space is needed, an error will occur.
 .TP
+.BI \-maxscans " N"
+Abort if the JPEG image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes.)  Attempting to decompress such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
 .BI \-outfile " name"
 Send output image to the named file, not to standard output.
 .TP
@@ -205,6 +225,9 @@
 Load input file into memory before decompressing.  This feature was implemented
 mainly as a way of testing the in-memory source manager (jpeg_mem_src().)
 .TP
+.BI \-report
+Report decompression progress.
+.TP
 .BI \-skip " Y0,Y1"
 Decompress all rows of the JPEG image except those between Y0 and Y1
 (inclusive.)  Note that if decompression scaling is being used, then Y0 and Y1
@@ -218,6 +241,12 @@
 scaled image dimensions.  Currently this option only works with the
 PBMPLUS (PPM/PGM), GIF, and Targa output formats.
 .TP
+.BI \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the JPEG image contains incomplete or corrupt image data.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
@@ -289,10 +318,3 @@
 This file was modified by The libjpeg-turbo Project to include only information
 relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
 features not present in libjpeg.
-.SH ISSUES
-Support for compressed GIF output files was removed in djpeg v6b due to
-concerns over the Unisys LZW patent.  Although this patent expired in 2006,
-djpeg still lacks compressed GIF support, for these historical reasons.
-(Conversion of JPEG files to GIF is usually a bad idea anyway, since GIF is a
-256-color format.)  The uncompressed GIF files that djpeg generates are larger
-than they should be, but they are readable by standard GIF decoders.
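
The -maxscans guard documented above is built on the standard libjpeg
progress-monitor hook, so any application can implement the same defense.
A minimal sketch, assuming an already-created decompressor; scan_limiter and
MAX_SCANS are illustrative names, and djpeg's real implementation is wired
through its cdjpeg_progress_mgr (see the djpeg.c hunks below):

    #include <stdio.h>
    #include <stdlib.h>
    #include "jpeglib.h"

    #define MAX_SCANS  100                  /* hypothetical limit */

    static struct jpeg_progress_mgr progress;

    static void scan_limiter(j_common_ptr cinfo)
    {
      if (cinfo->is_decompressor &&
          ((j_decompress_ptr)cinfo)->input_scan_number > MAX_SCANS) {
        fprintf(stderr, "scan limit exceeded\n");
        exit(EXIT_FAILURE);
      }
    }

    /* after jpeg_create_decompress(&cinfo): */
    progress.progress_monitor = scan_limiter;
    cinfo.progress = &progress;

The callback fires repeatedly while scans are consumed, so a malformed file
containing thousands of scans is rejected long before the decompressor has
burned significant CPU time on it.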
diff --git a/djpeg.c b/djpeg.c
index 693aadb..d47984e 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -3,9 +3,9 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2013 by Guido Vollbeding.
+ * Modified 2013-2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2017, 2020, D. R. Commander.
+ * Copyright (C) 2010-2011, 2013-2017, 2019-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -68,10 +68,10 @@
 
 typedef enum {
   FMT_BMP,                      /* BMP format (Windows flavor) */
-  FMT_GIF,                      /* GIF format */
+  FMT_GIF,                      /* GIF format (LZW-compressed) */
+  FMT_GIF0,                     /* GIF format (uncompressed) */
   FMT_OS2,                      /* BMP format (OS/2 flavor) */
   FMT_PPM,                      /* PPM/PGM (PBMPLUS formats) */
-  FMT_RLE,                      /* RLE format */
   FMT_TARGA,                    /* Targa format */
   FMT_TIFF                      /* TIFF format */
 } IMAGE_FORMATS;
@@ -94,11 +94,14 @@
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memsrc;                 /* for -memsrc switch */
+boolean report;                 /* for -report switch */
 boolean skip, crop;
 JDIMENSION skip_start, skip_end;
 JDIMENSION crop_x, crop_y, crop_width, crop_height;
+boolean strict;                 /* for -strict switch */
 #define INPUT_BUF_SIZE  4096
 
 
@@ -127,8 +130,10 @@
           (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
 #endif
 #ifdef GIF_SUPPORTED
-  fprintf(stderr, "  -gif           Select GIF output format%s\n",
+  fprintf(stderr, "  -gif           Select GIF output format (LZW-compressed)%s\n",
           (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+  fprintf(stderr, "  -gif0          Select GIF output format (uncompressed)%s\n",
+          (DEFAULT_FMT == FMT_GIF0 ? " (default)" : ""));
 #endif
 #ifdef BMP_SUPPORTED
   fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
@@ -138,10 +143,6 @@
   fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
           (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
 #endif
-#ifdef RLE_SUPPORTED
-  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-          (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
-#endif
 #ifdef TARGA_SUPPORTED
   fprintf(stderr, "  -targa         Select Targa output format%s\n",
           (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
@@ -171,14 +172,16 @@
   fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
 #endif
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
 #endif
-
+  fprintf(stderr, "  -report        Report decompression progress\n");
   fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
   fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
   fprintf(stderr, "                 [requires PBMPLUS (PPM/PGM), GIF, or Targa output format]\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
@@ -203,10 +206,13 @@
   /* Set up default JPEG parameters. */
   requested_fmt = DEFAULT_FMT;  /* set default output file format */
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
   memsrc = FALSE;
+  report = FALSE;
   skip = FALSE;
   crop = FALSE;
+  strict = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -224,7 +230,7 @@
     arg++;                      /* advance past switch marker character */
 
     if (keymatch(arg, "bmp", 1)) {
-      /* BMP output format. */
+      /* BMP output format (Windows flavor). */
       requested_fmt = FMT_BMP;
 
     } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
@@ -295,9 +301,13 @@
       cinfo->do_fancy_upsampling = FALSE;
 
     } else if (keymatch(arg, "gif", 1)) {
-      /* GIF output format. */
+      /* GIF output format (LZW-compressed). */
       requested_fmt = FMT_GIF;
 
+    } else if (keymatch(arg, "gif0", 4)) {
+      /* GIF output format (uncompressed). */
+      requested_fmt = FMT_GIF0;
+
     } else if (keymatch(arg, "grayscale", 2) ||
                keymatch(arg, "greyscale", 2)) {
       /* Force monochrome output. */
@@ -351,6 +361,12 @@
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "nosmooth", 3)) {
       /* Suppress fancy upsampling */
       cinfo->do_fancy_upsampling = FALSE;
@@ -383,9 +399,8 @@
       /* PPM/PGM output format. */
       requested_fmt = FMT_PPM;
 
-    } else if (keymatch(arg, "rle", 1)) {
-      /* RLE output format. */
-      requested_fmt = FMT_RLE;
+    } else if (keymatch(arg, "report", 2)) {
+      report = TRUE;
 
     } else if (keymatch(arg, "scale", 2)) {
       /* Scale the output image by a fraction M/N. */
@@ -413,6 +428,9 @@
         usage();
       crop = TRUE;
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "targa", 1)) {
       /* Targa output format. */
       requested_fmt = FMT_TARGA;
@@ -444,7 +462,7 @@
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
   datasrc->bytes_in_buffer--;
-  return GETJOCTET(*datasrc->next_input_byte++);
+  return *datasrc->next_input_byte++;
 }
 
 
@@ -499,6 +517,19 @@
 }
 
 
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -508,9 +539,7 @@
 {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
-#endif
   int file_index;
   djpeg_dest_ptr dest_mgr = NULL;
   FILE *input_file;
@@ -557,6 +586,9 @@
 
   file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
 
+  if (strict)
+    jerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -603,9 +635,11 @@
     output_file = write_stdout();
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
-#endif
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+    progress.max_scans = max_scans;
+  }
 
   /* Specify data source for decompression */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
@@ -653,7 +687,10 @@
 #endif
 #ifdef GIF_SUPPORTED
   case FMT_GIF:
-    dest_mgr = jinit_write_gif(&cinfo);
+    dest_mgr = jinit_write_gif(&cinfo, TRUE);
+    break;
+  case FMT_GIF0:
+    dest_mgr = jinit_write_gif(&cinfo, FALSE);
     break;
 #endif
 #ifdef PPM_SUPPORTED
@@ -661,11 +698,6 @@
     dest_mgr = jinit_write_ppm(&cinfo);
     break;
 #endif
-#ifdef RLE_SUPPORTED
-  case FMT_RLE:
-    dest_mgr = jinit_write_rle(&cinfo);
-    break;
-#endif
 #ifdef TARGA_SUPPORTED
   case FMT_TARGA:
     dest_mgr = jinit_write_targa(&cinfo);
@@ -781,12 +813,11 @@
     }
   }
 
-#ifdef PROGRESS_REPORT
   /* Hack: count final pass as done in case finish_output does an extra pass.
    * The library won't have updated completed_passes.
    */
-  progress.pub.completed_passes = progress.pub.total_passes;
-#endif
+  if (report || max_scans != 0)
+    progress.pub.completed_passes = progress.pub.total_passes;
 
   if (icc_filename != NULL) {
     FILE *icc_file;
@@ -825,9 +856,8 @@
   if (output_file != stdout)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memsrc)
     free(inbuffer);
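
my_emit_message() treats any warning (msg_level < 0) as fatal by routing it
to the error manager's error_exit() method, which is exactly what -strict
wires up above.  An application enables the same behavior with the stock
error manager -- a minimal sketch (note that the default error_exit()
terminates the process; applications that need to recover override it with
the usual setjmp/longjmp arrangement):

    struct jpeg_decompress_struct cinfo;
    struct jpeg_error_mgr jerr;

    cinfo.err = jpeg_std_error(&jerr);
    jerr.emit_message = my_emit_message;  /* as defined in djpeg.c above */
    jpeg_create_decompress(&cinfo);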
diff --git a/java/README b/java/README
index 88ddc3b..5af1e31 100644
--- a/java/README
+++ b/java/README
@@ -38,7 +38,7 @@
 ----------------------
 
 The TurboJPEG Java Wrapper will look for the TurboJPEG JNI library
-(libturbojpeg.so, libturbojpeg.jnilib, or turbojpeg.dll) in the system library
+(libturbojpeg.so, libturbojpeg.dylib, or turbojpeg.dll) in the system library
 paths or in any paths specified in LD_LIBRARY_PATH (Un*x), DYLD_LIBRARY_PATH
 (Mac), or PATH (Windows.)  Failing this, on Un*x and Mac systems, the wrapper
 will look for the JNI library under the library directory configured when
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
index 65884e8..d8cc495 100644
--- a/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
+++ b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2013, 2016 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2013, 2016, 2020 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,9 +36,9 @@
       String os = System.getProperty("os.name").toLowerCase();
       if (os.indexOf("mac") >= 0) {
         try {
-          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.jnilib");
+          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.dylib");
         } catch (java.lang.UnsatisfiedLinkError e2) {
-          System.load("/usr/lib/libturbojpeg.jnilib");
+          System.load("/usr/lib/libturbojpeg.dylib");
         }
       } else {
         try {
diff --git a/jccolext.c b/jccolext.c
index 19c955c..303b322 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -48,9 +48,9 @@
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -100,9 +100,9 @@
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
@@ -135,9 +135,9 @@
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
-      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
-      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
     }
   }
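
The GETJSAMPLE() deletions here and throughout the files below are safe
because libjpeg-turbo now requires JSAMPLE to be an unsigned char.  The
macro only ever did real work on systems where plain char was signed;
roughly, the historical jmorecfg.h definition was:

    #ifdef HAVE_UNSIGNED_CHAR
    #define GETJSAMPLE(value)  ((int)(value))
    #else
    #ifdef __CHAR_UNSIGNED__
    #define GETJSAMPLE(value)  ((int)(value))
    #else
    #define GETJSAMPLE(value)  ((int)(value) & 0xFF)  /* undo sign extension */
    #endif
    #endif

With the signed-char case gone, the macro reduces to a cast that the usual
integer promotions already perform, so the calls can simply be dropped --
and the matching __CHAR_UNSIGNED__ machinery disappears from jconfig.h.in
and jconfig.txt further down.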
diff --git a/jccolor.c b/jccolor.c
index 036f601..bdc563c 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -392,11 +392,11 @@
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
-      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
-      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+      r = MAXJSAMPLE - inptr[0];
+      g = MAXJSAMPLE - inptr[1];
+      b = MAXJSAMPLE - inptr[2];
       /* K passes through as-is */
-      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -438,7 +438,7 @@
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];
       inptr += instride;
     }
   }
@@ -497,7 +497,7 @@
         inptr = *input_buf;
         outptr = output_buf[ci][output_row];
         for (col = 0; col < num_cols; col++) {
-          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          outptr[col] = inptr[ci];
           inptr += nc;
         }
       }
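
For reference, the YCCK conversion above operates on Adobe-style CMYK JPEG
input, which stores the chromatic channels inverted.  A self-contained
sketch of the per-pixel math (illustrative only -- the library uses its
precomputed rgb_ycc tables instead of multiplies, and cmyk_to_ycck is a
hypothetical name):

    /* Invert C,M,Y to R,G,B (Adobe convention), apply the standard BT.601
     * fixed-point RGB->YCbCr equations (16 fraction bits), pass K through.
     */
    static void cmyk_to_ycck(int c, int m, int y_in, int k,
                             int *y, int *cb, int *cr, int *k_out)
    {
      int r = 255 - c, g = 255 - m, b = 255 - y_in;

      *y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
      *cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
      *cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b) >> 16;
      *k_out = k;
    }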
diff --git a/jcdctmgr.c b/jcdctmgr.c
index c04058e..7dae17a 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -381,19 +381,19 @@
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
     }
 #endif
   }
@@ -533,20 +533,19 @@
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)
-                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
     }
 #endif
   }
diff --git a/jchuff.c b/jchuff.c
index db85ce1..2417cac 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2020, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -65,32 +66,42 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
+
+/* NOTE: The improved Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
+ */
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
+typedef unsigned long long simd_bit_buf_type;
+#else
+typedef bit_buf_type simd_bit_buf_type;
+#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+    (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE  64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE  32
+#else
+#error Cannot determine word size
+#endif
+#define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)
+
 typedef struct {
-  size_t put_buffer;                    /* current bit-accumulation buffer */
-  int put_bits;                         /* # of bits now in it */
+  union {
+    bit_buf_type c;
+    simd_bit_buf_type simd;
+  } put_buffer;                         /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+                                        /* (Neon GAS: # of bits now in it) */
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).put_buffer = (src).put_buffer, \
-   (dest).put_bits = (src).put_bits, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
@@ -123,6 +134,7 @@
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
 } working_state;
 
 
@@ -201,8 +213,17 @@
   }
 
   /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
+  if (entropy->simd) {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  }
 
   /* Initialize restart stuff */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -334,94 +355,94 @@
 
 /* Outputting bits to the file */
 
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code.  In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it.  This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
  */
-
-#define EMIT_BYTE() { \
-  JOCTET c; \
-  put_bits -= 8; \
-  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
-  *buffer++ = c; \
-  if (c == 0xFF)  /* need to stuff a zero byte? */ \
-    *buffer++ = 0; \
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
 }
 
-#define PUT_BITS(code, size) { \
-  put_bits += size; \
-  put_buffer = (put_buffer << size) | code; \
-}
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
 
-#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
-
-#define CHECKBUF15() { \
-  if (put_bits > 15) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 56); \
+    buffer[1] = (JOCTET)(put_buffer >> 48); \
+    buffer[2] = (JOCTET)(put_buffer >> 40); \
+    buffer[3] = (JOCTET)(put_buffer >> 32); \
+    buffer[4] = (JOCTET)(put_buffer >> 24); \
+    buffer[5] = (JOCTET)(put_buffer >> 16); \
+    buffer[6] = (JOCTET)(put_buffer >> 8); \
+    buffer[7] = (JOCTET)(put_buffer); \
+    buffer += 8; \
   } \
 }
 
-#endif
-
-#define CHECKBUF31() { \
-  if (put_bits > 31) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#define CHECKBUF47() { \
-  if (put_bits > 47) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
-#endif
-
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
-  CHECKBUF47() \
-  PUT_BITS(code, size) \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  CHECKBUF31() \
-  PUT_BITS(code, size) \
-  PUT_BITS(temp2, nbits) \
-}
-
 #else
 
-#define EMIT_BITS(code, size) { \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-  PUT_BITS(temp2, nbits) \
-  CHECKBUF15() \
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 24); \
+    buffer[1] = (JOCTET)(put_buffer >> 16); \
+    buffer[2] = (JOCTET)(put_buffer >> 8); \
+    buffer[3] = (JOCTET)(put_buffer); \
+    buffer += 4; \
+  } \
 }
 
 #endif
 
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) { \
+  temp &= (((JLONG)1) << nbits) - 1; \
+  temp |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(temp, nbits) \
+}
+
 
 /* Although it is exceedingly rare, it is possible for a Huffman-encoded
  * coefficient block to be larger than the 128-byte unencoded block.  For each
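
Two details in the hunk above are worth unpacking.  First, EMIT_BYTE()
advances the pointer branchlessly: (JOCTET)(b) < 0xFF evaluates to 1 for an
ordinary byte and 0 for 0xFF, so buffer -= -2 + (...) adds 1 or 2
respectively, keeping or consuming the speculative stuffed zero.  Second,
FLUSH() picks its path with a SWAR test that is nonzero if and only if some
byte of the buffer equals 0xFF.  A standalone demonstration (a sketch;
needs_stuffing is an illustrative name):

    #include <stdint.h>
    #include <stdio.h>

    /* A byte is flagged when its high bit is set AND adding the per-byte 1
     * clears that high bit, which for bytes >= 0x80 only happens on the
     * 0xFF -> 0x00 wraparound.  A carry out of a lower 0xFF byte can flag
     * the byte above it too, but only when a real 0xFF exists, so the
     * stuffing path is taken exactly when it might be needed.
     */
    static int needs_stuffing(uint64_t v)
    {
      return (v & 0x8080808080808080ULL & ~(v + 0x0101010101010101ULL)) != 0;
    }

    int main(void)
    {
      printf("%d\n", needs_stuffing(0x1122FF4455667788ULL));  /* prints 1 */
      printf("%d\n", needs_stuffing(0x1122334455667788ULL));  /* prints 0 */
      return 0;
    }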
@@ -444,6 +465,7 @@
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
+    size_t bytes, bytestocopy; \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
@@ -466,20 +488,46 @@
 LOCAL(boolean)
 flush_bits(working_state *state)
 {
-  JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  JOCTET _buffer[BUFSIZE], *buffer, temp;
+  simd_bit_buf_type put_buffer;  int put_bits;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    put_bits = state->cur.free_bits;
+#else
+    put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+    put_buffer = state->cur.put_buffer.simd;
+  } else {
+    put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+    put_buffer = state->cur.put_buffer.c;
+  }
+
   LOAD_BUFFER()
 
-  /* fill any partial byte with ones */
-  PUT_BITS(0x7F, 7)
-  while (put_bits >= 8) EMIT_BYTE()
+  while (put_bits >= 8) {
+    put_bits -= 8;
+    temp = (JOCTET)(put_buffer >> put_bits);
+    EMIT_BYTE(temp)
+  }
+  if (put_bits) {
+    /* fill partial byte with ones */
+    temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+    EMIT_BYTE(temp)
+  }
 
-  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
-  state->cur.put_bits = 0;
+  if (state->simd) {                    /* and reset bit buffer to empty */
+    state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    state->cur.free_bits = 0;
+#else
+    state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    state->cur.put_buffer.c = 0;
+    state->cur.free_bits = BIT_BUF_SIZE;
+  }
   STORE_BUFFER()
 
   return TRUE;
@@ -493,7 +541,7 @@
                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
   LOAD_BUFFER()
 
@@ -509,53 +557,41 @@
 encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2, temp3;
-  int nbits;
-  int r, code, size;
+  int temp, nbits, free_bits;
+  bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  free_bits = state->cur.free_bits;
+  put_buffer = state->cur.put_buffer.c;
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
+  temp = block[0] - last_dc_val;
 
   /* This is a well-known technique for obtaining the absolute value without a
    * branch.  It is derived from an assembly language technique presented in
    * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-   * Agner Fog.
+   * Agner Fog.  This code assumes we are on a two's complement machine.
    */
-  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-  temp ^= temp3;
-  temp -= temp3;
-
-  /* For a negative input, want temp2 = bitwise complement of abs(input) */
-  /* This code assumes we are on a two's complement machine */
-  temp2 += temp3;
+  nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+  temp += nbits;
+  nbits ^= temp;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = JPEG_NBITS(temp);
+  nbits = JPEG_NBITS(nbits);
 
-  /* Emit the Huffman-coded symbol for the number of bits */
-  code = dctbl->ehufco[nbits];
-  size = dctbl->ehufsi[nbits];
-  EMIT_BITS(code, size)
-
-  /* Mask off any extra bits in code */
-  temp2 &= (((JLONG)1) << nbits) - 1;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  EMIT_BITS(temp2, nbits)
+  /* Emit the Huffman-coded symbol for the number of bits.
+   * Emit that number of bits of the value, if positive,
+   * or the complement of its magnitude, if negative.
+   */
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
-  r = 0;                        /* r = run length of zeros */
+  {
+    int r = 0;                  /* r = run length of zeros, scaled by 16 */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
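
The rewritten absolute-value sequence above folds the old temp/temp2/temp3
dance into three operations that leave the magnitude in nbits and, for
negative inputs, the one's complement of the magnitude in temp -- exactly
the bit pattern JPEG emits for the extra bits.  A worked trace (a sketch;
assumes two's complement and arithmetic right shift, as the code itself
notes):

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
      int temp = -5, nbits;

      nbits = temp >> (CHAR_BIT * sizeof(int) - 1);  /* -1 if neg, else 0 */
      temp += nbits;                                 /* -5 -> -6           */
      nbits ^= temp;                                 /* -1 ^ -6 = 5        */

      /* nbits == abs(-5); temp == ~5, whose low JPEG_NBITS(5) = 3 bits
       * (010) are the extra bits coded for -5.  PUT_CODE() masks temp
       * down to those nbits bits.
       */
      printf("magnitude=%d extra=0x%x\n", nbits, temp & 0x7);
      return 0;
    }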
@@ -563,51 +599,46 @@
  */
 #define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    r++; \
+    r += 16; \
   } else { \
-    temp2 = temp; \
     /* Branch-less absolute value, bitwise complement, etc., same as above */ \
-    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
-    temp ^= temp3; \
-    temp -= temp3; \
-    temp2 += temp3; \
-    nbits = JPEG_NBITS_NONZERO(temp); \
+    nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp += nbits; \
+    nbits ^= temp; \
+    nbits = JPEG_NBITS_NONZERO(nbits); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    while (r > 15) { \
-      EMIT_BITS(code_0xf0, size_0xf0) \
-      r -= 16; \
+    while (r >= 16 * 16) { \
+      r -= 16 * 16; \
+      PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits; \
-    code = actbl->ehufco[temp3]; \
-    size = actbl->ehufsi[temp3]; \
-    EMIT_CODE(code, size) \
+    r += nbits; \
+    PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
     r = 0; \
   } \
 }
 
-  /* One iteration for each value in jpeg_natural_order[] */
-  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  kloop(55);  kloop(62);  kloop(63);
+    /* One iteration for each value in jpeg_natural_order[] */
+    kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
+    kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
+    kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
+    kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
+    kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
+    kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
+    kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
+    kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
+    kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
+    kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
+    kloop(55);  kloop(62);  kloop(63);
 
-  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) {
-    code = actbl->ehufco[0];
-    size = actbl->ehufsi[0];
-    EMIT_BITS(code, size)
+    /* If the last coef(s) were zero, emit an end-of-block code */
+    if (r > 0) {
+      PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+    }
   }
 
-  state->cur.put_buffer = put_buffer;
-  state->cur.put_bits = put_bits;
+  state->cur.put_buffer.c = put_buffer;
+  state->cur.free_bits = free_bits;
   STORE_BUFFER()
 
   return TRUE;
@@ -654,8 +685,9 @@
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
@@ -694,7 +726,7 @@
   /* Completed MCU, so update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 
   /* Update restart-interval state too */
   if (cinfo->restart_interval) {
@@ -723,8 +755,9 @@
   /* Load up working state ... flush_bits needs it */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Flush out the last data */
   if (!flush_bits(&state))
@@ -733,7 +766,7 @@
   /* Update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 }
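
The encoder state also changes meaning across this file: instead of counting
bits accumulated (put_bits, counting up from 0), the new code counts bits
still free (free_bits, counting down from BIT_BUF_SIZE), so PUT_BITS() only
branches when the count goes negative.  An illustrative trace with a
hypothetical 8-bit buffer (the real buffer is 32 or 64 bits):

    /* free_bits = 8, put_buffer = 00000000
     * PUT_BITS(code=101, size=3)   free_bits = 5, put_buffer = .....101
     * PUT_BITS(code=1111, size=4)  free_bits = 1, put_buffer = .1011111
     * PUT_BITS(code=01, size=2)    free_bits = -1 < 0, so PUT_AND_FLUSH():
     *   shift = size + free_bits = 1, so put_buffer becomes
     *   (.1011111 << 1) | (01 >> -free_bits) = 10111110; FLUSH() emits
     *   that byte; free_bits += 8 -> 7; put_buffer = 01, whose low bit
     *   (the tail of the code) is the one bit now pending.
     */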
 
 
diff --git a/jconfig.h.in b/jconfig.h.in
index 18a69a4..d4284d9 100644
--- a/jconfig.h.in
+++ b/jconfig.h.in
@@ -61,11 +61,6 @@
    unsigned. */
 #cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-  #cmakedefine __CHAR_UNSIGNED__ 1
-#endif
-
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
 
diff --git a/jconfig.txt b/jconfig.txt
index 90cd724..21f35c1 100644
--- a/jconfig.txt
+++ b/jconfig.txt
@@ -42,12 +42,6 @@
  */
 /* #define const */
 
-/* Define this if an ordinary "char" type is unsigned.
- * If you're not sure, leaving it undefined will work at some cost in speed.
- * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
- */
-#undef __CHAR_UNSIGNED__
-
 /* Define this if your system has an ANSI-conforming <stddef.h> file.
  */
 #define HAVE_STDDEF_H
@@ -118,7 +112,6 @@
 #define BMP_SUPPORTED           /* BMP image file format */
 #define GIF_SUPPORTED           /* GIF image file format */
 #define PPM_SUPPORTED           /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED            /* Utah RLE image file format */
 #define TARGA_SUPPORTED         /* Targa image file format */
 
 /* Define this if you want to name both input and output files on the command
diff --git a/jcsample.c b/jcsample.c
index bd27b84..e8515eb 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -103,7 +103,7 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
@@ -174,7 +174,7 @@
       for (v = 0; v < v_expand; v++) {
         inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG)GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)(*inptr++);
         }
       }
       *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
@@ -237,8 +237,7 @@
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + bias) >> 1);
+      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -277,8 +276,7 @@
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                   GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + bias) >> 2);
+        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2;  inptr1 += 2;
     }
@@ -337,33 +335,25 @@
     below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+    neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
-      membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+      membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
       /* sum of edge-neighbor pixels */
-      neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+      neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+                 inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
-      neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+      neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
@@ -372,15 +362,11 @@
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+    neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
@@ -429,21 +415,18 @@
     below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
-    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-             GETJSAMPLE(*inptr);
-    membersum = GETJSAMPLE(*inptr++);
-    nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                 GETJSAMPLE(*inptr);
+    colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+    membersum = *inptr++;
+    nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
-      membersum = GETJSAMPLE(*inptr++);
+      membersum = *inptr++;
       above_ptr++;  below_ptr++;
-      nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                   GETJSAMPLE(*inptr);
+      nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
       *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
@@ -451,7 +434,7 @@
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr);
+    membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
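
All of the downsamplers above share the alternating-bias idiom: the
truncating right shift would otherwise round every output sample in the same
direction, so the bias toggles 0,1,0,1,... (or 1,2,1,2,... in the 4-pixel
case) to spread the rounding error.  A standalone equivalent of the h2v1
inner loop (a sketch; names are illustrative):

    #include <stddef.h>

    static void downsample_row_h2v1(const unsigned char *in,
                                    unsigned char *out, size_t output_cols)
    {
      int bias = 0;                       /* 0,1,0,1,... */
      for (size_t col = 0; col < output_cols; col++) {
        out[col] = (unsigned char)((in[0] + in[1] + bias) >> 1);
        bias ^= 1;
        in += 2;                          /* two input samples per output */
      }
    }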
diff --git a/jdapistd.c b/jdapistd.c
index 38bd111..bd68bdc 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2018, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -532,6 +532,8 @@
          * decoded coefficients.  This is ~5% faster for large subsets, but
          * it's tough to tell a difference for smaller images.
          */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
         (*cinfo->entropy->decode_mcu) (cinfo, NULL);
       }
     }
diff --git a/jdarith.c b/jdarith.c
index 6002481..cbbde24 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2018, D. R. Commander.
+ * Copyright (C) 2015-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -80,7 +80,7 @@
     if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
-  return GETJOCTET(*src->next_input_byte++);
+  return *src->next_input_byte++;
 }
 
 
@@ -665,12 +665,16 @@
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
       int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+      int *prev_coef_bit_ptr =
+        &cinfo->coef_bits[cindex + cinfo->num_components][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
         int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
         if (cinfo->Ah != expected)
           WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+        if (cinfo->input_scan_number > 1)
+          prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
         coef_bit_ptr[coefi] = cinfo->Al;
       }
     }
@@ -727,6 +731,7 @@
   entropy->c = 0;
   entropy->a = 0;
   entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
+  entropy->pub.insufficient_data = FALSE;
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -763,7 +768,7 @@
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components * DCTSIZE2 *
+                                  cinfo->num_components * 2 * DCTSIZE2 *
                                   sizeof(int));
     coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/jdcoefct.c b/jdcoefct.c
index 2ba6aa1..ea2febd 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -102,6 +102,8 @@
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       jzero_far((void *)coef->MCU_buffer[0],
                 (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
@@ -227,6 +229,8 @@
           }
         }
       }
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       /* Try to fetch the MCU. */
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
@@ -356,8 +360,8 @@
   int ci, coefi;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtable;
-  int *coef_bits;
-  int *coef_bits_latch;
+  int *coef_bits, *prev_coef_bits;
+  int *coef_bits_latch, *prev_coef_bits_latch;
 
   if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -366,9 +370,11 @@
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components *
+                                  cinfo->num_components * 2 *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
+  prev_coef_bits_latch =
+    &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -385,15 +391,19 @@
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
+    prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
     if (coef_bits[0] < 0)
       return FALSE;
     /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
     for (coefi = 1; coefi <= 5; coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
         smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
+    prev_coef_bits_latch += SAVED_COEFS;
   }
 
   return smoothing_useful;
@@ -478,8 +488,15 @@
          (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
       first_row = TRUE;
     }
-    /* Fetch component-dependent info */
-    coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+    /* Fetch component-dependent info.
+     * If the current scan is incomplete, then we use the component-dependent
+     * info from the previous scan.
+     */
+    if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+      coef_bits =
+        coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+    else
+      coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
     quanttbl = compptr->quant_table;
     Q00 = quanttbl->quantval[0];
     Q01 = quanttbl->quantval[Q01_POS];
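
The doubled allocations in jdarith.c above (and in this file's
coef_bits_latch) give the decoder two banks of per-component coefficient bit
positions:

    /* Layout implied by the indexing above:
     *   cinfo->coef_bits[0 .. num_components-1]              current scan's Al
     *   cinfo->coef_bits[num_components .. 2*num_components-1]  previous scan's
     */
    int *cur  = cinfo->coef_bits[ci];
    int *prev = cinfo->coef_bits[ci + cinfo->num_components];

The entropy decoder copies each component's row into the second bank before
overwriting it, and master->last_good_iMCU_row records how far the current
scan actually decoded.  Block smoothing can then fall back to the previous
scan's complete values for iMCU rows past that point instead of smoothing
against parameters the truncated scan never delivered.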
diff --git a/jdcol565.c b/jdcol565.c
index 40068ef..53c7bd9 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -45,9 +45,9 @@
     outptr = *output_buf++;
 
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -58,18 +58,18 @@
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -80,9 +80,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -125,9 +125,9 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -139,9 +139,9 @@
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,9 +165,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -202,32 +202,32 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
       WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = GETJSAMPLE(*inptr0);
-      g = GETJSAMPLE(*inptr1);
-      b = GETJSAMPLE(*inptr2);
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
@@ -259,24 +259,24 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
@@ -284,9 +284,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
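
The DITHER_565_R/G/B macros used throughout the hunks above implement ordered dithering for the 8-bit-to-RGB565 truncation: a small position-dependent bias is added before the low bits are dropped, and DITHER_ROTATE cycles the packed dither matrix between pixels. A minimal standalone sketch of the idea (the bias handling here is illustrative, not the actual packed matrix):

    /* Add a small bias (0..7) before truncating an 8-bit sample to 5
       bits, so the quantization error varies with position instead of
       banding.  The clamp models what range_limit[] does above. */
    static int dither_to_5_bits(int sample8, int bias)
    {
      int v = sample8 + bias;
      if (v > 255) v = 255;
      return v >> 3;  /* keep the top 5 bits */
    }
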
diff --git a/jdcolext.c b/jdcolext.c
index 72a5301..863c7a2 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -53,9 +53,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
@@ -93,7 +93,6 @@
     inptr = input_buf[0][input_row++];
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -128,7 +127,6 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
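
The range_limit table referenced in these loops exists because IDCT rounding noise can push y + Crrtab[cr] slightly outside 0..MAXJSAMPLE, and indexing a precomputed clamp table avoids per-pixel branches. Functionally it behaves like this sketch (the real table also tolerates the larger out-of-range indices that corrupt data can produce):

    /* Branching equivalent of a range_limit[] lookup for 8-bit samples */
    static unsigned char range_limit_sketch(int x)
    {
      return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }
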
diff --git a/jdcolor.c b/jdcolor.c
index d3ae40c..8da2b4e 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -341,9 +341,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
+      r = inptr0[col];
+      g = inptr1[col];
+      b = inptr2[col];
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
                                ctab[b + B_Y_OFF]) >> SCALEBITS);
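
The ctab lookups above compute the BT.601 luma sum Y = 0.299 R + 0.587 G + 0.114 B entirely from precomputed per-channel tables in SCALEBITS fixed point. A standalone sketch of the same arithmetic without the tables; the constants match those used elsewhere in libjpeg, but treat the exact rounding here as an assumption:

    #define SCALEBITS  16
    #define ONE_HALF   ((long)1 << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    static unsigned char rgb_to_gray(int r, int g, int b)
    {
      return (unsigned char)((FIX(0.29900) * r + FIX(0.58700) * g +
                              FIX(0.11400) * b + ONE_HALF) >> SCALEBITS);
    }
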
@@ -550,9 +550,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
@@ -560,7 +560,7 @@
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
diff --git a/jdhuff.c b/jdhuff.c
index a112817..b5665d5 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -39,24 +40,6 @@
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -325,7 +308,7 @@
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
-      c = GETJOCTET(*next_input_byte++);
+      c = *next_input_byte++;
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
@@ -342,7 +325,7 @@
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
           }
           bytes_in_buffer--;
-          c = GETJOCTET(*next_input_byte++);
+          c = *next_input_byte++;
         } while (c == 0xFF);
 
         if (c == 0) {
@@ -405,8 +388,8 @@
 
 #define GET_BYTE { \
   register int c0, c1; \
-  c0 = GETJOCTET(*buffer++); \
-  c1 = GETJOCTET(*buffer); \
+  c0 = *buffer++; \
+  c1 = *buffer; \
   /* Pre-execute most common case */ \
   get_buffer = (get_buffer << 8) | c0; \
   bits_left += 8; \
@@ -423,7 +406,7 @@
   } \
 }
 
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -568,7 +551,7 @@
 
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -653,7 +636,7 @@
 
   /* Completed MCU, so update state */
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
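
The new "state = entropy->saved" lines rely on plain structure assignment, which C89 and every later standard guarantee, including for array members; that is why the ASSIGN_STATE fallback for pre-ANSI compilers could be deleted. A sketch:

    typedef struct {
      int last_dc_val[4];
    } savable_state_sketch;

    static void save_state(savable_state_sketch *dst,
                           const savable_state_sketch *src)
    {
      *dst = *src;  /* one assignment copies the whole array member */
    }
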
 
@@ -671,7 +654,7 @@
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
   buffer = (JOCTET *)br_state.next_input_byte;
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -740,7 +723,7 @@
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
diff --git a/jdhuff.h b/jdhuff.h
index 6a8d90f..ac6e0e5 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -78,6 +79,11 @@
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                 /* size of buffer in bits */
+
 #else
 
 typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
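
The new #elif exists because the x32 ABI pairs 64-bit x86 registers with 32-bit pointers and a 32-bit long, so SIZEOF_SIZE_T is 4 (hence the extra clause in jdhuff.c's #if above) and unsigned long cannot provide the 64-bit holding register that is fastest on this hardware. The selection logic in isolation:

    #if defined(__x86_64__) && defined(__ILP32__)
    /* x32: long is only 32 bits, but the CPU has 64-bit registers */
    typedef unsigned long long bit_buf_sketch;
    #else
    typedef unsigned long bit_buf_sketch;  /* 64 bits on LP64 platforms */
    #endif
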
diff --git a/jdicc.c b/jdicc.c
index 7224695..a1a5b86 100644
--- a/jdicc.c
+++ b/jdicc.c
@@ -38,18 +38,18 @@
     marker->marker == ICC_MARKER &&
     marker->data_length >= ICC_OVERHEAD_LEN &&
     /* verify the identifying string */
-    GETJOCTET(marker->data[0]) == 0x49 &&
-    GETJOCTET(marker->data[1]) == 0x43 &&
-    GETJOCTET(marker->data[2]) == 0x43 &&
-    GETJOCTET(marker->data[3]) == 0x5F &&
-    GETJOCTET(marker->data[4]) == 0x50 &&
-    GETJOCTET(marker->data[5]) == 0x52 &&
-    GETJOCTET(marker->data[6]) == 0x4F &&
-    GETJOCTET(marker->data[7]) == 0x46 &&
-    GETJOCTET(marker->data[8]) == 0x49 &&
-    GETJOCTET(marker->data[9]) == 0x4C &&
-    GETJOCTET(marker->data[10]) == 0x45 &&
-    GETJOCTET(marker->data[11]) == 0x0;
+    marker->data[0] == 0x49 &&
+    marker->data[1] == 0x43 &&
+    marker->data[2] == 0x43 &&
+    marker->data[3] == 0x5F &&
+    marker->data[4] == 0x50 &&
+    marker->data[5] == 0x52 &&
+    marker->data[6] == 0x4F &&
+    marker->data[7] == 0x46 &&
+    marker->data[8] == 0x49 &&
+    marker->data[9] == 0x4C &&
+    marker->data[10] == 0x45 &&
+    marker->data[11] == 0x0;
 }
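
The twelve byte comparisons spell out the identifying string "ICC_PROFILE" plus its terminating NUL, as required at the start of each ICC APP2 marker. With GETJOCTET gone, the same test could be written with memcmp; a hedged equivalent (marker_data_is_icc is a hypothetical helper, not part of the library):

    #include <string.h>

    static int marker_data_is_icc(const unsigned char *data,
                                  size_t data_length)
    {
      /* "ICC_PROFILE" is 11 characters; the 12-element initializer
         pads the final element with the required NUL terminator. */
      static const unsigned char kICCSig[12] = "ICC_PROFILE";
      return data_length >= sizeof(kICCSig) &&
             memcmp(data, kICCSig, sizeof(kICCSig)) == 0;
    }
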
 
 
@@ -102,12 +102,12 @@
   for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
     if (marker_is_icc(marker)) {
       if (num_markers == 0)
-        num_markers = GETJOCTET(marker->data[13]);
-      else if (num_markers != GETJOCTET(marker->data[13])) {
+        num_markers = marker->data[13];
+      else if (num_markers != marker->data[13]) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
         return FALSE;
       }
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       if (seq_no <= 0 || seq_no > num_markers) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
         return FALSE;
@@ -154,7 +154,7 @@
       JOCTET FAR *src_ptr;
       JOCTET *dst_ptr;
       unsigned int length;
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       dst_ptr = icc_data + data_offset[seq_no];
       src_ptr = marker->data + ICC_OVERHEAD_LEN;
       length = data_length[seq_no];
diff --git a/jdmarker.c b/jdmarker.c
index c9c7ef6..b964c3a 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -151,7 +151,7 @@
 #define INPUT_BYTE(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = GETJOCTET(*next_input_byte++); )
+            V = *next_input_byte++; )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
@@ -159,10 +159,10 @@
 #define INPUT_2BYTES(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = ((unsigned int)GETJOCTET(*next_input_byte++)) << 8; \
+            V = ((unsigned int)(*next_input_byte++)) << 8; \
             MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V += GETJOCTET(*next_input_byte++); )
+            V += *next_input_byte++; )
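
JPEG marker parameters are big-endian, so INPUT_2BYTES shifts the first byte into the high position, checking buffer availability (and possibly suspending) before each fetch. A standalone sketch of the same read, with suspension modeled as an error return (read_be16 is an illustrative name):

    #include <stddef.h>

    /* Returns the 16-bit value, or -1 if the caller must refill and retry. */
    static long read_be16(const unsigned char **p, size_t *remaining)
    {
      unsigned int v;
      if (*remaining < 2)
        return -1;
      v = ((unsigned int)(*p)[0] << 8) | (*p)[1];
      *p += 2;
      *remaining -= 2;
      return (long)v;
    }
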
 
 
 /*
@@ -608,18 +608,18 @@
   JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x49 &&
-      GETJOCTET(data[3]) == 0x46 &&
-      GETJOCTET(data[4]) == 0) {
+      data[0] == 0x4A &&
+      data[1] == 0x46 &&
+      data[2] == 0x49 &&
+      data[3] == 0x46 &&
+      data[4] == 0) {
     /* Found JFIF APP0 marker: save info */
     cinfo->saw_JFIF_marker = TRUE;
-    cinfo->JFIF_major_version = GETJOCTET(data[5]);
-    cinfo->JFIF_minor_version = GETJOCTET(data[6]);
-    cinfo->density_unit = GETJOCTET(data[7]);
-    cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
-    cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+    cinfo->JFIF_major_version = data[5];
+    cinfo->JFIF_minor_version = data[6];
+    cinfo->density_unit = data[7];
+    cinfo->X_density = (data[8] << 8) + data[9];
+    cinfo->Y_density = (data[10] << 8) + data[11];
     /* Check version.
      * Major version must be 1, anything else signals an incompatible change.
      * (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,24 +634,22 @@
              cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
              cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
-    if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
-      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-               GETJOCTET(data[12]), GETJOCTET(data[13]));
+    if (data[12] | data[13])
+      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
     totallen -= APP0_DATA_LEN;
-    if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG)3))
+    if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-             GETJOCTET(data[0]) == 0x4A &&
-             GETJOCTET(data[1]) == 0x46 &&
-             GETJOCTET(data[2]) == 0x58 &&
-             GETJOCTET(data[3]) == 0x58 &&
-             GETJOCTET(data[4]) == 0) {
+             data[0] == 0x4A &&
+             data[1] == 0x46 &&
+             data[2] == 0x58 &&
+             data[3] == 0x58 &&
+             data[4] == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
-    switch (GETJOCTET(data[5])) {
+    switch (data[5]) {
     case 0x10:
       TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
@@ -662,8 +660,7 @@
       TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
-      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int)totallen);
+      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
       break;
     }
   } else {
@@ -684,16 +681,16 @@
   unsigned int version, flags0, flags1, transform;
 
   if (datalen >= APP14_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x41 &&
-      GETJOCTET(data[1]) == 0x64 &&
-      GETJOCTET(data[2]) == 0x6F &&
-      GETJOCTET(data[3]) == 0x62 &&
-      GETJOCTET(data[4]) == 0x65) {
+      data[0] == 0x41 &&
+      data[1] == 0x64 &&
+      data[2] == 0x6F &&
+      data[3] == 0x62 &&
+      data[4] == 0x65) {
     /* Found Adobe APP14 marker */
-    version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
-    flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
-    flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
-    transform = GETJOCTET(data[11]);
+    version = (data[5] << 8) + data[6];
+    flags0 = (data[7] << 8) + data[8];
+    flags1 = (data[9] << 8) + data[10];
+    transform = data[11];
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
     cinfo->Adobe_transform = (UINT8)transform;
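
For reference, the Adobe APP14 payload parsed above has the following layout (offsets into the marker data; multi-byte fields big-endian). The transform meanings are an assumption based on Adobe's published spec, not something this hunk states:

    /*  0..4   "Adobe"     signature
        5..6   version     (big-endian)
        7..8   flags0      (big-endian)
        9..10  flags1      (big-endian)
        11     transform                                   */
    enum adobe_transform_sketch {
      ADOBE_TRANSFORM_UNKNOWN = 0,  /* RGB or CMYK, per component count */
      ADOBE_TRANSFORM_YCBCR   = 1,
      ADOBE_TRANSFORM_YCCK    = 2
    };
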
diff --git a/jdmaster.c b/jdmaster.c
index b209064..cbc8774 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jdmaster.h"
-#include "jsimd.h"
 
 
 /*
@@ -70,17 +69,6 @@
       cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
-#ifdef WITH_SIMD
-  /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
-     isn't, then disabling merged upsampling is likely to be faster when
-     decompressing YCbCr JPEG images. */
-  if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
-      jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
-      (cinfo->out_color_space == JCS_RGB ||
-       (cinfo->out_color_space >= JCS_EXT_RGB &&
-        cinfo->out_color_space <= JCS_EXT_ARGB)))
-    return FALSE;
-#endif
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;                  /* by golly, it'll work... */
 #else
@@ -580,6 +568,7 @@
    */
   cinfo->master->first_iMCU_col = 0;
   cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+  cinfo->master->last_good_iMCU_row = 0;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
diff --git a/jdmrg565.c b/jdmrg565.c
index 53f1e16..980a4e2 100644
--- a/jdmrg565.c
+++ b/jdmrg565.c
@@ -43,20 +43,20 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -68,12 +68,12 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -115,21 +115,21 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -142,12 +142,12 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -189,20 +189,20 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -211,13 +211,13 @@
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -229,20 +229,20 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -287,21 +287,21 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -311,14 +311,14 @@
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     d1 = DITHER_ROTATE(d1);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -331,20 +331,20 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
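
All of these loops funnel into PACK_SHORT_565 and PACK_TWO_PIXELS, which keep the top 5/6/5 bits of R/G/B and combine two 16-bit pixels into one aligned 32-bit store. A sketch of the packing, assuming the non-byte-swapped layout (the real macros are selected per endianness at build time):

    static unsigned short pack_565_sketch(int r, int g, int b)
    {
      return (unsigned short)(((r & 0xF8) << 8) |  /* bits 15..11 */
                              ((g & 0xFC) << 3) |  /* bits 10..5  */
                              ((b & 0xF8) >> 3));  /* bits  4..0  */
    }
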
diff --git a/jdmrgext.c b/jdmrgext.c
index c9a44d8..9bf4f1a 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -46,13 +46,13 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -60,7 +60,7 @@
     outptr[RGB_ALPHA] = 0xFF;
 #endif
     outptr += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -71,12 +71,12 @@
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -120,13 +120,13 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -134,7 +134,7 @@
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -142,7 +142,7 @@
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -150,7 +150,7 @@
     outptr1[RGB_ALPHA] = 0xFF;
 #endif
     outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -161,19 +161,19 @@
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
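
The structure of these merged upsamplers is the point: one Cb/Cr pair yields cred/cgreen/cblue once, and those are applied to two (h2v1) or four (h2v2) Y samples, fusing upsampling with color conversion. The tables hold the usual fixed-point YCbCr terms; a self-contained sketch of two of them, with constants as in libjpeg's color converters and the rounding treated as an assumption:

    #define SCALEBITS  16
    #define ONE_HALF   ((long)1 << (SCALEBITS - 1))
    #define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

    /* Equivalents of Crrtab[cr] and Cbbtab[cb] for 8-bit samples */
    static int cr_to_red(int cr)
    {
      return (int)((FIX(1.40200) * (cr - 128) + ONE_HALF) >> SCALEBITS);
    }

    static int cb_to_blue(int cb)
    {
      return (int)((FIX(1.77200) * (cb - 128) + ONE_HALF) >> SCALEBITS);
    }
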
diff --git a/jdphuff.c b/jdphuff.c
index 9e82636..5ab99d6 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,25 +41,6 @@
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).EOBRUN = (src).EOBRUN, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -102,7 +83,7 @@
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
-  int *coef_bit_ptr;
+  int *coef_bit_ptr, *prev_coef_bit_ptr;
   jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
@@ -143,12 +124,15 @@
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
     coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+    prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
       coef_bit_ptr[coefi] = cinfo->Al;
     }
   }
@@ -323,7 +307,7 @@
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
+    state = entropy->saved;
 
     /* Outer loop handles each block in the MCU */
 
@@ -356,7 +340,7 @@
 
     /* Completed MCU, so update state */
     BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
+    entropy->saved = state;
   }
 
   /* Account for restart interval (no-op if not using restarts) */
@@ -676,7 +660,7 @@
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                cinfo->num_components * DCTSIZE2 *
+                                cinfo->num_components * 2 * DCTSIZE2 *
                                 sizeof(int));
   coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
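
The allocation is doubled because cinfo->coef_bits now holds two banks of DCTSIZE2 entries per component: rows 0..num_components-1 carry the current scan's Al values, while rows num_components..2*num_components-1 preserve the values from the previous scan (written in the hunk above when input_scan_number > 1). A sketch of the layout:

    #include <stdlib.h>

    #define DCTSIZE2 64

    /* Bank 0: current bit positions; bank 1: previous scan's positions. */
    static int (*alloc_coef_bits(int num_components))[DCTSIZE2]
    {
      return malloc((size_t)num_components * 2 * DCTSIZE2 * sizeof(int));
    }
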
diff --git a/jdsample.c b/jdsample.c
index 50a68b3..2d34710 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -8,7 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2019, Arm Limited.
+ * Copyright (C) 2019-2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -177,7 +177,7 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       for (h = h_expand; h > 0; h--) {
         *outptr++ = invalue;
       }
@@ -213,7 +213,7 @@
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -242,7 +242,7 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -283,20 +283,20 @@
     inptr = input_data[inrow];
     outptr = output_data[inrow];
     /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
+    invalue = *inptr++;
     *outptr++ = (JSAMPLE)invalue;
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      invalue = (*inptr++) * 3;
+      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
+    invalue = *inptr;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
     *outptr++ = (JSAMPLE)invalue;
   }
 }
@@ -338,7 +338,7 @@
       outptr = output_data[outrow++];
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
-        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        thiscolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
@@ -381,8 +381,8 @@
       outptr = output_data[outrow++];
 
       /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+      thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+      nextcolsum = (*inptr0++) * 3 + (*inptr1++);
       *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
       *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
       lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -390,7 +390,7 @@
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        nextcolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
         lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -477,7 +477,12 @@
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-      upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__)
+      if (jsimd_can_h1v2_fancy_upsample())
+        upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+      else
+#endif
+        upsample->methods[ci] = h1v2_fancy_upsample;
       upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
diff --git a/jerror.h b/jerror.h
index 933a369..4476df2 100644
--- a/jerror.h
+++ b/jerror.h
@@ -207,6 +207,10 @@
 #endif
 #endif
 JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
 
 #ifdef JMAKE_ENUM_LIST
 
@@ -252,6 +256,15 @@
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
    (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 #define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
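
ERREXIT6 follows the existing ERREXITn pattern: stuff the parameters into the error manager's fixed array of int slots, set the message code, then jump to error_exit. It exists for JERR_BAD_DROP_SAMPLING above, whose template consumes six values. A standalone sketch of the slot-filling step:

    struct msg_parm_sketch { int i[8]; };

    static void stash_six(struct msg_parm_sketch *mp, int p1, int p2,
                          int p3, int p4, int p5, int p6)
    {
      mp->i[0] = p1;  mp->i[1] = p2;  mp->i[2] = p3;
      mp->i[3] = p4;  mp->i[4] = p5;  mp->i[5] = p6;
    }
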
diff --git a/jidctint.c b/jidctint.c
index 50f385d..bb08748 100644
--- a/jidctint.c
+++ b/jidctint.c
@@ -3,7 +3,7 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -417,7 +417,7 @@
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
  *
  * Optimized algorithm with 12 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/14).
@@ -1258,7 +1258,7 @@
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
  *
  * Optimized algorithm with 24 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/22).
@@ -2398,7 +2398,7 @@
     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += 1 << (CONST_BITS - PASS1_BITS - 1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
     z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
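
The switch from 1 to ONE makes the fudge-factor shift happen in the wide DCT intermediate type rather than in int. With CONST_BITS = 13 and PASS1_BITS = 2 the int shift happens to be harmless, so this reads as a consistency and robustness fix: ONE keeps the expression correct if the shift count ever approaches the width of int, matching the file's other fudge factors. The hazard in isolation:

    #include <stdint.h>

    #define SHIFT 40  /* any count >= 31 makes the difference visible */

    int64_t ok = (int64_t)1 << SHIFT;  /* shift performed in 64 bits */
    /* int64_t bad = 1 << SHIFT;         undefined: 1 shifts as an int */
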
diff --git a/jmorecfg.h b/jmorecfg.h
index aa29f0f..fb3a9cf 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -43,25 +43,11 @@
 
 #if BITS_IN_JSAMPLE == 8
 /* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int)(value))
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int)(value))
-#else
-#define GETJSAMPLE(value)  ((int)(value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 #define MAXJSAMPLE      255
 #define CENTERJSAMPLE   128
 
@@ -97,22 +83,9 @@
  * managers, this is also the data type passed to fread/fwrite.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value)  (value)
-#else
-#define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
@@ -123,15 +96,7 @@
 
 /* UINT8 must hold at least the values 0..255. */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char UINT8;
-#else /* not __CHAR_UNSIGNED__ */
-typedef short UINT8;
-#endif /* __CHAR_UNSIGNED__ */
-#endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
 
diff --git a/jpegint.h b/jpegint.h
index ad36ca8..195fbcb 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -158,6 +158,9 @@
   JDIMENSION first_MCU_col[MAX_COMPONENTS];
   JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
+
+  /* Last iMCU row that was successfully decoded */
+  JDIMENSION last_good_iMCU_row;
 };
 
 /* Input control module */
diff --git a/jpegtran.1 b/jpegtran.1
index f330d14..da7a266 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "18 March 2017"
+.TH JPEGTRAN 1 "26 October 2020"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -180,6 +180,47 @@
 doesn't, then it is silently moved up and/or left to the nearest iMCU boundary
 (the lower right corner is unchanged.)
 .PP
+If W or H is larger than the width/height of the input image, then the output
+image is expanded in size, and the expanded region is filled in with zeros
+(neutral gray).  Attaching an 'f' character ("flatten") to the width number
+will cause each block in the expanded region to be filled in with the DC
+coefficient of the nearest block in the input image rather than grayed out.
+Attaching an 'r' character ("reflect") to the width number will cause the
+expanded region to be filled in with repeated reflections of the input image
+rather than grayed out.
+.PP
+A complementary lossless wipe option is provided to discard (gray out) data
+inside a given image region while losslessly preserving what is outside:
+.TP
+.B \-wipe WxH+X+Y
+Wipe (gray out) a rectangular region of width W and height H from the input
+image, starting at point X,Y.
+.PP
+Attaching an 'f' character ("flatten") to the width number will cause the
+region to be filled with the average of adjacent blocks rather than grayed out.
+If the wipe region and the region outside the wipe region, when adjusted to the
+nearest iMCU boundary, form two horizontally adjacent rectangles, then
+attaching an 'r' character ("reflect") to the width number will cause the wipe
+region to be filled with repeated reflections of the outside region rather than
+grayed out.
+.PP
+A lossless drop option is also provided, which allows another JPEG image to be
+inserted ("dropped") into the input image data at a given position, replacing
+the existing image data at that position:
+.TP
+.B \-drop +X+Y filename
+Drop (insert) another image at point X,Y.
+.PP
+Both the input image and the drop image must have the same subsampling level.
+It is best if they also have the same quantization (quality.)  Otherwise, the
+quantization of the output image will be adapted to accommodate the higher of
+the input image quality and the drop image quality.  The trim option can be
+used with the drop option to requantize the drop image to match the input
+image.  Note that a grayscale image can be dropped into a full-color image or
+vice versa, as long as the full-color image has no vertical subsampling.  If
+the input image is grayscale and the drop image is full-color, then the
+chrominance channels from the drop image will be discarded.
+.PP
 Other not-strictly-lossless transformation switches are:
 .TP
 .B \-grayscale
@@ -229,9 +270,31 @@
 .B \-max 4m
 selects 4000000 bytes.  If more space is needed, an error will occur.
 .TP
+.BI \-maxscans " N"
+Abort if the input image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes.)  Attempting to transform such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
 .BI \-outfile " name"
 Send output image to the named file, not to standard output.
 .TP
+.B \-report
+Report transformation progress.
+.TP
+.B \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the input image contains incomplete or corrupt image data.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
diff --git a/jpegtran.c b/jpegtran.c
index e1fe41a..244996d 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -2,9 +2,9 @@
  * jpegtran.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1995-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2014, 2017, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2014, 2017, 2019-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,7 +41,11 @@
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
+static char *dropfilename;      /* for -drop switch */
+boolean report;                 /* for -report switch */
+boolean strict;                 /* for -strict switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
@@ -70,8 +74,9 @@
   fprintf(stderr, "Switches for modifying the image:\n");
 #if TRANSFORMS_SUPPORTED
   fprintf(stderr, "  -crop WxH+X+Y  Crop to a rectangular region\n");
-  fprintf(stderr, "  -grayscale     Reduce to grayscale (omit color data)\n");
+  fprintf(stderr, "  -drop +X+Y filename          Drop (insert) another image\n");
   fprintf(stderr, "  -flip [horizontal|vertical]  Mirror image (left-right or top-bottom)\n");
+  fprintf(stderr, "  -grayscale     Reduce to grayscale (omit color data)\n");
   fprintf(stderr, "  -perfect       Fail if there are non-transformable edge blocks\n");
   fprintf(stderr, "  -rotate [90|180|270]         Rotate image (degrees clockwise)\n");
 #endif
@@ -79,6 +84,8 @@
   fprintf(stderr, "  -transpose     Transpose image\n");
   fprintf(stderr, "  -transverse    Transverse transpose image\n");
   fprintf(stderr, "  -trim          Drop non-transformable edge blocks\n");
+  fprintf(stderr, "                 with -drop: Requantize drop file to match source file\n");
+  fprintf(stderr, "  -wipe WxH+X+Y  Wipe (gray out) a rectangular region\n");
 #endif
   fprintf(stderr, "Switches for advanced users:\n");
 #ifdef C_ARITH_CODING_SUPPORTED
@@ -87,7 +94,10 @@
   fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -report        Report transformation progress\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -141,7 +151,10 @@
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
+  report = FALSE;
+  strict = FALSE;
   copyoption = JCOPYOPT_DEFAULT;
   transformoption.transform = JXFORM_NONE;
   transformoption.perfect = FALSE;
@@ -193,7 +206,8 @@
 #if TRANSFORMS_SUPPORTED
       if (++argn >= argc)       /* advance to next argument */
         usage();
-      if (!jtransform_parse_crop_spec(&transformoption, argv[argn])) {
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn])) {
         fprintf(stderr, "%s: bogus -crop argument '%s'\n",
                 progname, argv[argn]);
         exit(EXIT_FAILURE);
@@ -202,6 +216,26 @@
       select_transform(JXFORM_NONE);    /* force an error */
 #endif
 
+    } else if (keymatch(arg, "drop", 2)) {
+#if TRANSFORMS_SUPPORTED
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn]) ||
+          transformoption.crop_width_set != JCROP_UNSET ||
+          transformoption.crop_height_set != JCROP_UNSET) {
+        fprintf(stderr, "%s: bogus -drop argument '%s'\n",
+                progname, argv[argn]);
+        exit(EXIT_FAILURE);
+      }
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      dropfilename = argv[argn];
+      select_transform(JXFORM_DROP);
+#else
+      select_transform(JXFORM_NONE);    /* force an error */
+#endif
+
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
       /* Enable debug printouts. */
       /* On first -d, print version identification */
@@ -261,6 +295,12 @@
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
       /* Enable entropy parm optimization. */
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -293,6 +333,9 @@
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -338,6 +381,9 @@
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "transpose", 1)) {
       /* Transpose (across UL-to-LR axis). */
       select_transform(JXFORM_TRANSPOSE);
@@ -350,6 +396,21 @@
       /* Trim off any partial edge MCUs that the transform can't handle. */
       transformoption.trim = TRUE;
 
+    } else if (keymatch(arg, "wipe", 1)) {
+#if TRANSFORMS_SUPPORTED
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn])) {
+        fprintf(stderr, "%s: bogus -wipe argument '%s'\n",
+                progname, argv[argn]);
+        exit(EXIT_FAILURE);
+      }
+      select_transform(JXFORM_WIPE);
+#else
+      select_transform(JXFORM_NONE);    /* force an error */
+#endif
+
     } else {
       usage();                  /* bogus switch */
     }
@@ -375,6 +436,19 @@
 }
 
 
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -383,11 +457,14 @@
 main(int argc, char **argv)
 {
   struct jpeg_decompress_struct srcinfo;
+#if TRANSFORMS_SUPPORTED
+  struct jpeg_decompress_struct dropinfo;
+  struct jpeg_error_mgr jdroperr;
+  FILE *drop_file;
+#endif
   struct jpeg_compress_struct dstinfo;
   struct jpeg_error_mgr jsrcerr, jdsterr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
+  struct cdjpeg_progress_mgr src_progress, dst_progress;
   jvirt_barray_ptr *src_coef_arrays;
   jvirt_barray_ptr *dst_coef_arrays;
   int file_index;
@@ -420,13 +497,16 @@
    * values read here are mostly ignored; we will rescan the switches after
    * opening the input file.  Also note that most of the switches affect the
    * destination JPEG object, so we parse into that and then copy over what
-   * needs to affects the source too.
+   * needs to affect the source too.
    */
 
   file_index = parse_switches(&dstinfo, argc, argv, 0, FALSE);
   jsrcerr.trace_level = jdsterr.trace_level;
   srcinfo.mem->max_memory_to_use = dstinfo.mem->max_memory_to_use;
 
+  if (strict)
+    jsrcerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -492,8 +572,29 @@
       copyoption = JCOPYOPT_ALL_EXCEPT_ICC;
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&dstinfo, &progress);
+  if (report) {
+    start_progress_monitor((j_common_ptr)&dstinfo, &dst_progress);
+    dst_progress.report = report;
+  }
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&srcinfo, &src_progress);
+    src_progress.report = report;
+    src_progress.max_scans = max_scans;
+  }
+#if TRANSFORMS_SUPPORTED
+  /* Open the drop file. */
+  if (dropfilename != NULL) {
+    if ((drop_file = fopen(dropfilename, READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s for reading\n", progname,
+              dropfilename);
+      exit(EXIT_FAILURE);
+    }
+    dropinfo.err = jpeg_std_error(&jdroperr);
+    jpeg_create_decompress(&dropinfo);
+    jpeg_stdio_src(&dropinfo, drop_file);
+  } else {
+    drop_file = NULL;
+  }
 #endif
 
   /* Specify data source for decompression */
@@ -505,6 +606,17 @@
   /* Read file header */
   (void)jpeg_read_header(&srcinfo, TRUE);
 
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    (void)jpeg_read_header(&dropinfo, TRUE);
+    transformoption.crop_width = dropinfo.image_width;
+    transformoption.crop_width_set = JCROP_POS;
+    transformoption.crop_height = dropinfo.image_height;
+    transformoption.crop_height_set = JCROP_POS;
+    transformoption.drop_ptr = &dropinfo;
+  }
+#endif
+
   /* Any space needed by a transform option must be requested before
    * jpeg_read_coefficients so that memory allocation will be done right.
    */
@@ -520,6 +632,12 @@
   /* Read source file as DCT coefficients */
   src_coef_arrays = jpeg_read_coefficients(&srcinfo);
 
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    transformoption.drop_coef_arrays = jpeg_read_coefficients(&dropinfo);
+  }
+#endif
+
   /* Initialize destination compression parameters from source values */
   jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
 
@@ -580,20 +698,36 @@
   /* Finish compression and release memory */
   jpeg_finish_compress(&dstinfo);
   jpeg_destroy_compress(&dstinfo);
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    (void)jpeg_finish_decompress(&dropinfo);
+    jpeg_destroy_decompress(&dropinfo);
+  }
+#endif
   (void)jpeg_finish_decompress(&srcinfo);
   jpeg_destroy_decompress(&srcinfo);
 
   /* Close output file, if we opened it */
   if (fp != stdout)
     fclose(fp);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&dstinfo);
+#if TRANSFORMS_SUPPORTED
+  if (drop_file != NULL)
+    fclose(drop_file);
 #endif
 
+  if (report)
+    end_progress_monitor((j_common_ptr)&dstinfo);
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&srcinfo);
+
   free(icc_profile);
 
   /* All done. */
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL)
+    exit(jsrcerr.num_warnings + jdroperr.num_warnings +
+         jdsterr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
+#endif
   exit(jsrcerr.num_warnings + jdsterr.num_warnings ?
        EXIT_WARNING : EXIT_SUCCESS);
   return 0;                     /* suppress no-return-value warnings */
diff --git a/jquant1.c b/jquant1.c
index 40bbb28..73b83e1 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -479,7 +479,7 @@
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += colorindex[ci][*ptrin++];
       }
       *ptrout++ = (JSAMPLE)pixcode;
     }
@@ -506,9 +506,9 @@
     ptrin = input_buf[row];
     ptrout = output_buf[row];
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
+      pixcode  = colorindex0[*ptrin++];
+      pixcode += colorindex1[*ptrin++];
+      pixcode += colorindex2[*ptrin++];
       *ptrout++ = (JSAMPLE)pixcode;
     }
   }
@@ -552,7 +552,7 @@
          * required amount of padding.
          */
         *output_ptr +=
-          colorindex_ci[GETJSAMPLE(*input_ptr) + dither[col_index]];
+          colorindex_ci[*input_ptr + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -595,12 +595,9 @@
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  =
-        GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + dither0[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + dither1[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + dither2[col_index]]);
+      pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
+      pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+      pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
       *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
@@ -677,15 +674,15 @@
          * The maximum error is +- MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
-        cur += GETJSAMPLE(*input_ptr);
-        cur = GETJSAMPLE(range_limit[cur]);
+        cur += *input_ptr;
+        cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
-        pixcode = GETJSAMPLE(colorindex_ci[cur]);
+        pixcode = colorindex_ci[cur];
         *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
-        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        cur -= colormap_ci[pixcode];
         /* Compute error fractions to be propagated to adjacent pixels.
          * Add these into the running sums, and simultaneously shift the
          * next-line error sums left by 1 column.
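
Every hunk in jquant1.c (and in jquant2.c below) deletes a GETJSAMPLE() wrapper. The macro existed so that JSAMPLE could be a plain char on compilers lacking unsigned char; now that JSAMPLE is always unsigned char, promotion to int already yields 0..255 and the wrapper is a no-op. The legacy definition, approximately as it appeared in older jmorecfg.h (shown for illustration):

    #ifdef HAVE_UNSIGNED_CHAR
    #define GETJSAMPLE(value)  ((int)(value))
    #else
    #ifdef __CHAR_UNSIGNED__
    #define GETJSAMPLE(value)  ((int)(value))
    #else
    #define GETJSAMPLE(value)  ((int)(value) & 0xFF)
    #endif
    #endif

With the unsigned typedef guaranteed, colorindex[ci][*ptrin++] is exactly equivalent to the old colorindex[ci][GETJSAMPLE(*ptrin++)] and easier to read.
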
diff --git a/jquant2.c b/jquant2.c
index 6570613..44efb18 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -215,9 +215,9 @@
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = &histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                        [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                        [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[ptr[0] >> C0_SHIFT]
+                        [ptr[1] >> C1_SHIFT]
+                        [ptr[2] >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -665,7 +665,7 @@
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = GETJSAMPLE(cinfo->colormap[0][i]);
+    x = cinfo->colormap[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
       min_dist = tdist * tdist;
@@ -688,7 +688,7 @@
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[1][i]);
+    x = cinfo->colormap[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
       min_dist += tdist * tdist;
@@ -710,7 +710,7 @@
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[2][i]);
+    x = cinfo->colormap[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
       min_dist += tdist * tdist;
@@ -788,13 +788,13 @@
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
 
   for (i = 0; i < numcolors; i++) {
-    icolor = GETJSAMPLE(colorlist[i]);
+    icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
+    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
     dist0 = inc0 * inc0;
-    inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
+    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
     dist0 += inc1 * inc1;
-    inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
+    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
     dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
@@ -879,7 +879,7 @@
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
       cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell)(GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)((*cptr++) + 1);
       }
     }
   }
@@ -909,9 +909,9 @@
     outptr = output_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the cache */
-      c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
-      c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
-      c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
+      c0 = (*inptr++) >> C0_SHIFT;
+      c1 = (*inptr++) >> C1_SHIFT;
+      c2 = (*inptr++) >> C2_SHIFT;
       cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
@@ -996,12 +996,12 @@
        * The maximum error is +- MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
-      cur0 += GETJSAMPLE(inptr[0]);
-      cur1 += GETJSAMPLE(inptr[1]);
-      cur2 += GETJSAMPLE(inptr[2]);
-      cur0 = GETJSAMPLE(range_limit[cur0]);
-      cur1 = GETJSAMPLE(range_limit[cur1]);
-      cur2 = GETJSAMPLE(range_limit[cur2]);
+      cur0 += inptr[0];
+      cur1 += inptr[1];
+      cur2 += inptr[2];
+      cur0 = range_limit[cur0];
+      cur1 = range_limit[cur1];
+      cur2 = range_limit[cur2];
       /* Index into the cache with adjusted pixel value */
       cachep =
         &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
@@ -1015,9 +1015,9 @@
         register int pixcode = *cachep - 1;
         *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
-        cur0 -= GETJSAMPLE(colormap0[pixcode]);
-        cur1 -= GETJSAMPLE(colormap1[pixcode]);
-        cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        cur0 -= colormap0[pixcode];
+        cur1 -= colormap1[pixcode];
+        cur2 -= colormap2[pixcode];
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
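
The "Compute error fractions" comments in jquant1.c and jquant2.c both refer to Floyd-Steinberg dithering, which distributes each pixel's quantization error 7/16 to the right, 3/16 below-left, 5/16 below, and 1/16 below-right. A minimal single-channel sketch of that arithmetic, with an ad-hoc clamp standing in for the range_limit table; this is illustrative, not the library's implementation, which uses scaled integer running sums and a colormap search:

    static unsigned char clamp(int v)
    {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Floyd-Steinberg dither of a w x h grayscale image to "levels"
     * evenly spaced gray levels (C99). */
    void fs_dither(unsigned char *img, int w, int h, int levels)
    {
      int step = 255 / (levels - 1);
      for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
          int old = img[y * w + x];
          int q = ((old + step / 2) / step) * step;   /* quantize */
          int err = old - q;                          /* representation error */
          img[y * w + x] = (unsigned char)q;
          if (x + 1 < w)
            img[y*w + x+1] = clamp(img[y*w + x+1] + err * 7 / 16);
          if (y + 1 < h) {
            if (x > 0)
              img[(y+1)*w + x-1] = clamp(img[(y+1)*w + x-1] + err * 3 / 16);
            img[(y+1)*w + x] = clamp(img[(y+1)*w + x] + err * 5 / 16);
            if (x + 1 < w)
              img[(y+1)*w + x+1] = clamp(img[(y+1)*w + x+1] + err * 1 / 16);
          }
        }
    }
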
diff --git a/jsimd.h b/jsimd.h
index 51e2b8c..6c20365 100644
--- a/jsimd.h
+++ b/jsimd.h
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@
 
 EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
 EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
@@ -84,6 +86,10 @@
                                        jpeg_component_info *compptr,
                                        JSAMPARRAY input_data,
                                        JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
 EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
diff --git a/jsimd_none.c b/jsimd_none.c
index 3cb6c80..5b38a9f 100644
--- a/jsimd_none.c
+++ b/jsimd_none.c
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@
 {
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
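
jsimd.h and jsimd_none.c gain matching entry points for the new h1v2 (1x2, vertical-only) fancy upsampler contributed by Arm; the jsimd_none.c stubs return 0 / do nothing so that builds without SIMD fall back to the C routine. Selection follows the can/do idiom used throughout the SIMD layer, sketched here on the assumption of the usual jdsample.c wiring and a C fallback named h1v2_fancy_upsample:

    /* Sketch of the dispatch idiom; the real selection logic lives in
     * jdsample.c. */
    if (jsimd_can_h1v2_fancy_upsample())
      upsample->methods[ci] = jsimd_h1v2_fancy_upsample;  /* SIMD path */
    else
      upsample->methods[ci] = h1v2_fancy_upsample;        /* plain C path */
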
diff --git a/jversion.h b/jversion.h
index 4462b94..d1d94f5 100644
--- a/jversion.h
+++ b/jversion.h
@@ -2,7 +2,7 @@
  * jversion.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2012-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -39,7 +39,7 @@
 #define JCOPYRIGHT \
   "Copyright (C) 2009-2020 D. R. Commander\n" \
   "Copyright (C) 2015, 2020 Google, Inc.\n" \
-  "Copyright (C) 2019 Arm Limited\n" \
+  "Copyright (C) 2019-2020 Arm Limited\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
   "Copyright (C) 2015 Intel Corporation\n" \
@@ -48,7 +48,7 @@
   "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
   "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
   "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-  "Copyright (C) 1991-2017 Thomas G. Lane, Guido Vollbeding"
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT \
   "Copyright (C) 1991-2020 The libjpeg-turbo Project and many others"
diff --git a/rdbmp.c b/rdbmp.c
index 51af237..6ba584a 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -12,7 +12,7 @@
  *
  * This file contains routines to read input images in Microsoft "BMP"
  * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
- * Currently, only 8-bit and 24-bit images are supported, not 1-bit or
+ * Currently, only 8-, 24-, and 32-bit images are supported, not 1-bit or
  * 4-bit (feeding such low-depth images into JPEG would be silly anyway).
  * Also, we don't support RLE-compressed files.
  *
@@ -34,18 +34,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
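
The deleted three-way ladder (repeated in rdppm.c and rdtarga.c below) dates from pre-C89 compilers that could lack unsigned char. Its masked variant only mattered where plain char is signed, e.g.:

    char c = (char)0xFF;        /* a raw 0xFF byte read into plain char */
    int bad  = (int)c;          /* -1 where char is signed */
    int good = (int)c & 0xFF;   /* 255: what the old UCH() guaranteed */

With U_CHAR unconditionally unsigned char, UCH() reduces to a plain int cast.
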
 
 
 #define ReadOK(file, buffer, len) \
@@ -71,7 +61,7 @@
   JDIMENSION source_row;        /* Current source row number */
   JDIMENSION row_width;         /* Physical width of scanlines in file */
 
-  int bits_per_pixel;           /* remembers 8- or 24-bit format */
+  int bits_per_pixel;           /* remembers 8-, 24-, or 32-bit format */
   int cmap_length;              /* colormap length */
 
   boolean use_inversion_array;  /* TRUE = preload the whole image, which is
@@ -179,14 +169,14 @@
   outptr = source->pub.buffer[0];
   if (cinfo->in_color_space == JCS_GRAYSCALE) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       *outptr++ = colormap[0][t];
     }
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       rgb_to_cmyk(colormap[0][t], colormap[1][t], colormap[2][t], outptr,
@@ -202,7 +192,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -213,7 +203,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -258,7 +248,6 @@
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       outptr += 4;
@@ -272,7 +261,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = 0xFF;
@@ -280,7 +269,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr += ps;
@@ -323,7 +312,6 @@
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       inptr++;                          /* skip the 4th byte (Alpha channel) */
@@ -338,7 +326,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = *inptr++;
@@ -346,7 +334,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         inptr++;                        /* skip the 4th byte (Alpha channel) */
@@ -481,7 +469,9 @@
       TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, biWidth, biHeight);
       break;
     case 24:                    /* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight);
+    case 32:                    /* RGB image + Alpha channel */
+      TRACEMS3(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight,
+               source->bits_per_pixel);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -508,10 +498,8 @@
       TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, biWidth, biHeight);
       break;
     case 24:                    /* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
-      break;
     case 32:                    /* RGB image + Alpha channel */
-      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
+      TRACEMS3(cinfo, 1, JTRC_BMP, biWidth, biHeight, source->bits_per_pixel);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
diff --git a/rdcolmap.c b/rdcolmap.c
index cbbef59..d2ed95c 100644
--- a/rdcolmap.c
+++ b/rdcolmap.c
@@ -54,9 +54,8 @@
 
   /* Check for duplicate color. */
   for (index = 0; index < ncolors; index++) {
-    if (GETJSAMPLE(colormap0[index]) == R &&
-        GETJSAMPLE(colormap1[index]) == G &&
-        GETJSAMPLE(colormap2[index]) == B)
+    if (colormap0[index] == R && colormap1[index] == G &&
+        colormap2[index] == B)
       return;                   /* color is already in map */
   }
 
diff --git a/rdgif.c b/rdgif.c
index ff9258d..e1ea56c 100644
--- a/rdgif.c
+++ b/rdgif.c
@@ -2,28 +2,656 @@
  * rdgif.c
  *
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2019 by Guido Vollbeding.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains routines to read input images in GIF format.
  *
- *****************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,      *
- * the ability to read GIF files has been removed from the IJG distribution. *
- * Sorry about that.                                                         *
- *****************************************************************************
- *
- * We are required to state that
- *    "The Graphics Interchange Format(c) is the Copyright property of
- *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
- *    CompuServe Incorporated."
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed GIF format).
+ */
+
+/*
+ * This code is loosely based on giftoppm from the PBMPLUS distribution
+ * of Feb. 1991.  That file contains the following copyright notice:
+ * +-------------------------------------------------------------------+
+ * | Copyright 1990, David Koblas.                                     |
+ * |   Permission to use, copy, modify, and distribute this software   |
+ * |   and its documentation for any purpose and without fee is hereby |
+ * |   granted, provided that the above copyright notice appear in all |
+ * |   copies and that both that copyright notice and this permission  |
+ * |   notice appear in supporting documentation.  This software is    |
+ * |   provided "as is" without express or implied warranty.           |
+ * +-------------------------------------------------------------------+
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef GIF_SUPPORTED
 
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (JFREAD(file, buffer, len) == ((size_t)(len)))
+
+
+#define MAXCOLORMAPSIZE  256    /* max # of colors in a GIF colormap */
+#define NUMCOLORS        3      /* # of colors */
+#define CM_RED           0      /* color component numbers */
+#define CM_GREEN         1
+#define CM_BLUE          2
+
+#define MAX_LZW_BITS     12     /* maximum LZW code size */
+#define LZW_TABLE_SIZE   (1 << MAX_LZW_BITS) /* # of possible LZW symbols */
+
+/* Macros for extracting header data --- note we assume chars may be signed */
+
+#define LM_to_uint(array, offset) \
+  ((unsigned int)UCH(array[offset]) + \
+   (((unsigned int)UCH(array[offset + 1])) << 8))
+
+#define BitSet(byte, bit)       ((byte) & (bit))
+#define INTERLACE       0x40    /* mask for bit signifying interlaced image */
+#define COLORMAPFLAG    0x80    /* mask for bit signifying colormap presence */
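
GIF stores every 16-bit field little-endian regardless of host byte order, which is all LM_to_uint does. For example:

    /* A 320-pixel-wide logical screen arrives as the byte pair
     * { 0x40, 0x01 }: 0x40 + (0x01 << 8) == 320 on any host. */
    U_CHAR hdrbuf[2] = { 0x40, 0x01 };
    unsigned int width = LM_to_uint(hdrbuf, 0);   /* 320 */
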
+
+
+/*
+ * LZW decompression tables look like this:
+ *   symbol_head[K] = prefix symbol of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ *   symbol_tail[K] = suffix byte   of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ * Note that entries 0..end_code of the above tables are not used,
+ * since those symbols represent raw bytes or special codes.
+ *
+ * The stack represents the not-yet-used expansion of the last LZW symbol.
+ * In the worst case, a symbol could expand to as many bytes as there are
+ * LZW symbols, so we allocate LZW_TABLE_SIZE bytes for the stack.
+ * (This is conservative since that number includes the raw-byte symbols.)
+ */
+
+
+/* Private version of data source object */
+
+typedef struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
+
+  JSAMPARRAY colormap;          /* GIF colormap (converted to my format) */
+
+  /* State for GetCode and LZWReadByte */
+  U_CHAR code_buf[256 + 4];     /* current input data block */
+  int last_byte;                /* # of bytes in code_buf */
+  int last_bit;                 /* # of bits in code_buf */
+  int cur_bit;                  /* next bit index to read */
+  boolean first_time;           /* flags first call to GetCode */
+  boolean out_of_blocks;        /* TRUE if hit terminator data block */
+
+  int input_code_size;          /* codesize given in GIF file */
+  int clear_code, end_code;     /* values for Clear and End codes */
+
+  int code_size;                /* current actual code size */
+  int limit_code;               /* 2^code_size */
+  int max_code;                 /* first unused code value */
+
+  /* Private state for LZWReadByte */
+  int oldcode;                  /* previous LZW symbol */
+  int firstcode;                /* first byte of oldcode's expansion */
+
+  /* LZW symbol table and expansion stack */
+  UINT16 *symbol_head;          /* => table of prefix symbols */
+  UINT8  *symbol_tail;          /* => table of suffix bytes */
+  UINT8  *symbol_stack;         /* => stack for symbol expansions */
+  UINT8  *sp;                   /* stack pointer */
+
+  /* State for interlaced image processing */
+  boolean is_interlaced;        /* TRUE if have interlaced image */
+  jvirt_sarray_ptr interlaced_image; /* full image in interlaced order */
+  JDIMENSION cur_row_number;    /* need to know actual row number */
+  JDIMENSION pass2_offset;      /* # of pixel rows in pass 1 */
+  JDIMENSION pass3_offset;      /* # of pixel rows in passes 1&2 */
+  JDIMENSION pass4_offset;      /* # of pixel rows in passes 1,2,3 */
+} gif_source_struct;
+
+typedef gif_source_struct *gif_source_ptr;
+
+
+/* Forward declarations */
+METHODDEF(JDIMENSION) get_pixel_rows(j_compress_ptr cinfo,
+                                     cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) load_interlaced_image(j_compress_ptr cinfo,
+                                            cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) get_interlaced_row(j_compress_ptr cinfo,
+                                         cjpeg_source_ptr sinfo);
+
+
+LOCAL(int)
+ReadByte(gif_source_ptr sinfo)
+/* Read next byte from GIF file */
+{
+  register FILE *infile = sinfo->pub.input_file;
+  register int c;
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return c;
+}
+
+
+LOCAL(int)
+GetDataBlock(gif_source_ptr sinfo, U_CHAR *buf)
+/* Read a GIF data block, which has a leading count byte */
+/* A zero-length block marks the end of a data block sequence */
+{
+  int count;
+
+  count = ReadByte(sinfo);
+  if (count > 0) {
+    if (!ReadOK(sinfo->pub.input_file, buf, count))
+      ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  }
+  return count;
+}
+
+
+LOCAL(void)
+SkipDataBlocks(gif_source_ptr sinfo)
+/* Skip a series of data blocks until a block terminator is found */
+{
+  U_CHAR buf[256];
+
+  while (GetDataBlock(sinfo, buf) > 0)
+    /* skip */;
+}
+
+
+LOCAL(void)
+ReInitLZW(gif_source_ptr sinfo)
+/* (Re)initialize LZW state; shared code for startup and Clear processing */
+{
+  sinfo->code_size = sinfo->input_code_size + 1;
+  sinfo->limit_code = sinfo->clear_code << 1;   /* 2^code_size */
+  sinfo->max_code = sinfo->clear_code + 2;      /* first unused code value */
+  sinfo->sp = sinfo->symbol_stack;              /* init stack to empty */
+}
+
+
+LOCAL(void)
+InitLZWCode(gif_source_ptr sinfo)
+/* Initialize for a series of LZWReadByte (and hence GetCode) calls */
+{
+  /* GetCode initialization */
+  sinfo->last_byte = 2;         /* make safe to "recopy last two bytes" */
+  sinfo->code_buf[0] = 0;
+  sinfo->code_buf[1] = 0;
+  sinfo->last_bit = 0;          /* nothing in the buffer */
+  sinfo->cur_bit = 0;           /* force buffer load on first call */
+  sinfo->first_time = TRUE;
+  sinfo->out_of_blocks = FALSE;
+
+  /* LZWReadByte initialization: */
+  /* compute special code values (note that these do not change later) */
+  sinfo->clear_code = 1 << sinfo->input_code_size;
+  sinfo->end_code = sinfo->clear_code + 1;
+  ReInitLZW(sinfo);
+}
+
+
+LOCAL(int)
+GetCode(gif_source_ptr sinfo)
+/* Fetch the next code_size bits from the GIF data */
+/* We assume code_size is less than 16 */
+{
+  register int accum;
+  int offs, count;
+
+  while (sinfo->cur_bit + sinfo->code_size > sinfo->last_bit) {
+    /* Time to reload the buffer */
+    /* First time, share code with Clear case */
+    if (sinfo->first_time) {
+      sinfo->first_time = FALSE;
+      return sinfo->clear_code;
+    }
+    if (sinfo->out_of_blocks) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* preserve last two bytes of what we have -- assume code_size <= 16 */
+    sinfo->code_buf[0] = sinfo->code_buf[sinfo->last_byte-2];
+    sinfo->code_buf[1] = sinfo->code_buf[sinfo->last_byte-1];
+    /* Load more bytes; set flag if we reach the terminator block */
+    if ((count = GetDataBlock(sinfo, &sinfo->code_buf[2])) == 0) {
+      sinfo->out_of_blocks = TRUE;
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* Reset counters */
+    sinfo->cur_bit = (sinfo->cur_bit - sinfo->last_bit) + 16;
+    sinfo->last_byte = 2 + count;
+    sinfo->last_bit = sinfo->last_byte * 8;
+  }
+
+  /* Form up next 24 bits in accum */
+  offs = sinfo->cur_bit >> 3;   /* byte containing cur_bit */
+  accum = UCH(sinfo->code_buf[offs + 2]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs + 1]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs]);
+
+  /* Right-align cur_bit in accum, then mask off desired number of bits */
+  accum >>= (sinfo->cur_bit & 7);
+  sinfo->cur_bit += sinfo->code_size;
+  return accum & ((1 << sinfo->code_size) - 1);
+}
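
GetCode() always assembles 24 bits so that a code of up to code_size bits can straddle a byte boundary. A worked example, assuming 9-bit codes:

    /* Suppose cur_bit = 10 and code_size = 9.  Then:
     *   offs  = 10 >> 3 = 1          -- the code starts in byte 1
     *   accum = buf[3]<<16 | buf[2]<<8 | buf[1]
     *   accum >>= (10 & 7)           -- drop the 2 already-consumed bits
     *   code  = accum & 0x1FF        -- keep 9 bits spanning bytes 1..2
     * cur_bit then advances to 19, so the next code starts in byte 2. */
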
+
+
+LOCAL(int)
+LZWReadByte(gif_source_ptr sinfo)
+/* Read an LZW-compressed byte */
+{
+  register int code;            /* current working code */
+  int incode;                   /* saves actual input code */
+
+  /* If any codes are stacked from a previously read symbol, return them */
+  if (sinfo->sp > sinfo->symbol_stack)
+    return (int)(*(--sinfo->sp));
+
+  /* Time to read a new symbol */
+  code = GetCode(sinfo);
+
+  if (code == sinfo->clear_code) {
+    /* Reinit state, swallow any extra Clear codes, and */
+    /* return next code, which is expected to be a raw byte. */
+    ReInitLZW(sinfo);
+    do {
+      code = GetCode(sinfo);
+    } while (code == sinfo->clear_code);
+    if (code > sinfo->clear_code) { /* make sure it is a raw byte */
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      code = 0;                 /* use something valid */
+    }
+    /* make firstcode, oldcode valid! */
+    sinfo->firstcode = sinfo->oldcode = code;
+    return code;
+  }
+
+  if (code == sinfo->end_code) {
+    /* Skip the rest of the image, unless GetCode already read terminator */
+    if (!sinfo->out_of_blocks) {
+      SkipDataBlocks(sinfo);
+      sinfo->out_of_blocks = TRUE;
+    }
+    /* Complain that there's not enough data */
+    WARNMS(sinfo->cinfo, JWRN_GIF_ENDCODE);
+    /* Pad data with 0's */
+    return 0;                   /* fake something usable */
+  }
+
+  /* Got normal raw byte or LZW symbol */
+  incode = code;                /* save for a moment */
+
+  if (code >= sinfo->max_code) { /* special case for not-yet-defined symbol */
+    /* code == max_code is OK; anything bigger is bad data */
+    if (code > sinfo->max_code) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      incode = 0;               /* prevent creation of loops in symbol table */
+    }
+    /* this symbol will be defined as oldcode/firstcode */
+    *(sinfo->sp++) = (UINT8)sinfo->firstcode;
+    code = sinfo->oldcode;
+  }
+
+  /* If it's a symbol, expand it into the stack */
+  while (code >= sinfo->clear_code) {
+    *(sinfo->sp++) = sinfo->symbol_tail[code]; /* tail is a byte value */
+    code = sinfo->symbol_head[code]; /* head is another LZW symbol */
+  }
+  /* At this point code just represents a raw byte */
+  sinfo->firstcode = code;      /* save for possible future use */
+
+  /* If there's room in table... */
+  if ((code = sinfo->max_code) < LZW_TABLE_SIZE) {
+    /* Define a new symbol = prev sym + head of this sym's expansion */
+    sinfo->symbol_head[code] = (UINT16)sinfo->oldcode;
+    sinfo->symbol_tail[code] = (UINT8)sinfo->firstcode;
+    sinfo->max_code++;
+    /* Is it time to increase code_size? */
+    if (sinfo->max_code >= sinfo->limit_code &&
+        sinfo->code_size < MAX_LZW_BITS) {
+      sinfo->code_size++;
+      sinfo->limit_code <<= 1;  /* keep equal to 2^code_size */
+    }
+  }
+
+  sinfo->oldcode = incode;      /* save last input symbol for future use */
+  return sinfo->firstcode;      /* return first byte of symbol's expansion */
+}
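
Two details of LZWReadByte() deserve a note: the code >= max_code branch handles the classic "KwKwK" case, in which the encoder emits a symbol one step before the decoder has defined it (hence the reuse of oldcode plus firstcode), and the growth branch widens codes on exactly the encoder's schedule. For 8-bit GIF data that schedule looks like:

    /* input_code_size == 8: clear_code = 256, end_code = 257, first
     * free slot = 258.  Codes start at 9 bits (limit_code = 512);
     * they widen to 10 bits when slot 512 is defined, 11 at 1024, and
     * 12 at 2048, then the table freezes at 4096 entries until a
     * Clear code resets everything via ReInitLZW(). */
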
+
+
+LOCAL(void)
+ReadColorMap(gif_source_ptr sinfo, int cmaplen, JSAMPARRAY cmap)
+/* Read a GIF colormap */
+{
+  int i;
+
+  for (i = 0; i < cmaplen; i++) {
+#if BITS_IN_JSAMPLE == 8
+#define UPSCALE(x)  (x)
+#else
+#define UPSCALE(x)  ((x) << (BITS_IN_JSAMPLE - 8))
+#endif
+    cmap[CM_RED][i]   = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_GREEN][i] = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_BLUE][i]  = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+  }
+}
+
+
+LOCAL(void)
+DoExtension(gif_source_ptr sinfo)
+/* Process an extension block */
+/* Currently we ignore 'em all */
+{
+  int extlabel;
+
+  /* Read extension label byte */
+  extlabel = ReadByte(sinfo);
+  TRACEMS1(sinfo->cinfo, 1, JTRC_GIF_EXTENSION, extlabel);
+  /* Skip the data block(s) associated with the extension */
+  SkipDataBlocks(sinfo);
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  U_CHAR hdrbuf[10];            /* workspace for reading control blocks */
+  unsigned int width, height;   /* image dimensions */
+  int colormaplen, aspectRatio;
+  int c;
+
+  /* Read and verify GIF Header */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 6))
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  if (hdrbuf[0] != 'G' || hdrbuf[1] != 'I' || hdrbuf[2] != 'F')
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  /* Check for expected version numbers.
+   * If unknown version, give warning and try to process anyway;
+   * this is per recommendation in GIF89a standard.
+   */
+  if ((hdrbuf[3] != '8' || hdrbuf[4] != '7' || hdrbuf[5] != 'a') &&
+      (hdrbuf[3] != '8' || hdrbuf[4] != '9' || hdrbuf[5] != 'a'))
+    TRACEMS3(cinfo, 1, JTRC_GIF_BADVERSION, hdrbuf[3], hdrbuf[4], hdrbuf[5]);
+
+  /* Read and decipher Logical Screen Descriptor */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 7))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  width = LM_to_uint(hdrbuf, 0);
+  height = LM_to_uint(hdrbuf, 2);
+  /* we ignore the color resolution, sort flag, and background color index */
+  aspectRatio = UCH(hdrbuf[6]);
+  if (aspectRatio != 0 && aspectRatio != 49)
+    TRACEMS(cinfo, 1, JTRC_GIF_NONSQUARE);
+
+  /* Allocate space to store the colormap */
+  source->colormap = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)MAXCOLORMAPSIZE,
+     (JDIMENSION)NUMCOLORS);
+  colormaplen = 0;              /* indicate initialization */
+
+  /* Read global colormap if header indicates it is present */
+  if (BitSet(hdrbuf[4], COLORMAPFLAG)) {
+    colormaplen = 2 << (hdrbuf[4] & 0x07);
+    ReadColorMap(source, colormaplen, source->colormap);
+  }
+
+  /* Scan until we reach start of desired image.
+   * We don't currently support skipping images, but could add it easily.
+   */
+  for (;;) {
+    c = ReadByte(source);
+
+    if (c == ';')               /* GIF terminator?? */
+      ERREXIT(cinfo, JERR_GIF_IMAGENOTFOUND);
+
+    if (c == '!') {             /* Extension */
+      DoExtension(source);
+      continue;
+    }
+
+    if (c != ',') {             /* Not an image separator? */
+      WARNMS1(cinfo, JWRN_GIF_CHAR, c);
+      continue;
+    }
+
+    /* Read and decipher Local Image Descriptor */
+    if (!ReadOK(source->pub.input_file, hdrbuf, 9))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    /* we ignore top/left position info, also sort flag */
+    width = LM_to_uint(hdrbuf, 4);
+    height = LM_to_uint(hdrbuf, 6);
+    source->is_interlaced = (BitSet(hdrbuf[8], INTERLACE) != 0);
+
+    /* Read local colormap if header indicates it is present */
+    /* Note: if we wanted to support skipping images, */
+    /* we'd need to skip rather than read colormap for ignored images */
+    if (BitSet(hdrbuf[8], COLORMAPFLAG)) {
+      colormaplen = 2 << (hdrbuf[8] & 0x07);
+      ReadColorMap(source, colormaplen, source->colormap);
+    }
+
+    source->input_code_size = ReadByte(source); /* get min-code-size byte */
+    if (source->input_code_size < 2 || source->input_code_size > 8)
+      ERREXIT1(cinfo, JERR_GIF_CODESIZE, source->input_code_size);
+
+    /* Reached desired image, so break out of loop */
+    /* If we wanted to skip this image, */
+    /* we'd call SkipDataBlocks and then continue the loop */
+    break;
+  }
+
+  /* Prepare to read selected image: first initialize LZW decompressor */
+  source->symbol_head = (UINT16 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT16));
+  source->symbol_tail = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  source->symbol_stack = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  InitLZWCode(source);
+
+  /*
+   * If image is interlaced, we read it into a full-size sample array,
+   * decompressing as we go; then get_interlaced_row selects rows from the
+   * sample array in the proper order.
+   */
+  if (source->is_interlaced) {
+    /* We request the virtual array now, but can't access it until virtual
+     * arrays have been allocated.  Hence, the actual work of reading the
+     * image is postponed until the first call to get_pixel_rows.
+     */
+    source->interlaced_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION)width, (JDIMENSION)height, (JDIMENSION)1);
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+    source->pub.get_pixel_rows = load_interlaced_image;
+  } else {
+    source->pub.get_pixel_rows = get_pixel_rows;
+  }
+
+  /* Create compressor input buffer. */
+  source->pub.buffer = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)width * NUMCOLORS,
+     (JDIMENSION)1);
+  source->pub.buffer_height = 1;
+
+  /* Pad colormap for safety. */
+  for (c = colormaplen; c < source->clear_code; c++) {
+    source->colormap[CM_RED][c]   =
+    source->colormap[CM_GREEN][c] =
+    source->colormap[CM_BLUE][c]  = CENTERJSAMPLE;
+  }
+
+  /* Return info about the image. */
+  cinfo->in_color_space = JCS_RGB;
+  cinfo->input_components = NUMCOLORS;
+  cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+
+  TRACEMS3(cinfo, 1, JTRC_GIF, width, height, colormaplen);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for noninterlaced GIF images:
+ * we read directly from the GIF file.
+ */
+
+METHODDEF(JDIMENSION)
+get_pixel_rows(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  register JSAMPARRAY colormap = source->colormap;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    c = LZWReadByte(source);
+    *ptr++ = colormap[CM_RED][c];
+    *ptr++ = colormap[CM_GREEN][c];
+    *ptr++ = colormap[CM_BLUE][c];
+  }
+  return 1;
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for the first call on get_pixel_rows when
+ * reading an interlaced GIF file: we read the whole image into memory.
+ */
+
+METHODDEF(JDIMENSION)
+load_interlaced_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register JSAMPROW sptr;
+  register JDIMENSION col;
+  JDIMENSION row;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+
+  /* Read the interlaced image into the virtual array we've created. */
+  for (row = 0; row < cinfo->image_height; row++) {
+    if (progress != NULL) {
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+    }
+    sptr = *(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->interlaced_image, row, (JDIMENSION)1,
+       TRUE);
+    for (col = cinfo->image_width; col > 0; col--) {
+      *sptr++ = (JSAMPLE)LZWReadByte(source);
+    }
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Replace method pointer so subsequent calls don't come here. */
+  source->pub.get_pixel_rows = get_interlaced_row;
+  /* Initialize for get_interlaced_row, and perform first call on it. */
+  source->cur_row_number = 0;
+  source->pass2_offset = (cinfo->image_height + 7) / 8;
+  source->pass3_offset = source->pass2_offset + (cinfo->image_height + 3) / 8;
+  source->pass4_offset = source->pass3_offset + (cinfo->image_height + 1) / 4;
+
+  return get_interlaced_row(cinfo, sinfo);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for interlaced GIF images:
+ * we read from the virtual array.
+ */
+
+METHODDEF(JDIMENSION)
+get_interlaced_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register JSAMPROW sptr, ptr;
+  register JDIMENSION col;
+  register JSAMPARRAY colormap = source->colormap;
+  JDIMENSION irow;
+
+  /* Figure out which row of interlaced image is needed, and access it. */
+  switch ((int)(source->cur_row_number & 7)) {
+  case 0:                       /* first-pass row */
+    irow = source->cur_row_number >> 3;
+    break;
+  case 4:                       /* second-pass row */
+    irow = (source->cur_row_number >> 3) + source->pass2_offset;
+    break;
+  case 2:                       /* third-pass row */
+  case 6:
+    irow = (source->cur_row_number >> 2) + source->pass3_offset;
+    break;
+  default:                      /* fourth-pass row */
+    irow = (source->cur_row_number >> 1) + source->pass4_offset;
+  }
+  sptr = *(*cinfo->mem->access_virt_sarray)
+    ((j_common_ptr)cinfo, source->interlaced_image, irow, (JDIMENSION)1,
+     FALSE);
+  /* Scan the row, expand colormap, and output */
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    c = *sptr++;
+    *ptr++ = colormap[CM_RED][c];
+    *ptr++ = colormap[CM_GREEN][c];
+    *ptr++ = colormap[CM_BLUE][c];
+  }
+  source->cur_row_number++;     /* for next time */
+  return 1;
+}
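
The switch above inverts GIF's four-pass interlace: pass 1 covers rows 0, 8, 16, ..., pass 2 rows 4, 12, ..., pass 3 rows 2, 6, 10, ..., and pass 4 the odd rows, and load_interlaced_image() stored those passes back to back in the virtual array. A worked example for a 16-row image:

    /* image_height = 16 gives pass2_offset = 2, pass3_offset = 4,
     * pass4_offset = 8.  Then:
     *   cur_row_number = 0  ->  case 0:  irow = 0          (pass 1)
     *   cur_row_number = 4  ->  case 4:  irow = 0 + 2 = 2  (pass 2)
     *   cur_row_number = 6  ->  case 6:  irow = 1 + 4 = 5  (pass 3)
     *   cur_row_number = 1  ->  default: irow = 0 + 8 = 8  (pass 4)
     * so consecutive output rows are gathered from all four bands. */
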
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
 /*
  * The module selection routine for GIF format input.
  */
@@ -31,9 +659,18 @@
 GLOBAL(cjpeg_source_ptr)
 jinit_read_gif(j_compress_ptr cinfo)
 {
-  fprintf(stderr, "GIF input is unsupported for legal reasons.  Sorry.\n");
-  exit(EXIT_FAILURE);
-  return NULL;                  /* keep compiler happy */
+  gif_source_ptr source;
+
+  /* Create module interface object */
+  source = (gif_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(gif_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_gif;
+  source->pub.finish_input = finish_input_gif;
+
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* GIF_SUPPORTED */
diff --git a/rdppm.c b/rdppm.c
index 2a58e79..c4c937e 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -43,18 +43,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
diff --git a/rdrle.c b/rdrle.c
deleted file mode 100644
index b694514..0000000
--- a/rdrle.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * rdrle.c
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains routines to read input images in Utah RLE format.
- * The Utah Raster Toolkit library is required (version 3.1 or later).
- *
- * These routines may need modification for non-Unix environments or
- * specialized applications.  As they stand, they assume input from
- * an ordinary stdio stream.  They further assume that reading begins
- * at the start of the file; start_input may need work if the
- * user interface has already read some data (e.g., to determine that
- * the file is indeed RLE format).
- *
- * Based on code contributed by Mike Lijewski,
- * with updates from Robert Hutchinson.
- */
-
-#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
-
-#ifdef RLE_SUPPORTED
-
-/* rle.h is provided by the Utah Raster Toolkit. */
-
-#include <rle.h>
-
-/*
- * We assume that JSAMPLE has the same representation as rle_pixel,
- * to wit, "unsigned char".  Hence we can't cope with 12- or 16-bit samples.
- */
-
-#if BITS_IN_JSAMPLE != 8
-  Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
-#endif
-
-/*
- * We support the following types of RLE files:
- *
- *   GRAYSCALE   - 8 bits, no colormap
- *   MAPPEDGRAY  - 8 bits, 1 channel colomap
- *   PSEUDOCOLOR - 8 bits, 3 channel colormap
- *   TRUECOLOR   - 24 bits, 3 channel colormap
- *   DIRECTCOLOR - 24 bits, no colormap
- *
- * For now, we ignore any alpha channel in the image.
- */
-
-typedef enum
-  { GRAYSCALE, MAPPEDGRAY, PSEUDOCOLOR, TRUECOLOR, DIRECTCOLOR } rle_kind;
-
-
-/*
- * Since RLE stores scanlines bottom-to-top, we have to invert the image
- * to conform to JPEG's top-to-bottom order.  To do this, we read the
- * incoming image into a virtual array on the first get_pixel_rows call,
- * then fetch the required row from the virtual array on subsequent calls.
- */
-
-typedef struct _rle_source_struct *rle_source_ptr;
-
-typedef struct _rle_source_struct {
-  struct cjpeg_source_struct pub; /* public fields */
-
-  rle_kind visual;              /* actual type of input file */
-  jvirt_sarray_ptr image;       /* virtual array to hold the image */
-  JDIMENSION row;               /* current row # in the virtual array */
-  rle_hdr header;               /* Input file information */
-  rle_pixel **rle_row;          /* holds a row returned by rle_getrow() */
-
-} rle_source_struct;
-
-
-/*
- * Read the file header; return image size and component count.
- */
-
-METHODDEF(void)
-start_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION width, height;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /* Use RLE library routine to get the header info */
-  source->header = *rle_hdr_init(NULL);
-  source->header.rle_file = source->pub.input_file;
-  switch (rle_get_setup(&(source->header))) {
-  case RLE_SUCCESS:
-    /* A-OK */
-    break;
-  case RLE_NOT_RLE:
-    ERREXIT(cinfo, JERR_RLE_NOT);
-    break;
-  case RLE_NO_SPACE:
-    ERREXIT(cinfo, JERR_RLE_MEM);
-    break;
-  case RLE_EMPTY:
-    ERREXIT(cinfo, JERR_RLE_EMPTY);
-    break;
-  case RLE_EOF:
-    ERREXIT(cinfo, JERR_RLE_EOF);
-    break;
-  default:
-    ERREXIT(cinfo, JERR_RLE_BADERROR);
-    break;
-  }
-
-  /* Figure out what we have, set private vars and return values accordingly */
-
-  width  = source->header.xmax - source->header.xmin + 1;
-  height = source->header.ymax - source->header.ymin + 1;
-  source->header.xmin = 0;              /* realign horizontally */
-  source->header.xmax = width - 1;
-
-  cinfo->image_width      = width;
-  cinfo->image_height     = height;
-  cinfo->data_precision   = 8;  /* we can only handle 8 bit data */
-
-  if (source->header.ncolors == 1 && source->header.ncmap == 0) {
-    source->visual     = GRAYSCALE;
-    TRACEMS2(cinfo, 1, JTRC_RLE_GRAY, width, height);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 1) {
-    source->visual     = MAPPEDGRAY;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPGRAY, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 3) {
-    source->visual     = PSEUDOCOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPPED, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 3) {
-    source->visual     = TRUECOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_FULLMAP, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 0) {
-    source->visual     = DIRECTCOLOR;
-    TRACEMS2(cinfo, 1, JTRC_RLE, width, height);
-  } else
-    ERREXIT(cinfo, JERR_RLE_UNSUPPORTED);
-
-  if (source->visual == GRAYSCALE || source->visual == MAPPEDGRAY) {
-    cinfo->in_color_space   = JCS_GRAYSCALE;
-    cinfo->input_components = 1;
-  } else {
-    cinfo->in_color_space   = JCS_RGB;
-    cinfo->input_components = 3;
-  }
-
-  /*
-   * A place to hold each scanline while it's converted.
-   * (GRAYSCALE scanlines don't need converting)
-   */
-  if (source->visual != GRAYSCALE) {
-    source->rle_row = (rle_pixel **)(*cinfo->mem->alloc_sarray)
-      ((j_common_ptr)cinfo, JPOOL_IMAGE,
-       (JDIMENSION)width, (JDIMENSION)cinfo->input_components);
-  }
-
-  /* request a virtual array to hold the image */
-  source->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION)(width * source->header.ncolors),
-     (JDIMENSION)height, (JDIMENSION)1);
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    /* count file input as separate pass */
-    progress->total_extra_passes++;
-  }
-#endif
-
-  source->pub.buffer_height = 1;
-}
-
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for GRAYSCALE, MAPPEDGRAY, TRUECOLOR, and DIRECTCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_rle_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-
-  source->row--;
-  source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  return 1;
-}
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for PSEUDOCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_pseudocolor_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JSAMPROW src_row, dest_row;
-  JDIMENSION col;
-  rle_map *colormap;
-  int val;
-
-  colormap = source->header.cmap;
-  dest_row = source->pub.buffer[0];
-  source->row--;
-  src_row = *(*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  for (col = cinfo->image_width; col > 0; col--) {
-    val = GETJSAMPLE(*src_row++);
-    *dest_row++ = (JSAMPLE)(colormap[val      ] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 256] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 512] >> 8);
-  }
-
-  return 1;
-}
-
-
-/*
- * Load the image into a virtual array.  We have to do this because RLE
- * files start at the lower left while the JPEG standard has them starting
- * in the upper left.  This is called the first time we want to get a row
- * of input.  What we do is load the RLE data into the array and then call
- * the appropriate routine to read one row from the array.  Before returning,
- * we set source->pub.get_pixel_rows so that subsequent calls go straight to
- * the appropriate row-reading routine.
- */
-
-METHODDEF(JDIMENSION)
-load_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION row, col;
-  JSAMPROW scanline, red_ptr, green_ptr, blue_ptr;
-  rle_pixel **rle_row;
-  rle_map *colormap;
-  char channel;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  colormap = source->header.cmap;
-  rle_row = source->rle_row;
-
-  /* Read the RLE data into our virtual array.
-   * We assume here that rle_pixel is represented the same as JSAMPLE.
-   */
-  RLE_CLR_BIT(source->header, RLE_ALPHA); /* don't read the alpha channel */
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->pub.pass_limit = cinfo->image_height;
-    progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-  }
-#endif
-
-  switch (source->visual) {
-
-  case GRAYSCALE:
-  case PSEUDOCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case MAPPEDGRAY:
-  case TRUECOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_row = source->rle_row;
-      rle_getrow(&source->header, rle_row);
-
-      for (col = 0; col < cinfo->image_width; col++) {
-        for (channel = 0; channel < source->header.ncolors; channel++) {
-          *scanline++ = (JSAMPLE)
-            (colormap[GETJSAMPLE(rle_row[channel][col]) + 256 * channel] >> 8);
-        }
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case DIRECTCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-
-      red_ptr   = rle_row[0];
-      green_ptr = rle_row[1];
-      blue_ptr  = rle_row[2];
-
-      for (col = cinfo->image_width; col > 0; col--) {
-        *scanline++ = *red_ptr++;
-        *scanline++ = *green_ptr++;
-        *scanline++ = *blue_ptr++;
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  }
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL)
-    progress->completed_extra_passes++;
-#endif
-
-  /* Set up to call proper row-extraction routine in future */
-  if (source->visual == PSEUDOCOLOR) {
-    source->pub.buffer = source->rle_row;
-    source->pub.get_pixel_rows = get_pseudocolor_row;
-  } else {
-    source->pub.get_pixel_rows = get_rle_row;
-  }
-  source->row = cinfo->image_height;
-
-  /* And fetch the topmost (bottommost) row */
-  return (*source->pub.get_pixel_rows) (cinfo, sinfo);
-}
-
-
-/*
- * Finish up at the end of the file.
- */
-
-METHODDEF(void)
-finish_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  /* no work */
-}
-
-
-/*
- * The module selection routine for RLE format input.
- */
-
-GLOBAL(cjpeg_source_ptr)
-jinit_read_rle(j_compress_ptr cinfo)
-{
-  rle_source_ptr source;
-
-  /* Create module interface object */
-  source = (rle_source_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(rle_source_struct));
-  /* Fill in method ptrs */
-  source->pub.start_input = start_input_rle;
-  source->pub.finish_input = finish_input_rle;
-  source->pub.get_pixel_rows = load_image;
-
-  return (cjpeg_source_ptr)source;
-}
-
-#endif /* RLE_SUPPORTED */
diff --git a/rdtarga.c b/rdtarga.c
index cd8a363..c17073f 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -28,18 +28,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
diff --git a/release/Config.cmake.in b/release/Config.cmake.in
new file mode 100644
index 0000000..0c1ba8a
--- /dev/null
+++ b/release/Config.cmake.in
@@ -0,0 +1,4 @@
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/@CMAKE_PROJECT_NAME@Targets.cmake")
+check_required_components("@CMAKE_PROJECT_NAME@")
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index 0d1888d..446ce46 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/Welcome.rtf b/release/Welcome.rtf.in
similarity index 91%
rename from release/Welcome.rtf
rename to release/Welcome.rtf.in
index a570c5b..6bec24d 100644
--- a/release/Welcome.rtf
+++ b/release/Welcome.rtf.in
@@ -9,9 +9,9 @@
 \
 \pard\pardeftab720\ql\qnatural
 
-\f1 \cf0   /opt/libjpeg-turbo/bin/uninstall\
+\f1 \cf0   @CMAKE_INSTALL_FULL_BINDIR@/uninstall\
 \pard\pardeftab720\ql\qnatural
 
 \f0 \cf0 \
 from the command line.\
-}
\ No newline at end of file
+}
diff --git a/release/deb-control.in b/release/deb-control.in
index b82bdac..72bceec 100644
--- a/release/deb-control.in
+++ b/release/deb-control.in
@@ -10,12 +10,12 @@
 Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
  libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
  baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
- MIPS systems, as well as progressive JPEG compression on x86 and x86-64
- systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
- all else being equal.  On other types of systems, libjpeg-turbo can still
- outperform libjpeg by a significant amount, by virtue of its highly-optimized
- Huffman coding routines.  In many cases, the performance of libjpeg-turbo
- rivals that of proprietary high-speed JPEG codecs.
+ MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
+ Arm systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as
+ libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
+ still outperform libjpeg by a significant amount, by virtue of its
+ highly-optimized Huffman coding routines.  In many cases, the performance of
+ libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
  .
  libjpeg-turbo implements both the traditional libjpeg API as well as the less
  powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
diff --git a/release/installer.nsi.in b/release/installer.nsi.in
index 44419fa..65db63d 100644
--- a/release/installer.nsi.in
+++ b/release/installer.nsi.in
@@ -71,6 +71,11 @@
 	SetOutPath $INSTDIR\lib\pkgconfig
 	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libjpeg.pc"
 	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libturbojpeg.pc"
+	SetOutPath $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\@CMAKE_PROJECT_NAME@Config.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\@CMAKE_PROJECT_NAME@ConfigVersion.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\win\@CMAKE_PROJECT_NAME@Targets.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\win\@CMAKE_PROJECT_NAME@Targets-release.cmake"
 !ifdef JAVA
 	SetOutPath $INSTDIR\classes
 	File "@CMAKE_CURRENT_BINARY_DIR@\java\turbojpeg.jar"
@@ -141,6 +146,10 @@
 !endif
 	Delete $INSTDIR\lib\pkgconfig\libjpeg.pc
 	Delete $INSTDIR\lib\pkgconfig\libturbojpeg.pc
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Config.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@ConfigVersion.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Targets.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Targets-release.cmake
 !ifdef JAVA
 	Delete $INSTDIR\classes\turbojpeg.jar
 !endif
@@ -176,6 +185,8 @@
 
 	RMDir "$INSTDIR\include"
 	RMDir "$INSTDIR\lib\pkgconfig"
+	RMDir "$INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@"
+	RMDir "$INSTDIR\lib\cmake"
 	RMDir "$INSTDIR\lib"
 	RMDir "$INSTDIR\doc"
 !ifdef GCC
diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
deleted file mode 100755
index b7f353e..0000000
--- a/release/makecygwinpkg.in
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/sh
-
-set -u
-set -e
-trap onexit INT
-trap onexit TERM
-trap onexit EXIT
-
-TMPDIR=
-
-onexit()
-{
-	if [ ! "$TMPDIR" = "" ]; then
-		rm -rf $TMPDIR
-	fi
-}
-
-safedirmove ()
-{
-	if [ "$1" = "$2" ]; then
-		return 0
-	fi
-	if [ "$1" = "" -o ! -d "$1" ]; then
-		echo safedirmove: source dir $1 is not valid
-		return 1
-	fi
-	if [ "$2" = "" -o -e "$2" ]; then
-		echo safedirmove: dest dir $2 is not valid
-		return 1
-	fi
-	if [ "$3" = "" -o -e "$3" ]; then
-		echo safedirmove: tmp dir $3 is not valid
-		return 1
-	fi
-	mkdir -p $3
-	mv $1/* $3/
-	rmdir $1
-	mkdir -p $2
-	mv $3/* $2/
-	rmdir $3
-	return 0
-}
-
-PKGNAME=@PKGNAME@
-VERSION=@VERSION@
-BUILD=@BUILD@
-
-PREFIX=@CMAKE_INSTALL_PREFIX@
-DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
-LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
-
-umask 022
-rm -f $PKGNAME-$VERSION-$BUILD.tar.bz2
-TMPDIR=`mktemp -d /tmp/ljtbuild.XXXXXX`
-__PWD=`pwd`
-make install DESTDIR=$TMPDIR/pkg
-if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
-	safedirmove $TMPDIR/pkg$DOCDIR $TMPDIR/pkg/usr/share/doc/$PKGNAME-$VERSION $TMPDIR/__tmpdoc
-	ln -fs /usr/share/doc/$PKGNAME-$VERSION $TMPDIR/pkg$DOCDIR
-fi
-cd $TMPDIR/pkg
-tar cfj ../$PKGNAME-$VERSION-$BUILD.tar.bz2 *
-cd $__PWD
-mv $TMPDIR/*.tar.bz2 .
-
-exit 0
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index ae80bec..99c6e05 100755
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -43,19 +43,14 @@
 
 usage()
 {
-	echo "$0 [universal] [-lipo [path to lipo]]"
+	echo "$0 [-lipo [path to lipo]]"
 	exit 1
 }
 
-UNIVERSAL=0
-
 PKGNAME=@PKGNAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
 SRCDIR=@CMAKE_CURRENT_SOURCE_DIR@
-BUILDDIR32=@OSX_32BIT_BUILD@
-BUILDDIRARMV7=@IOS_ARMV7_BUILD@
-BUILDDIRARMV7S=@IOS_ARMV7S_BUILD@
 BUILDDIRARMV8=@IOS_ARMV8_BUILD@
 WITH_JAVA=@WITH_JAVA@
 OSX_APP_CERT_NAME="@OSX_APP_CERT_NAME@"
@@ -82,9 +77,6 @@
 			fi
 		fi
 		;;
-	universal)
-		UNIVERSAL=1
-		;;
 	esac
 	shift
 done
@@ -106,61 +98,6 @@
 	ln -fs /Library/Documentation/$PKGNAME $PKGROOT$DOCDIR
 fi
 
-if [ $UNIVERSAL = 1 -a "$BUILDDIR32" != "" ]; then
-	if [ ! -d $BUILDDIR32 ]; then
-		echo ERROR: 32-bit build directory $BUILDDIR32 does not exist
-		exit 1
-	fi
-	if [ ! -f $BUILDDIR32/Makefile ]; then
-		echo ERROR: 32-bit build directory $BUILDDIR32 is not configured
-		exit 1
-	fi
-	mkdir -p $TMPDIR/dist.x86
-	pushd $BUILDDIR32
-	make install DESTDIR=$TMPDIR/dist.x86
-	popd
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$LIBJPEG_DSO_NAME \
-		-arch x86_64 $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME \
-		-output $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
-		-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
-		-output $PKGROOT/$LIBDIR/libjpeg.a
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$TURBOJPEG_DSO_NAME \
-		-arch x86_64 $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME \
-		-output $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
-		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
-		-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
-		-output $PKGROOT/$BINDIR/cjpeg
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
-		-arch x86_64 $PKGROOT/$BINDIR/djpeg \
-		-output $PKGROOT/$BINDIR/djpeg
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
-		-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
-		-output $PKGROOT/$BINDIR/jpegtran
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
-		-arch x86_64 $PKGROOT/$BINDIR/tjbench \
-		-output $PKGROOT/$BINDIR/tjbench
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
-		-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
-		-output $PKGROOT/$BINDIR/rdjpgcom
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
-		-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
-		-output $PKGROOT/$BINDIR/wrjpgcom
-fi
-
 install_ios()
 {
 	BUILDDIR=$1
@@ -222,28 +159,14 @@
 		-output $PKGROOT/$BINDIR/wrjpgcom
 }
 
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7" != "" ]; then
-	install_ios $BUILDDIRARMV7 Armv7 armv7 arm
-fi
-
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7S" != "" ]; then
-	install_ios $BUILDDIRARMV7S Armv7s armv7s arm
-fi
-
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV8" != "" ]; then
+if [ "$BUILDDIRARMV8" != "" ]; then
 	install_ios $BUILDDIRARMV8 Armv8 armv8 arm64
 fi
 
 install_name_tool -id $LIBDIR/$LIBJPEG_DSO_NAME $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
 install_name_tool -id $LIBDIR/$TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
 
-if [ $WITH_JAVA = 1 ]; then
-	ln -fs $TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/libturbojpeg.jnilib
-fi
 if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$LIBDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/lib" ]; then
-	if [ ! -h $PKGROOT/$PREFIX/lib32 ]; then
-		ln -fs lib $PKGROOT/$PREFIX/lib32
-	fi
 	if [ ! -h $PKGROOT/$PREFIX/lib64 ]; then
 		ln -fs lib $PKGROOT/$PREFIX/lib64
 	fi
@@ -255,7 +178,7 @@
 
 find $PKGROOT -type f | while read file; do xattr -c $file; done
 
-cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadMe.txt $TMPDIR/pkg/
+cp $SRCDIR/release/License.rtf pkgscripts/Welcome.rtf $SRCDIR/release/ReadMe.txt $TMPDIR/pkg/
 
 mkdir $TMPDIR/dmg
 pkgbuild --root $PKGROOT --version $VERSION.$BUILD --identifier @PKGID@ \
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index 6b5cd2b..05410a6 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -53,7 +53,7 @@
 %description
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
@@ -184,6 +184,9 @@
 %endif
 %dir %{_libdir}/pkgconfig
 %{_libdir}/pkgconfig/libjpeg.pc
+%dir %{_libdir}/cmake
+%dir %{_libdir}/cmake/@CMAKE_PROJECT_NAME@
+%{_libdir}/cmake/@CMAKE_PROJECT_NAME@
 %if "%{_with_turbojpeg}" == "1"
 	%if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
 		%{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@
diff --git a/release/uninstall.in b/release/uninstall.in
index cf1ba77..34757b0 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -1,4 +1,5 @@
-# Copyright (C)2009-2011, 2013, 2016 D. R. Commander.  All Rights Reserved.
+# Copyright (C)2009-2011, 2013, 2016, 2020 D. R. Commander.
+#                                          All Rights Reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -70,6 +71,12 @@
 if [ -d $LIBDIR/pkgconfig ]; then
 	rmdir $LIBDIR/pkgconfig 2>&1 || EXITSTATUS=-1
 fi
+if [ -d $LIBDIR/cmake/@CMAKE_PROJECT_NAME@ ]; then
+	rmdir $LIBDIR/cmake/@CMAKE_PROJECT_NAME@ || EXITSTATUS=-1
+fi
+if [ -d $LIBDIR/cmake ]; then
+	rmdir $LIBDIR/cmake || EXITSTATUS=-1
+fi
 if [ -d $LIBDIR ]; then
 	rmdir $LIBDIR 2>&1 || EXITSTATUS=-1
 fi
@@ -90,7 +97,7 @@
 if [ -d $MANDIR ]; then
 	rmdir $MANDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ -d $JAVADIR ]; then
+if [ -d "$JAVADIR" ]; then
 	rmdir $JAVADIR 2>&1 || EXITSTATUS=-1
 fi
 if [ -d $DATAROOTDIR -a "$DATAROOTDIR" != "$PREFIX" ]; then
diff --git a/sharedlib/CMakeLists.txt b/sharedlib/CMakeLists.txt
index 8d65e58..78a2f28 100644
--- a/sharedlib/CMakeLists.txt
+++ b/sharedlib/CMakeLists.txt
@@ -88,10 +88,13 @@
 add_executable(jcstest ../jcstest.c)
 target_link_libraries(jcstest jpeg)
 
-install(TARGETS jpeg cjpeg djpeg jpegtran
+install(TARGETS jpeg EXPORT ${CMAKE_PROJECT_NAME}Targets
+  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(TARGETS cjpeg djpeg jpegtran
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
   CMAKE_C_LINKER_SUPPORTS_PDB)
   install(FILES "$<TARGET_PDB_FILE:jpeg>"
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index ba0bd13..f3c24ef 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -30,6 +30,9 @@
   if(CYGWIN)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
   endif()
+  if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
+  endif()
 elseif(CPU_TYPE STREQUAL "i386")
   if(BORLAND)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
@@ -205,64 +208,76 @@
 
 
 ###############################################################################
-# Arm (GAS)
+# Arm (Intrinsics or GAS)
 ###############################################################################
 
 elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
 
-enable_language(ASM)
+include(CheckSymbolExists)
+if(BITS EQUAL 32)
+  set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
+endif()
+check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
+check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
+check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+  unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
 
-set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
-
-string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
-set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
-message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
-
-# Test whether we need gas-preprocessor.pl
-if(CPU_TYPE STREQUAL "arm")
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
-    .text
-    .fpu neon
-    .arch armv7a
-    .object_arch armv4
-    .arm
-    pld [r0]
-    vmovn.u16 d0, q0")
+# GCC (as of this writing) and some older versions of Clang do not have a full
+# or optimal set of Neon intrinsics, so for performance reasons, when using
+# those compilers, we default to using the older GAS implementation of the Neon
+# SIMD extensions for certain algorithms.  The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this.  We always default
+# to using the full Neon intrinsics implementation when building for macOS or
+# iOS, to avoid the need for gas-preprocessor.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
+  set(DEFAULT_NEON_INTRINSICS 1)
 else()
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S "
-    .text
-    MYVAR .req x0
-    movi v0.16b, #100
-    mov MYVAR, #100
-    .unreq MYVAR")
+  set(DEFAULT_NEON_INTRINSICS 0)
+endif()
+option(NEON_INTRINSICS
+  "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+  ${DEFAULT_NEON_INTRINSICS})
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+  add_definitions(-DNEON_INTRINSICS)
+  message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+else()
+  message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
 endif()
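
As a standalone illustration of what the check_symbol_exists() probes above detect (not part of the patch), the following translation unit compiles only where the compiler's arm_neon.h provides vld1_u16_x2(); on 32-bit Arm it would be built with -mfpu=neon, mirroring the CMAKE_REQUIRED_FLAGS setting above:

/* Hypothetical standalone probe; succeeds only if arm_neon.h provides
 * the vld1_u16_x2() intrinsic that the CMake check tests for. */
#include <arm_neon.h>

int main(void)
{
  uint16_t buf[8] = { 0 };
  uint16x4x2_t v = vld1_u16_x2(buf);   /* two d-registers in one load */
  return (int)vget_lane_u16(v.val[0], 0);
}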
 
-separate_arguments(CMAKE_ASM_FLAGS_SEP UNIX_COMMAND "${CMAKE_ASM_FLAGS}")
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+  arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
+  arm/jidctred-neon.c arm/jquanti-neon.c)
+if(NEON_INTRINSICS)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 64)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 32)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
+    arm/jdcolor-neon.c arm/jfdctint-neon.c)
+endif()
+if(BITS EQUAL 32)
+  set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
+endif()
+if(NOT NEON_INTRINSICS)
+  enable_language(ASM)
 
-execute_process(COMMAND ${CMAKE_ASM_COMPILER} ${CMAKE_ASM_FLAGS_SEP}
-    -x assembler-with-cpp -c ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
-  RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
-if(NOT RESULT EQUAL 0)
-  message(STATUS "GAS appears to be broken.  Trying gas-preprocessor.pl ...")
-  execute_process(COMMAND gas-preprocessor.pl ${CMAKE_ASM_COMPILER}
-      ${CMAKE_ASM_FLAGS_SEP} -x assembler-with-cpp -c
-      ${CMAKE_CURRENT_BINARY_DIR}/gastest.S
-    RESULT_VARIABLE RESULT OUTPUT_VARIABLE OUTPUT ERROR_VARIABLE ERROR)
-  if(NOT RESULT EQUAL 0)
-    simd_fail("SIMD extensions disabled: GAS is not working properly")
-    return()
-  else()
-    message(STATUS "Using gas-preprocessor.pl")
-    configure_file(gas-preprocessor.in gas-preprocessor @ONLY)
-    set(CMAKE_ASM_COMPILER ${CMAKE_CURRENT_BINARY_DIR}/gas-preprocessor)
-  endif()
-else()
-  message(STATUS "GAS is working properly")
+  set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
+
+  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+  set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+  message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
 endif()
 
-file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)
-
-add_library(simd OBJECT ${CPU_TYPE}/jsimd_neon.S ${CPU_TYPE}/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
@@ -311,14 +326,35 @@
 endif()
 
 ###############################################################################
-# Loongson (Intrinsics)
+# MIPS64 (Intrinsics)
 ###############################################################################
 
-elseif(CPU_TYPE STREQUAL "loongson")
+elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "mips64*")
 
-set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
-  loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
-  loongson/jidctint-mmi.c loongson/jquanti-mmi.c)
+set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
+
+check_c_source_compiles("
+  int main(void) {
+    int c = 0, a = 0, b = 0;
+    asm (
+      \"paddb %0, %1, %2\"
+      : \"=f\" (c)
+      : \"f\" (a), \"f\" (b)
+    );
+    return c;
+  }" HAVE_MMI)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_MMI)
+  simd_fail("SIMD extensions not available for this CPU")
+  return()
+endif()
+
+set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
+  mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
+  mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
+  mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
 
 if(CMAKE_COMPILER_IS_GNUCC)
   foreach(file ${SIMD_SOURCES})
@@ -326,8 +362,12 @@
       " -fno-strict-aliasing")
   endforeach()
 endif()
+foreach(file ${SIMD_SOURCES})
+  set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+    " -Wa,-mloongson-mmi,-mloongson-ext")
+endforeach()
 
-add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
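
For reference, the feature test that check_c_source_compiles() builds above is this exact program; it compiles only when the assembler accepts the Loongson MMI paddb instruction, which is why -Wa,-mloongson-mmi,-mloongson-ext is passed both to the probe (via CMAKE_REQUIRED_FLAGS) and to the SIMD sources themselves:

/* Extracted from the check_c_source_compiles() call above; compiles only
 * where the toolchain supports Loongson MMI (the "f" constraints target
 * the floating-point/MMI register file). */
int main(void)
{
  int c = 0, a = 0, b = 0;
  asm (
    "paddb %0, %1, %2"
    : "=f" (c)
    : "f" (a), "f" (b)
  );
  return c;
}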
diff --git a/simd/arm/aarch32/jccolext-neon.c b/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000..96b44d8
--- /dev/null
+++ b/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,147 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+
+  /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Increment pointers. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
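
To make the fixed-point scheme in the header comment concrete, here is a scalar model (an illustration, not part of the patch) of what the vector code computes for a single pixel, including the rounding right shift of vrshrn_n_u32() for Y and the truncating vshrn_n_u32() plus the pre-added 32767 (from scaled_128_5) for Cb and Cr:

/* Scalar model of the fixed-point math above (illustration only); the
 * Neon code computes the same values eight pixels at a time. */
#include <stdint.h>

static void rgb_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
                          uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  /* Constants are the 2^-16-scaled values listed in the comment above. */
  uint32_t y_tmp  = 19595 * r + 38470 * g + 7471 * b;
  /* (128 << 16) recenters Cb/Cr; the extra 32767 makes the truncating
   * shift below round, matching scaled_128_5 and vshrn_n_u32(). */
  uint32_t cb_tmp = (128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b;
  uint32_t cr_tmp = (128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b;

  *y  = (uint8_t)((y_tmp + 32768) >> 16);  /* rounding shift (vrshrn) */
  *cb = (uint8_t)(cb_tmp >> 16);           /* truncating shift (vshrn) */
  *cr = (uint8_t)(cr_tmp >> 16);           /* truncating shift (vshrn) */
}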
diff --git a/simd/arm/aarch32/jchuff-neon.c b/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000..941c9b2
--- /dev/null
+++ b/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,332 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  const uint8x8_t bitmap_mask =
+    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to 32-bit scalar registers. */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = __builtin_clz(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = __builtin_clz(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
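
Per coefficient, the vector code above boils down to the scalar computation sketched below (illustrative only): nbits is 16 minus the leading-zero count of |v|, and the "diff" bits are |v| XORed with a sign-dependent mask, which for negative v equals the low nbits of v - 1, as magnitude coding requires.  The 64-bit bitmap assembled with vpadd_u8() then lets the two encoding loops skip each run of zero coefficients with a single __builtin_clz() call.

/* Scalar model (illustration only) of the per-coefficient values the
 * vector code stores into block_nbits[] and block_diff[].  JPEG
 * magnitude coding (ITU-T T.81, F.1.2.2.1) emits a Huffman symbol for
 * nbits = number of bits in |v|, followed by the low nbits of v
 * (v >= 0) or of v - 1 (v < 0). */
#include <stdint.h>

static void coef_nbits_diff(int16_t v, uint8_t *nbits, uint16_t *diff)
{
  uint16_t absv = (uint16_t)(v < 0 ? -v : v);
  /* Leading-zero count of |v| as a 16-bit value; vclzq_s16() computes
   * this in the vector code.  __builtin_clz() works on 32 bits, hence
   * the -16 adjustment.  |v| == 0 gives lz == 16 and thus nbits == 0. */
  unsigned int lz = absv ? (unsigned int)__builtin_clz(absv) - 16 : 16;
  *nbits = (uint8_t)(16 - lz);
  /* All-ones mask for negative v, confined to the low nbits (this is
   * the vshlq_u16(..., vnegq_s16(lz)) step above); XORing it with |v|
   * yields the low nbits of v - 1, as the bitstream requires. */
  uint16_t mask = (uint16_t)(v >> 15) >> lz;
  *diff = absv ^ mask;
}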
diff --git a/simd/arm/jsimd.c b/simd/arm/aarch32/jsimd.c
similarity index 68%
rename from simd/arm/jsimd.c
rename to simd/arm/aarch32/jsimd.c
index 709656c..fac55df 100644
--- a/simd/arm/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -17,12 +18,12 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -164,6 +165,19 @@
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -246,6 +260,37 @@
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -298,12 +343,38 @@
 GLOBAL(int)
 jsimd_can_h2v2_downsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_downsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -311,23 +382,50 @@
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -335,17 +433,32 @@
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -366,10 +479,30 @@
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
@@ -381,15 +514,46 @@
                                  output_data_ptr);
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -397,12 +561,74 @@
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -448,6 +674,17 @@
 GLOBAL(int)
 jsimd_can_fdct_islow(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -477,6 +714,7 @@
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  jsimd_fdct_islow_neon(data);
 }
 
 GLOBAL(void)
@@ -696,6 +934,16 @@
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -704,11 +952,23 @@
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -717,5 +977,7 @@
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
 }
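
The colorspace switch statements above follow a simple select-once, call-through-pointer pattern.  A self-contained analogue (with made-up kernel names, not libjpeg-turbo API) looks like this:

/* Self-contained analogue of the colorspace dispatch pattern used in
 * jsimd_rgb_gray_convert() above: pick a format-specific kernel once,
 * then call it through a function pointer.  Kernel names are invented
 * for illustration. */
#include <stdio.h>

typedef void (*convert_fn)(const unsigned char *in, unsigned char *out,
                           int npixels);

static void convert_rgb(const unsigned char *in, unsigned char *out,
                        int npixels)
{
  for (int i = 0; i < npixels; i++)   /* 3 bytes per pixel, R first */
    out[i] = (unsigned char)((19595 * in[3 * i] + 38470 * in[3 * i + 1] +
                              7471 * in[3 * i + 2] + 32768) >> 16);
}

static void convert_bgr(const unsigned char *in, unsigned char *out,
                        int npixels)
{
  for (int i = 0; i < npixels; i++)   /* 3 bytes per pixel, B first */
    out[i] = (unsigned char)((19595 * in[3 * i + 2] + 38470 * in[3 * i + 1] +
                              7471 * in[3 * i] + 32768) >> 16);
}

int main(void)
{
  const unsigned char pixel[3] = { 255, 0, 0 };  /* pure red */
  unsigned char gray;
  int bgr = 0;  /* stand-in for cinfo->in_color_space */

  convert_fn fn = bgr ? convert_bgr : convert_rgb;
  fn(pixel, &gray, 1);
  printf("gray = %u\n", gray);  /* 76, i.e. 0.299 * 255, for pure red */
  return 0;
}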
diff --git a/simd/arm/aarch32/jsimd_neon.S b/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000..7e1e2b1
--- /dev/null
+++ b/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+
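
The FIX_* values above are the usual libjpeg 13-bit fixed-point encodings, FIX_x = round(x * 2^13), consistent with the << 13 shifts in the REF_1D_IDCT macro (CONST_BITS == 13 is an assumption of this aside, not stated in the patch).  A quick scalar check:

/* Illustration only: reproduce the FIX_* constants above from their
 * floating-point counterparts using 13-bit fixed-point scaling. */
#include <math.h>
#include <stdio.h>

int main(void)
{
  printf("%ld\n", lround(0.298631336 * (1 << 13)));  /* 2446  */
  printf("%ld\n", lround(0.541196100 * (1 << 13)));  /* 4433  */
  printf("%ld\n", lround(1.175875602 * (1 << 13)));  /* 9633  */
  printf("%ld\n", lround(3.072711026 * (1 << 13)));  /* 25172 */
  return 0;
}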
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}                    /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for zero coefficients in the right 4x8 half */
+      push            {r4, r5}
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to the special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}                    /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
+        /* Store results to the output buffer */
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d16}, [TMP1]
+      vtrn.8          d20, d21
+        vst1.8          {d17}, [TMP2]
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d18}, [TMP1]
+      vadd.u8         q10, q10, q0
+        vst1.8          {d19}, [TMP2]
+        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        add             TMP3, TMP3, OUTPUT_COL
+        add             TMP4, TMP4, OUTPUT_COL
+      vtrn.8          d22, d23
+        vst1.8          {d20}, [TMP1]
+      vadd.u8         q11, q11, q0
+        vst1.8          {d21}, [TMP2]
+        vst1.8          {d22}, [TMP3]
+        vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half */
+    vdup.s16        ROW1R, ROW0R[1]
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, but less accurate, integer implementation
+ * of the inverse DCT (Discrete Cosine Transform).  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * 'jpeg_idct_ifast' function from jidctfst.c.
+ *
+ * A 1-D AAN IDCT normally needs 5 multiplications and 29 additions, but in
+ * the Arm Neon case some extra additions are required, because the VQDMULH
+ * instruction can't handle constants larger than 1.  Expressions like
+ * "x * 1.082392200" therefore have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition.  Overall, there are 6 extra additions
+ * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
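+
+/* As a sketch of that conversion (illustration only; mul_1_082392200 is a
+ * hypothetical helper, not part of this file): VQDMULH.S16 computes
+ * approximately (x * c * 2) >> 16, and XFIX_1_082392200 below encodes
+ * 277 * 128 - 256 * 128 = 2688, i.e. roughly 0.082392200 in that doubled
+ * Q15 form.  So, ignoring saturation:
+ *
+ *   static int16_t mul_1_082392200(int16_t x)
+ *   {
+ *     int16_t frac = (int16_t)(((int32_t)x * 2688) >> 15);
+ *     return (int16_t)(x + frac);  // x * 1.082392200 ~= x + x * 0.0824
+ *   }
+ */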
+
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
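+  /* Each constant is stored minus 1.0 (or minus 2.0 for XFIX_2_613125930)
+   * so that it fits the signed range required by VQDMULH; the missing 1x
+   * (or 2x, via "vadd.s16 q3, q1, q1") term is added back with plain VADD
+   * instructions in the IDCT passes below.
+   */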
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_ifast_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0}, [ip, :64]  /* load constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d13}       /* save Neon registers */
+    /* 1-D IDCT, pass 1 */
+    vsub.s16        q2, q10, q14
+    vadd.s16        q14, q10, q14
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vadd.s16        q10, q10, q2
+      /* Transpose */
+      vtrn.16         q8, q9
+    vsub.s16        q11, q12, q1
+      vtrn.16         q14, q15
+    vadd.s16        q12, q12, q1
+      vtrn.16         q10, q11
+      vtrn.16         q12, q13
+      vtrn.32         q9, q11
+      vtrn.32         q12, q14
+      vtrn.32         q8, q10
+      vtrn.32         q13, q15
+      vswp            d28, d21
+      vswp            d26, d19
+    /* 1-D IDCT, pass 2 */
+    vsub.s16        q2, q10, q14
+      vswp            d30, d23
+    vadd.s16        q14, q10, q14
+      vswp            d24, d17
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vpop            {d8 - d13}    /* restore Neon registers */
+    vadd.s16        q10, q10, q2
+    vsub.s16        q11, q12, q1
+    vadd.s16        q12, q12, q1
+    /* Descale to 8-bit and range limit */
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
+    vqshrn.s16      d18, q10, #5
+    vqshrn.s16      d19, q11, #5
+    vqshrn.s16      d20, q12, #5
+    vqshrn.s16      d21, q13, #5
+    vqshrn.s16      d22, q14, #5
+    vqshrn.s16      d23, q15, #5
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vadd.u8         q10, q10, q0
+    vadd.u8         q11, q11, q0
+    /* Transpose the final 8-bit samples */
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+      /* Store results to the output buffer */
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d16}, [TMP1]
+      vst1.8          {d17}, [TMP2]
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d18}, [TMP1]
+    vtrn.8          d20, d21
+      vst1.8          {d19}, [TMP2]
+      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      add             TMP3, TMP3, OUTPUT_COL
+      add             TMP4, TMP4, OUTPUT_COL
+      vst1.8          {d20}, [TMP1]
+    vtrn.8          d22, d23
+      vst1.8          {d21}, [TMP2]
+      vst1.8          {d22}, [TMP3]
+      vst1.8          {d23}, [TMP4]
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
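+/* Store \size final Y, Cb, and Cr samples from d20, d21, and d22 to the Y,
+ * U, and V output pointers, post-incrementing each pointer.  The 4-, 2-,
+ * and 1-sample variants store successive lanes, so a leftover of 1-7
+ * columns is handled by combining them.
+ */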
+.macro do_store size
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
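+/* Load \size RGB (24 bpp) or RGBX (32 bpp) pixels into d10, d11, and d12
+ * (plus d13 for 32 bpp), deinterleaving the color channels.  As with
+ * do_store, the 4-, 2-, and 1-pixel variants fill successive lanes.
+ */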
+.macro do_load bpp, size
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
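+/* Stage 1 widens the RGB samples and accumulates the weighted Y/Cb/Cr sums;
+ * stage 2 descales the sums and narrows them to 8-bit samples.  In the
+ * inner loop, do_rgb_to_yuv_stage2_store_load_stage1 overlaps stage 2 and
+ * the stores for one group of 8 pixels with the loads and stage 1 for the
+ * next group, so memory accesses can execute alongside the arithmetic.
+ */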
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
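+  /* The first two rows are the 2^-16-scaled conversion weights (e.g.
+   * 19595 = 0.29900 * 2^16).  The last two rows pack the combined +128
+   * offset and 0.5 rounding term for Cb/Cr, (128 << 16) + 32767 in each
+   * 32-bit lane, which is loaded into q1 and used to initialize the chroma
+   * accumulators (q9, q13, q14, q15).
+   */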
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_ROW      .req r3
+    NUM_ROWS        .req r4
+
+    OUTPUT_BUF0     .req r5
+    OUTPUT_BUF1     .req r6
+    OUTPUT_BUF2     .req OUTPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d0, d1, d2, d3 */
+    adr             ip, jsimd_\colorid\()_ycc_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save Arm registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    ldr             NUM_ROWS, [sp, #(4 * 8)]
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    vpush           {d8 - d15}
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #4
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    beq             8f
+3:
+    tst             N, #4
+    beq             3f
+    do_load         \bpp, 4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         \bpp, 2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         \bpp, 1
+5:
+    do_rgb_to_yuv
+    tst             N, #4
+    beq             6f
+    do_store        4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    vpop            {d8 - d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
diff --git a/simd/arm/aarch64/jccolext-neon.c b/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000..756aeda
--- /dev/null
+++ b/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c.
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds the result up or down via integer truncation.
+ */
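+
+/* For a single pixel, the arithmetic reduces to the following scalar sketch
+ * (hypothetical helpers, for illustration only; the code below vectorizes
+ * this 8 or 16 pixels at a time):
+ *
+ *   static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b)
+ *   {
+ *     uint32_t y = 19595 * r + 38470 * g + 7471 * b;
+ *     return (uint8_t)((y + 32768) >> 16);  // rounding right shift
+ *   }
+ *
+ *   static uint8_t rgb_to_cb(uint8_t r, uint8_t g, uint8_t b)
+ *   {
+ *     uint32_t cb = (128 << 16) + 32767;  // 128.5 * 2^16 - 1
+ *     cb += 32768 * b;
+ *     cb -= 11059 * r + 21709 * g;
+ *     return (uint8_t)(cb >> 16);  // truncation now rounds to nearest
+ *   }
+ */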
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+
+  /* Set up conversion constants. */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
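+  /* (128 << 16) + 32767 = 128.5 * 2^16 - 1: the +128 chroma offset combined
+   * with the fixed-point equivalent of 0.5, so that the truncating shift
+   * used to descale Cb and Cr effectively rounds (see above).
+   */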
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000..808fa95
--- /dev/null
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,398 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
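+
+/* Each entry above is a byte index into a 64-byte lookup window
+ * (coefficients are 16-bit, hence the index pairs).  Entries of 255 are
+ * deliberately out of range: the underlying TBL instruction writes 0 to
+ * such lanes, and those coefficient lanes are filled in afterwards with
+ * vsetq_lane_s16() (see "Initialize AC coefficient lanes not reachable by
+ * lookup tables" below).
+ */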
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support the vld1q_<type>_x4() intrinsics. */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  /* For negative coefficients: diff = coeff - 1 = ~abs(coeff) */
+  uint16x8_t row0_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15)));
+  uint16x8_t row1_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15)));
+  uint16x8_t row2_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15)));
+  uint16x8_t row3_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15)));
+  uint16x8_t row4_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15)));
+  uint16x8_t row5_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15)));
+  uint16x8_t row6_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15)));
+  uint16x8_t row7_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15)));
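+  /* Example: for coeff = -3, abs(coeff) = 0x0003 and the sign mask is 0xFFFF,
+   * so diff = 0x0003 ^ 0xFFFF = 0xFFFC = coeff - 1.  Its low nbits (= 2)
+   * bits, 0b00, are the extra bits emitted for -3.
+   */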
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
+                                               vdupq_n_u16(0)));
+
+  const uint8x8_t bitmap_mask =
+    { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+
+  abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
+  abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
+  abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask);
+  abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask);
+  abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask);
+  abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask);
+  abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask);
+  abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
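+  /* The bitmap is assembled so that zig-zag index 0 occupies the most
+   * significant bit and index 63 the least significant bit.  After the DC bit
+   * is shifted out, counting leading zeros therefore yields the run length of
+   * zero AC coefficients directly.
+   */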
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* Find nbits required to specify sign and amplitude of coefficient. */
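+  /* Inline assembly is used here because the Arm CLZ instruction is
+   * well-defined for a zero input (it returns 32), whereas __builtin_clz(0)
+   * invokes undefined behavior, and a DC difference of 0 must yield
+   * nbits = 0.
+   */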
+  unsigned int lz;
+  __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+  unsigned int nbits = 32 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (more
+   * than 8 non-zero AC coefficients), it is beneficial to compute nbits and
+   * diff for the whole block up front using Neon; otherwise they are computed
+   * on demand using scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t row0_lz = vclzq_s16(abs_row0);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+    uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+    uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+    uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+    uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+    uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+    uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+    uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+    /* Store nbits. */
+    vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+    vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+    vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+    vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+    vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+    vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+    vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+    vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    row0_diff = vshlq_u16(row0_diff, row0_lz);
+    row1_diff = vshlq_u16(row1_diff, row1_lz);
+    row2_diff = vshlq_u16(row2_diff, row2_lz);
+    row3_diff = vshlq_u16(row3_diff, row3_lz);
+    row4_diff = vshlq_u16(row4_diff, row4_lz);
+    row5_diff = vshlq_u16(row5_diff, row5_lz);
+    row6_diff = vshlq_u16(row6_diff, row6_lz);
+    row7_diff = vshlq_u16(row7_diff, row7_lz);
+    row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz));
+    row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz));
+    row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz));
+    row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz));
+    row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz));
+    row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz));
+    row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz));
+    row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz));
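+    /* vshlq_u16() shifts right when given a negative shift count, so the two
+     * sets of shifts above (left by lz, then right by lz) zero every bit
+     * above the nbits significant diff bits.
+     */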
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = __builtin_clzl(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Store absolute value of coefficients. */
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = __builtin_clzl(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = __builtin_clz(block_abs[i]);
+      nbits = 32 - lz;
+      diff = (unsigned int)(block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/simd/arm64/jsimd.c b/simd/arm/aarch64/jsimd.c
similarity index 72%
rename from simd/arm64/jsimd.c
rename to simd/arm/aarch64/jsimd.c
index 808c0e3..8570b82 100644
--- a/simd/arm64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -3,8 +3,9 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -16,12 +17,13 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
+#include "jconfigint.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -189,6 +191,19 @@
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -237,20 +252,28 @@
 
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
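+    /* NEON_INTRINSICS builds provide only the fast LD3 code path, so the
+     * JSIMD_FASTLD3 runtime check is compiled out.
+     */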
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
     break;
   case JCS_EXT_RGBX:
   case JCS_EXT_RGBA:
     neonfct = jsimd_extrgbx_ycc_convert_neon;
     break;
   case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
     break;
   case JCS_EXT_BGRX:
   case JCS_EXT_BGRA:
@@ -265,10 +288,14 @@
     neonfct = jsimd_extxrgb_ycc_convert_neon;
     break;
   default:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
     break;
   }
 
@@ -280,6 +307,37 @@
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -291,20 +349,28 @@
 
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
     break;
   case JCS_EXT_RGBX:
   case JCS_EXT_RGBA:
     neonfct = jsimd_ycc_extrgbx_convert_neon;
     break;
   case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
     break;
   case JCS_EXT_BGRX:
   case JCS_EXT_BGRA:
@@ -319,10 +385,14 @@
     neonfct = jsimd_ycc_extxrgb_convert_neon;
     break;
   default:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
     break;
   }
 
@@ -397,12 +467,33 @@
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -410,23 +501,66 @@
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -434,23 +568,60 @@
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -458,12 +629,74 @@
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+    break;
+  default:
+    neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+    break;
+  default:
+    neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -762,17 +995,33 @@
                             int last_dc_val, c_derived_tbl *dctbl,
                             c_derived_tbl *actbl)
 {
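+  /* NEON_INTRINSICS builds provide only the fast table-lookup code path, so
+   * the JSIMD_FASTTBL runtime check is compiled out.
+   */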
+#ifndef NEON_INTRINSICS
   if (simd_features & JSIMD_FASTTBL)
+#endif
     return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
                                             dctbl, actbl);
+#ifndef NEON_INTRINSICS
   else
     return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
                                                     last_dc_val, dctbl, actbl);
+#endif
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
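+  /* The Neon implementation assembles the coefficient bitmaps in 64-bit
+   * size_t values, so it cannot be used when size_t is 32-bit.
+   */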
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -781,11 +1030,25 @@
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -794,5 +1057,7 @@
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
 }
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm/aarch64/jsimd_neon.S
similarity index 69%
rename from simd/arm64/jsimd_neon.S
rename to simd/arm/aarch64/jsimd_neon.S
index c13d0d3..31aa8e2 100644
--- a/simd/arm64/jsimd_neon.S
+++ b/simd/arm/aarch64/jsimd_neon.S
@@ -86,56 +86,6 @@
 #undef F_2_562
 #undef F_3_072
 
-/* Constants for jsimd_idct_ifast_neon() */
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
-/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* v0.h[0] */
-  .short -FIX_0_765366865       /* v0.h[1] */
-  .short -FIX_0_211164243       /* v0.h[2] */
-  .short FIX_1_451774981        /* v0.h[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* v2.h[0] */
-  .short FIX_2_562915447        /* v2.h[1] */
-  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
-  .short 0                      /* v2.h[3] */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
 /* Constants for jsimd_ycc_*_neon() */
 
 .balign 16
@@ -201,52 +151,6 @@
 #undef F_2_562
 #undef F_3_072
 
-/* Constants for jsimd_fdct_ifast_neon() */
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-/* Constants for jsimd_h2*_downsample_neon() */
-
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
 /* Constants for jsimd_huff_encode_one_block_neon() */
 
 .balign 16
@@ -278,11 +182,6 @@
     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
 
-.text
-
-
-#define RESPECT_STRICT_ALIGNMENT  1
-
 
 /*****************************************************************************/
 
@@ -313,45 +212,6 @@
 #endif
 .endm
 
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
-    ins             \xi\xilen[0], \x0\xilen[0]
-    ins             \x1\xilen[0], \x0\xilen[1]
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 different registers */
-.macro transpose x0, x1, xi, xilen, literal
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x2\x2len
-    trn2            \x2\x2len, \xi\x0len, \x2\x2len
-    mov             \xi\xilen, \x1\xilen
-    trn1            \x1\x1len, \x1\x1len, \x3\x3len
-    trn2            \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x1\x1len
-    trn2            \x1\x2len, \xi\x0len, \x1\x2len
-    mov             \xi\xilen, \x2\xilen
-    trn1            \x2\x2len, \x2\x2len, \x3\x3len
-    trn2            \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
-    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
-    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
-.endm
-
 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
     trn1            \t0\().8h, \l0\().8h, \l1\().8h
     trn1            \t1\().8h, \l2\().8h, \l3\().8h
@@ -984,619 +844,6 @@
 /*****************************************************************************/
 
 /*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c
- *
- * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in Arm Neon case some extra additions are required because VQDMULH
- * instruction can't handle the constants larger than 1. So the expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
-#define XFIX_1_082392200  v0.h[0]
-#define XFIX_1_414213562  v0.h[1]
-#define XFIX_1_847759065  v0.h[2]
-#define XFIX_2_613125930  v0.h[3]
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x9
-    TMP4            .req x10
-    TMP5            .req x11
-    TMP6            .req x12
-    TMP7            .req x13
-    TMP8            .req x14
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( v16.8h )
-     *   1 | d18     | d19     ( v17.8h )
-     *   2 | d20     | d21     ( v18.8h )
-     *   3 | d22     | d23     ( v19.8h )
-     *   4 | d24     | d25     ( v20.8h )
-     *   5 | d26     | d27     ( v21.8h )
-     *   6 | d28     | d29     ( v22.8h )
-     *   7 | d30     | d31     ( v23.8h )
-     */
-    /* Save Neon registers used in fast IDCT */
-    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
-    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
-    mul             v16.8h, v16.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v17.8h, v17.8h, v1.8h
-    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
-    mul             v18.8h, v18.8h, v2.8h
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    mul             v19.8h, v19.8h, v3.8h
-    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
-    mul             v20.8h, v20.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v22.8h, v22.8h, v2.8h
-    mul             v21.8h, v21.8h, v1.8h
-    ld1             {v0.4h}, [TMP5]        /* load constants */
-    mul             v23.8h, v23.8h, v3.8h
-
-    /* 1-D IDCT, pass 1 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
-    /* 1-D IDCT, pass 2 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    /* Descale to 8-bit and range limit */
-    movi            v0.16b, #0x80
-      /* Prepare pointers (dual-issue with Neon instructions) */
-      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    sqshrn          v28.8b, v16.8h, #5
-      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
-    sqshrn          v29.8b, v17.8h, #5
-      add             TMP1, TMP1, OUTPUT_COL
-    sqshrn          v30.8b, v18.8h, #5
-      add             TMP2, TMP2, OUTPUT_COL
-    sqshrn          v31.8b, v19.8h, #5
-      add             TMP3, TMP3, OUTPUT_COL
-    sqshrn2         v28.16b, v20.8h, #5
-      add             TMP4, TMP4, OUTPUT_COL
-    sqshrn2         v29.16b, v21.8h, #5
-      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
-    sqshrn2         v30.16b, v22.8h, #5
-      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
-    sqshrn2         v31.16b, v23.8h, #5
-      add             TMP5, TMP5, OUTPUT_COL
-    add             v16.16b, v28.16b, v0.16b
-      add             TMP6, TMP6, OUTPUT_COL
-    add             v18.16b, v29.16b, v0.16b
-      add             TMP7, TMP7, OUTPUT_COL
-    add             v20.16b, v30.16b, v0.16b
-      add             TMP8, TMP8, OUTPUT_COL
-    add             v22.16b, v31.16b, v0.16b
-
-    /* Transpose the final 8-bit samples */
-    trn1            v28.16b, v16.16b, v18.16b
-    trn1            v30.16b, v20.16b, v22.16b
-    trn2            v29.16b, v16.16b, v18.16b
-    trn2            v31.16b, v20.16b, v22.16b
-
-    trn1            v16.8h, v28.8h, v30.8h
-    trn2            v18.8h, v28.8h, v30.8h
-    trn1            v20.8h, v29.8h, v31.8h
-    trn2            v22.8h, v29.8h, v31.8h
-
-    uzp1            v28.4s, v16.4s, v18.4s
-    uzp2            v30.4s, v16.4s, v18.4s
-    uzp1            v29.4s, v20.4s, v22.4s
-    uzp2            v31.4s, v20.4s, v22.4s
-
-    /* Store results to the output buffer */
-    st1             {v28.d}[0], [TMP1]
-    st1             {v29.d}[0], [TMP2]
-    st1             {v28.d}[1], [TMP3]
-    st1             {v29.d}[1], [TMP4]
-    st1             {v30.d}[0], [TMP5]
-    st1             {v31.d}[0], [TMP6]
-    st1             {v30.d}[1], [TMP7]
-    st1             {v31.d}[1], [TMP8]
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4, v2.h[2]
-    smlal           v28.4s, \x8, v0.h[0]
-    smlal           v28.4s, \x14, v0.h[1]
-
-    smull           v26.4s, \x16, v1.h[2]
-    smlal           v26.4s, \x12, v1.h[3]
-    smlal           v26.4s, \x10, v2.h[0]
-    smlal           v26.4s, \x6, v2.h[1]
-
-    smull           v30.4s, \x4, v2.h[2]
-    smlsl           v30.4s, \x8, v0.h[0]
-    smlsl           v30.4s, \x14, v0.h[1]
-
-    smull           v24.4s, \x16, v0.h[2]
-    smlal           v24.4s, \x12, v0.h[3]
-    smlal           v24.4s, \x10, v1.h[0]
-    smlal           v24.4s, \x6, v1.h[1]
-
-    add             v20.4s, v28.4s, v26.4s
-    sub             v28.4s, v28.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y29, v28.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y29, v28.4s, #\shift
-  .endif
-
-    add             v20.4s, v30.4s, v24.4s
-    sub             v30.4s, v30.4s, v24.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27, v20.4s
-    xtn             \y28, v30.4s
-  .else
-    rshrn           \y27, v20.4s, #\shift
-    rshrn           \y28, v30.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Save all used Neon registers */
-    sub             sp, sp, 64
-    mov             x9, sp
-    /* Load constants (v3.4h is just used for padding) */
-    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | v8.4h   | v9.4h
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | v14.4h  | v15.4h
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
-    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
-    mul             v8.4h, v8.4h, v22.4h
-    mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v14.4h, v14.4h, v28.4h
-    mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
-
-    /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
-                    v4.4h, v6.4h, v8.4h, v10.4h
-    transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
-                    v5.4h, v7.4h, v9.4h, v11.4h
-    transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.d[1], v11.d[0]
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
-                    v26.4h, v27.4h, v28.4h, v29.4h
-    transpose_4x4   v26, v27, v28, v29, v3
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    ins             v28.d[1], v29.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    add             v28.8h, v28.8h, v30.8h
-    sqxtun          v26.8b, v26.8h
-    sqxtun          v27.8b, v28.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    ldp             TMP3, TMP4, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    st1             {v26.s}[0], [TMP1], 4
-    st1             {v27.s}[0], [TMP3], 4
-    st1             {v26.s}[1], [TMP2], 4
-    st1             {v27.s}[1], [TMP4], 4
-#else
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[0], [TMP3], 1
-    st1             {v26.b}[1], [TMP1], 1
-    st1             {v27.b}[1], [TMP3], 1
-    st1             {v26.b}[2], [TMP1], 1
-    st1             {v27.b}[2], [TMP3], 1
-    st1             {v26.b}[3], [TMP1], 1
-    st1             {v27.b}[3], [TMP3], 1
-
-    st1             {v26.b}[4], [TMP2], 1
-    st1             {v27.b}[4], [TMP4], 1
-    st1             {v26.b}[5], [TMP2], 1
-    st1             {v27.b}[5], [TMP4], 1
-    st1             {v26.b}[6], [TMP2], 1
-    st1             {v27.b}[6], [TMP4], 1
-    st1             {v26.b}[7], [TMP2], 1
-    st1             {v27.b}[7], [TMP4], 1
-#endif
-
-    /* vpop            {v8.4h - v15.4h}    (not available) */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll           v15.4s, \x4, #15
-    smull           v26.4s, \x6, v14.h[3]
-    smlal           v26.4s, \x10, v14.h[2]
-    smlal           v26.4s, \x12, v14.h[1]
-    smlal           v26.4s, \x16, v14.h[0]
-
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v15.4s, v15.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y27, v15.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y27, v15.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* vpush           {v8.4h - v15.4h}    (not available) */
-    sub             sp, sp, 64
-    mov             x9, sp
-
-    /* Load constants */
-    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v14.4h}, [TMP2]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | -       | -
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | -       | -
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* Dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]
-
-    /* Pass 1 */
-#if 0
-    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
-    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
-#else
-    smull           v26.4s, v6.4h, v14.h[3]
-    smlal           v26.4s, v10.4h, v14.h[2]
-    smlal           v26.4s, v12.4h, v14.h[1]
-    smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h, v14.h[3]
-    smlal           v24.4s, v11.4h, v14.h[2]
-    smlal           v24.4s, v13.4h, v14.h[1]
-    smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h, #15
-    sshll           v30.4s, v5.4h, #15
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h, v20.4s, #13
-    rshrn           v6.4h, v15.4s, #13
-    add             v20.4s, v30.4s, v24.4s
-    sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h, v20.4s, #13
-    rshrn           v7.4h, v15.4s, #13
-    ins             v4.d[1], v5.d[0]
-    ins             v6.d[1], v7.d[0]
-    transpose       v4, v6, v3, .16b, .8h
-    transpose       v6, v10, v3, .16b, .4s
-    ins             v11.d[0], v10.d[1]
-    ins             v7.d[0], v6.d[1]
-#endif
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    sqxtun          v30.8b, v26.8h
-    ins             v26.d[0], v30.d[0]
-    sqxtun          v27.8b, v26.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[4], [TMP1], 1
-    st1             {v26.b}[1], [TMP2], 1
-    st1             {v27.b}[5], [TMP2], 1
-
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
  * jsimd_ycc_extrgb_convert_neon
  * jsimd_ycc_extbgr_convert_neon
  * jsimd_ycc_extrgbx_convert_neon
@@ -2281,82 +1528,6 @@
 /*****************************************************************************/
 
 /*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: This routine could be combined with 'jsimd_fdct_ifast_neon' to
- *       eliminate the intermediate 16-bit store (st1) instructions.
- */
-
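-/* A minimal scalar sketch of what this routine computes (illustrative only;
- * cf. the C version, convsamp(), in jcdctmgr.c):
- *
- *   for (row = 0; row < DCTSIZE; row++) {
- *     JSAMPROW elem = sample_data[row] + start_col;
- *     for (col = 0; col < DCTSIZE; col++)
- *       *workspace++ = (DCTELEM)(*elem++) - CENTERJSAMPLE;
- *   }
- *
- * The Neon version widens each byte and subtracts the bias of 128
- * (CENTERJSAMPLE) in a single 'usubl' instruction.
- */
-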
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req x0
-    START_COL       .req x1
-    WORKSPACE       .req x2
-    TMP1            .req x9
-    TMP2            .req x10
-    TMP3            .req x11
-    TMP4            .req x12
-    TMP5            .req x13
-    TMP6            .req x14
-    TMP7            .req x15
-    TMP8            .req x4
-    TMPDUP          .req w3
-
-    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x1 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x1, w1
-
-    mov             TMPDUP, #128
-    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
-    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
-    dup             v0.8b, TMPDUP
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
-    add             TMP5, TMP5, START_COL
-    add             TMP6, TMP6, START_COL
-    ld1             {v16.8b}, [TMP1]
-    add             TMP7, TMP7, START_COL
-    add             TMP8, TMP8, START_COL
-    ld1             {v17.8b}, [TMP2]
-    usubl           v16.8h, v16.8b, v0.8b
-    ld1             {v18.8b}, [TMP3]
-    usubl           v17.8h, v17.8b, v0.8b
-    ld1             {v19.8b}, [TMP4]
-    usubl           v18.8h, v18.8b, v0.8b
-    ld1             {v20.8b}, [TMP5]
-    usubl           v19.8h, v19.8b, v0.8b
-    ld1             {v21.8b}, [TMP6]
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
-    usubl           v20.8h, v20.8b, v0.8b
-    ld1             {v22.8b}, [TMP7]
-    usubl           v21.8h, v21.8b, v0.8b
-    ld1             {v23.8b}, [TMP8]
-    usubl           v22.8h, v22.8b, v0.8b
-    usubl           v23.8h, v23.8b, v0.8b
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
-    br              x30
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-    .unreq          TMPDUP
-
-/*****************************************************************************/
-
-/*
  * jsimd_fdct_islow_neon
  *
  * This file contains a slower but more accurate integer implementation of the
@@ -2655,360 +1826,6 @@
 /*****************************************************************************/
 
 /*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast but less accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform).  It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_fdct_ifast' function from jfdctfst.c.
- *
- * TODO: This function could be combined with 'jsimd_convsamp_neon' to
- *       eliminate a number of intermediate 16-bit load (ld1) instructions.
- */
-
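-/* A note on the fixed-point arithmetic below: each XFIX constant is stored
- * (to the precision of the constant table) as c * 2^15, so
- * 'sqdmulh vd, vn, XFIX_c', which computes (2 * vn * XFIX_c) >> 16 per lane,
- * approximates vn * c.  For example, 0.707106781 * 2^15 = 23170 (rounded).
- */
-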
-#undef XFIX_0_541196100
-#define XFIX_0_382683433  v0.h[0]
-#define XFIX_0_541196100  v0.h[1]
-#define XFIX_0_707106781  v0.h[2]
-#define XFIX_1_306562965  v0.h[3]
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req x0
-    TMP             .req x9
-
-    /* Load constants */
-    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
-    ld1             {v0.4h}, [TMP]
-
-    /* Load all of DATA into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 |      v16.8h      |
-     *   1 |      v17.8h      |
-     *   2 |      v18.8h      |
-     *   3 |      v19.8h      |
-     *   4 |      v20.8h      |
-     *   5 |      v21.8h      |
-     *   6 |      v22.8h      |
-     *   7 |      v23.8h      |
-     */
-
-    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    mov             TMP, #2
-    sub             DATA, DATA, #64
-1:
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
-    subs            TMP, TMP, #1
-    /* 1-D FDCT */
-    add             v4.8h, v19.8h, v20.8h
-    sub             v20.8h, v19.8h, v20.8h
-    sub             v28.8h, v18.8h, v21.8h
-    add             v18.8h, v18.8h, v21.8h
-    sub             v29.8h, v17.8h, v22.8h
-    add             v17.8h, v17.8h, v22.8h
-    sub             v21.8h, v16.8h, v23.8h
-    add             v16.8h, v16.8h, v23.8h
-    sub             v6.8h, v17.8h, v18.8h
-    sub             v7.8h, v16.8h, v4.8h
-    add             v5.8h, v17.8h, v18.8h
-    add             v6.8h, v6.8h, v7.8h
-    add             v4.8h, v16.8h, v4.8h
-    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
-    add             v19.8h, v20.8h, v28.8h
-    add             v16.8h, v4.8h, v5.8h
-    sub             v20.8h, v4.8h, v5.8h
-    add             v5.8h, v28.8h, v29.8h
-    add             v29.8h, v29.8h, v21.8h
-    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
-    sub             v28.8h, v19.8h, v29.8h
-    add             v18.8h, v7.8h, v6.8h
-    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
-    sub             v22.8h, v7.8h, v6.8h
-    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
-    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
-    add             v6.8h, v21.8h, v5.8h
-    sub             v5.8h, v21.8h, v5.8h
-    add             v29.8h, v29.8h, v28.8h
-    add             v19.8h, v19.8h, v28.8h
-    add             v29.8h, v29.8h, v7.8h
-    add             v21.8h, v5.8h, v19.8h
-    sub             v19.8h, v5.8h, v19.8h
-    add             v17.8h, v6.8h, v29.8h
-    sub             v23.8h, v6.8h, v29.8h
-
-    b.ne            1b
-
-    /* store results */
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
-    br              x30
-
-    .unreq          DATA
-    .unreq          TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- *                     DCTELEM *workspace);
- *
- */
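-
-/* A rough scalar equivalent of the reciprocal-based quantization performed
- * below (a sketch; see compute_reciprocal() in jcdctmgr.c for the layout of
- * the divisors table, whose 64-entry rows are reciprocal, correction, scale,
- * and shift):
- *
- *   DCTELEM x = workspace[i], sign = x >> 15;          (sign = 0 or -1)
- *   unsigned t = abs(x) + correction[i];
- *   t = (t * reciprocal[i]) >> 16;                     (u16 * u16 -> u32)
- *   t >>= shift[i];
- *   coef_block[i] = (JCOEF)((t ^ sign) - sign);        (restore the sign)
- */
-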
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req x0
-    DIVISORS        .req x1
-    WORKSPACE       .req x2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req x9
-    SHIFT           .req x10
-    LOOP_COUNT      .req x11
-
-    mov             LOOP_COUNT, #2
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-1:
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
-    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
-    abs             v20.8h, v0.8h
-    abs             v21.8h, v1.8h
-    abs             v22.8h, v2.8h
-    abs             v23.8h, v3.8h
-    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
-    add             v20.8h, v20.8h, v4.8h  /* add correction */
-    add             v21.8h, v21.8h, v5.8h
-    add             v22.8h, v22.8h, v6.8h
-    add             v23.8h, v23.8h, v7.8h
-    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
-    umull2          v16.4s, v20.8h, v28.8h
-    umull           v5.4s, v21.4h, v29.4h
-    umull2          v17.4s, v21.8h, v29.8h
-    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
-    umull2          v18.4s, v22.8h, v30.8h
-    umull           v7.4s, v23.4h, v31.4h
-    umull2          v19.4s, v23.8h, v31.8h
-    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
-    shrn            v4.4h, v4.4s, #16
-    shrn            v5.4h, v5.4s, #16
-    shrn            v6.4h, v6.4s, #16
-    shrn            v7.4h, v7.4s, #16
-    shrn2           v4.8h, v16.4s, #16
-    shrn2           v5.8h, v17.4s, #16
-    shrn2           v6.8h, v18.4s, #16
-    shrn2           v7.8h, v19.4s, #16
-    neg             v24.8h, v24.8h
-    neg             v25.8h, v25.8h
-    neg             v26.8h, v26.8h
-    neg             v27.8h, v27.8h
-    sshr            v0.8h, v0.8h, #15  /* extract sign */
-    sshr            v1.8h, v1.8h, #15
-    sshr            v2.8h, v2.8h, #15
-    sshr            v3.8h, v3.8h, #15
-    ushl            v4.8h, v4.8h, v24.8h  /* shift */
-    ushl            v5.8h, v5.8h, v25.8h
-    ushl            v6.8h, v6.8h, v26.8h
-    ushl            v7.8h, v7.8h, v27.8h
-
-    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
-    eor             v5.16b, v5.16b, v1.16b
-    eor             v6.16b, v6.16b, v2.16b
-    eor             v7.16b, v7.16b, v3.16b
-    sub             v4.8h, v4.8h, v0.8h
-    sub             v5.8h, v5.8h, v1.8h
-    sub             v6.8h, v6.8h, v2.8h
-    sub             v7.8h, v7.8h, v3.8h
-    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
-    b.ne            1b
-
-    br              x30  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- *                            JDIMENSION v_samp_factor,
- *                            JDIMENSION width_in_blocks,
- *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
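-/* Roughly equivalent scalar code, per output row (cf. h2v1_downsample() in
- * jcsample.c); the rounding bias alternates 0,1,0,1,... across output
- * columns:
- *
- *   bias = 0;
- *   for (i = 0; i < output_cols; i++) {
- *     out[i] = (in[2*i] + in[2*i+1] + bias) >> 1;
- *     bias ^= 1;
- *   }
- *
- * The Neon code below preloads the alternating bias into v16 and uses
- * 'uadalp' to accumulate horizontal byte pairs onto the biased 16-bit lanes.
- */
-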
-asm_function jsimd_h2v1_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR           .req x10
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #0x10000
-    lsl             TMP2, BLOCK_WIDTH, #4
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR]
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- *                            JDIMENSION v_samp_factor,
- *                            JDIMENSION width_in_blocks,
- *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
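-/* Roughly equivalent scalar code, per pair of input rows (cf.
- * h2v2_downsample() in jcsample.c); the rounding bias alternates 1,2,1,2,...
- * across output columns:
- *
- *   bias = 1;
- *   for (i = 0; i < output_cols; i++) {
- *     out[i] = (in0[2*i] + in0[2*i+1] + in1[2*i] + in1[2*i+1] + bias) >> 2;
- *     bias ^= 3;
- *   }
- */
-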
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR0          .req x10
-    INPTR1          .req x14
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #1
-    lsl             TMP2, BLOCK_WIDTH, #4
-    lsl             TMPDUP, TMPDUP, #17
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
-    orr             TMPDUP, TMPDUP, #1
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR0, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    ldr             INPTR1, [INPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    uadalp          v4.8h, v1.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    tbl             v3.16b, {v1.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    uadalp          v4.8h, v3.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR0
-    .unreq          INPTR1
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
-/*****************************************************************************/
-
-/*
  * GLOBAL(JOCTET *)
  * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
  *                             JCOEFPTR block, int last_dc_val,
diff --git a/simd/arm/align.h b/simd/arm/align.h
new file mode 100644
index 0000000..cff4241
--- /dev/null
+++ b/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
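+
+/* Example usage (as in the Neon colorspace conversion code):
+ *
+ *   ALIGN(16) static const uint16_t consts[8] = { ... };
+ */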
diff --git a/simd/arm/jccolor-neon.c b/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000..f18ed9e
--- /dev/null
+++ b/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+#define F_0_168  11059
+#define F_0_331  21709
+#define F_0_500  32768
+#define F_0_418  27439
+#define F_0_081  5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
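+
+/* These are the standard JPEG RGB -> YCbCr coefficients
+ * (Y = 0.29900 R + 0.58700 G + 0.11400 B, etc.) scaled by 2^16 and rounded,
+ * e.g. 19595 = round(0.29900 * 65536).  The conversion code multiplies by
+ * them and then descales with a rounding right shift by 16.
+ */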
+
+
+/* Include inline routines for colorspace extensions. */
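+/* jccolext-neon.c is compiled multiple times, once per pixel format: each
+ * pass #defines RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE (and renames the
+ * function via the jsimd_rgb_ycc_convert_neon macro) before re-including
+ * the same source, which instantiates a dedicated conversion routine for
+ * each colorspace extension.
+ */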
+
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/simd/arm/jcgray-neon.c b/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000..71c7b2d
--- /dev/null
+++ b/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/simd/arm/jcgryext-neon.c b/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000..b1f00e6
--- /dev/null
+++ b/simd/arm/jcgryext-neon.c
@@ -0,0 +1,105 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
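+
+/* In scalar form, the computation below is (a sketch, using the constants
+ * defined in jcgray-neon.c):
+ *
+ *   y = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
+ *
+ * where the +32768 and >>16 together implement a rounding descale, which
+ * the vector code performs with vrshrn_n_u32(..., 16).
+ */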
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {
+
+      /* To prevent the vector load instructions from reading beyond the end
+       * of the buffer, the last (image_width % 16) columns of data are first
+       * copied to a temporary buffer large enough to accommodate the vector
+       * load.
+       */
+      if (cols_remaining < 16) {
+        ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
+       * permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
diff --git a/simd/arm/jchuff.h b/simd/arm/jchuff.h
new file mode 100644
index 0000000..87ff0d3
--- /dev/null
+++ b/simd/arm/jchuff.h
@@ -0,0 +1,119 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__)
+#define BIT_BUF_SIZE  64
+#else
+#define BIT_BUF_SIZE  32
+#endif
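+
+/* put_buffer (below) is sized to fill one general-purpose register: 64 bits
+ * on AArch64, 32 bits on AArch32.
+ */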
+
+typedef struct {
+  size_t put_buffer;                    /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
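+
+/* To unpack the pointer arithmetic above: ((JOCTET)(b) < 0xFF) evaluates to
+ * 1 for ordinary bytes and to 0 when the byte is 0xFF, so the statement
+ * 'buffer -= -2 + (...)' advances the pointer by 1 in the ordinary case
+ * (the next call overwrites the speculative 0) and by 2 for 0xFF (keeping
+ * the 0x00 stuffing byte).
+ */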
+
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
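+/* The test below is a SWAR (SIMD-within-a-register) check for 0xFF bytes:
+ * within each byte x of put_buffer, (x & 0x80) is set only if x >= 0x80,
+ * and the corresponding bit of ~(put_buffer + 0x01...01) is set only if
+ * x + 1 did not reach 0x80.  Both hold simultaneously only for x == 0xFF.
+ * (A carry between bytes can only originate from a 0xFF byte, which forces
+ * the slow path anyway, so carries never cause an incorrect fast path.)
+ */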
+#if defined(__aarch64__)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    __asm__("rev %x0, %x1" : "=r"(put_buffer) : "r"(put_buffer)); \
+    *((uint64_t *)buffer) = put_buffer; \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    __asm__("rev %0, %1" : "=r"(put_buffer) : "r"(put_buffer)); \
+    *((uint32_t *)buffer) = put_buffer; \
+    buffer += 4; \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
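+
+/* free_bits counts the unoccupied bits in put_buffer.  While it stays
+ * non-negative, codes are simply shifted in from the bottom.  When it goes
+ * negative, PUT_AND_FLUSH() tops the buffer up with the high (size +
+ * free_bits) bits of code, flushes all BIT_BUF_SIZE bits, and restarts the
+ * buffer with the remaining low bits of code.
+ */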
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(diff, nbits) \
+}
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000..61f94c2
--- /dev/null
+++ b/simd/arm/jcphuff-neon.c
@@ -0,0 +1,588 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
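+
+/* On output, the first DCTSIZE2 entries of values hold the absolute
+ * coefficient values after the point transform (>> Al), and the second
+ * DCTSIZE2 entries hold the corresponding "diff" values (value ^ sign),
+ * i.e. the one's-complement form that the Huffman encoder emits for
+ * negative coefficients.
+ */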
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *values_ptr = values;
+  JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation, but for the remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
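+    /* Note: the cases in this switch intentionally fall through. */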
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
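+    /* Note: the cases in this switch intentionally fall through. */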
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs);
+    vst1q_s16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(values_ptr, vdupq_n_s16(0));
+    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
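+  /* Each vceqq/vmovn pair below yields 0xFF or 0x00 per coefficient; ANDing
+   * with bitmap_mask ({0x01, 0x02, ..., 0x80}) converts that to one bit per
+   * column, and three rounds of pairwise addition (vpadd) then OR the eight
+   * bytes of each row together, collapsing the 8x8 block into a single
+   * 64-bit bitmap, which is inverted so that 1 means "nonzero".
+   */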
+  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+  const uint8x8_t bitmap_mask =
+    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position.
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether the transformed coefficient values == 1 (used to find the
+     * EOB position).
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation, but for the remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
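+    /* Note: the cases in this switch intentionally fall through. */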
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether the transformed coefficient values == 1 (used to find the
+     * EOB position).
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
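+    /* Note: the cases in this switch intentionally fall through. */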
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether the transformed coefficient values == 1 (used to find the
+     * EOB position).
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  const uint8x8_t bitmap_mask =
+    { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
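+  /* 63 - CLZ(bitmap) computes the index of the most significant set bit,
+   * i.e. the position of the last coefficient whose transformed absolute
+   * value is 1.  For example, if only coefficient 5 is 1, then
+   * bitmap == 0x20 and 63 - CLZ(0x20) == 5.
+   */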
+#if defined(__aarch64__)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - __builtin_clzl(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - __builtin_clz(bitmap1);
+  } else {
+    return 31 - __builtin_clz(bitmap0);
+  }
+#endif
+}
diff --git a/simd/arm/jcsample-neon.c b/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000..e4e7827
--- /dev/null
+++ b/simd/arm/jcsample-neon.c
@@ -0,0 +1,190 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
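+/* Permutation table used (with the TBL instruction) to pad the last partial
+ * DCT block in a row.  Each 16-byte entry is a vector of byte indices: entry
+ * "Pad N" keeps the first 16 - N samples and replicates the last valid
+ * sample into the remaining N positions, so averaging at the right edge
+ * effectively duplicates the final pixel column.
+ */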
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
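+  /* Each output sample is computed as (s0 + s1 + bias) >> 1, where s0 and s1
+   * are horizontally adjacent input samples.  The alternating 0/1 bias
+   * mirrors the rounding of libjpeg's C downsampler: even output columns
+   * round down and odd columns round up, so rounding errors do not drift in
+   * one direction.  For example, inputs {10, 13, 10, 13} produce
+   * (10 + 13 + 0) >> 1 = 11 and (10 + 13 + 1) >> 1 = 12.
+   */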
+  const uint16x8_t bias = { 0, 1, 0, 1, 0, 1, 0, 1 };
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
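+  /* Each output sample is computed as (s00 + s01 + s10 + s11 + bias) >> 2,
+   * i.e. the average of a 2x2 block of input samples.  The bias alternates
+   * between 1 and 2 across output columns, mirroring the rounding of
+   * libjpeg's C downsampler.
+   */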
+  const uint16x8_t bias = { 1, 2, 1, 2, 1, 2, 1, 2 };
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[outrow];
+    inptr1 = input_data[outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/simd/arm/jdcolext-neon.c b/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000..ae440f4
--- /dev/null
+++ b/simd/arm/jdcolext-neon.c
@@ -0,0 +1,353 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
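+
+/* Fixed-point arithmetic notes:
+ *
+ * vqrdmulhq_lane_s16(a, b, lane) returns the rounded high half of the
+ * doubling multiply, approximately (a * b) * 2^-15.  The R-Y and B-Y
+ * constants are scaled by 2^14, so the Cb/Cr differences are first doubled
+ * with vshlq_n_s16(x, 1), giving (2 * x) * (c * 2^14) * 2^-15 = x * c.
+ * For example, for Cr = 200:  1.40200 * (200 - 128) = 100.9, and
+ * vqrdmulh(2 * 72, 22971) = (2 * 144 * 22971 + 2^15) >> 16 = 101.
+ *
+ * The G-Y terms require more precision, so they use full 32-bit multiplies
+ * (vmull/vmlsl) with 2^15-scaled constants and are descaled with
+ * vrshrn_n_s32.
+ */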
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
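+      /* vqshluq_n_s16(x, 8) clamps each component to [0, 255] and places its
+       * 8 bits at the top of the 16-bit lane.  vsriq_n_u16 (shift right and
+       * insert) then shifts the next component into position while keeping
+       * the bits already packed above it, leaving R in bits 15:11, G in bits
+       * 10:5, and B in bits 4:0 of each lane.
+       */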
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements. */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
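+      /* The switch cases below intentionally fall through: starting from the
+       * highest remaining column, each case stores a single pixel lane, so
+       * exactly cols_remaining pixels are written and the store never runs
+       * past output_width.
+       */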
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000..28dbc57
--- /dev/null
+++ b/simd/arm/jdcolor-neon.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
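+
+/* jdcolext-neon.c is compiled once per output pixel format.  Before each
+ * inclusion, the RGB_RED/RGB_GREEN/RGB_BLUE/RGB_ALPHA/RGB_PIXELSIZE macros
+ * are redefined to describe the component order and stride of that format,
+ * and jsimd_ycc_rgb_convert_neon is renamed to the format-specific symbol.
+ * The first inclusion below uses the library's default RGB definitions.
+ */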
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000..18fb9d8
--- /dev/null
+++ b/simd/arm/jdmerge-neon.c
@@ -0,0 +1,144 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
diff --git a/simd/arm/jdmrgext-neon.c b/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000..fa2ec05
--- /dev/null
+++ b/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,667 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
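+    /* For example, output pixels 2i and 2i + 1 share chroma sample i:
+     *    R[2i]     = Y[2i]     + 1.40200 * (Cr[i] - 128)
+     *    R[2i + 1] = Y[2i + 1] + 1.40200 * (Cr[i] - 128)
+     * i.e. each chroma sample is replicated across two horizontally adjacent
+     * luma samples (nearest-neighbor upsampling.)
+     */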
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
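+    /* Note: the cases below intentionally fall through, so one store is
+     * issued per remaining pixel.  Cases 15-9 store single pixels from the
+     * high half of each vector; case 8 stores the entire low half at once
+     * and breaks; cases 7-1 store single pixels from the low half.  The RGB
+     * path below uses the same pattern.
+     */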
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+    default:
+      break;
+    }
+#endif
+  }
+}
diff --git a/simd/arm/jdsample-neon.c b/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000..90ec678
--- /dev/null
+++ b/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
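+/* As a hypothetical worked example (sample values invented for
+ * illustration): if s0 = 100 and s1 = 140, then
+ *     p1(upsampled) = (3 * 100 + 140 + 2) >> 2 = 110
+ *     p2(upsampled) = (3 * 140 + 100 + 1) >> 2 = 130
+ * The +2 and +1 terms are the rounding and ordered dithering biases that the
+ * vector code below applies before right-shifting by 2.
+ */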
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;
+    }
+
+    /* Complete the last iteration of the loop. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
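+/* As a hypothetical worked example (sample values invented for
+ * illustration): if s0A = 16, s0B = 32, s1A = 48, and s1B = 64, then
+ *     p14(upsampled) = (9 * 64 + 3 * 48 + 3 * 32 + 16 + 8) >> 4 = 52
+ * The +8 term is the rounding bias applied (via vrshrn) to even-numbered
+ * output columns; odd-numbered columns instead get a +7 ordered dithering
+ * bias, as in the vector code below.
+ */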
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image).
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
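+/* The arithmetic is the same as in the h2v1 case above, applied vertically.
+ * As a hypothetical worked example: if sA = 100 and sB = 140, then
+ *     p1(upsampled) = (3 * 100 + 140 + 2) >> 2 = 110
+ *     p2(upsampled) = (3 * 140 + 100 + 1) >> 2 = 130
+ */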
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
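+/* For example, an input row beginning { 10, 20, 30 } becomes an output row
+ * beginning { 10, 10, 20, 20, 30, 30 } (values invented for illustration).
+ */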
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
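+/* For example, an input row beginning { 10, 20 } becomes { 10, 10, 20, 20 }
+ * in both rows of the corresponding output row pair (values invented for
+ * illustration).
+ */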
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
diff --git a/simd/arm/jfdctfst-neon.c b/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000..bb371be
--- /dev/null
+++ b/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast but less accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17792 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
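+/* The multiplications below use vqdmulhq_lane_s16(), which computes a
+ * truncated (2 * a * c) >> 16, i.e. a * c * 2^-15.  As a hypothetical worked
+ * example: multiplying an element value of 1000 by F_0_707 gives
+ * (2 * 1000 * 23168) >> 16 = 707, approximating 1000 * 0.707106781.
+ */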
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
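+  /* As an illustration (flat indices into data[]): vld4q_s16 de-interleaves
+   * with a stride of 4, so data1.val[0] holds elements
+   * { 0, 4, 8, 12, 16, 20, 24, 28 } and data2.val[0] holds
+   * { 32, 36, 40, 44, 48, 52, 56, 60 }.  vuzpq_s16 of that pair separates
+   * the even and odd positions, yielding { 0, 8, 16, ..., 56 } (column 0)
+   * and { 4, 12, 20, ..., 60 } (column 4).
+   */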
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/simd/arm/jfdctint-neon.c b/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000..ccfc07b
--- /dev/null
+++ b/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
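+/* The multiplications below use full-width vmull_lane_s16()/vmlal_lane_s16()
+ * products, descaled with a rounding narrowing shift.  As a hypothetical
+ * worked example: in pass 1, multiplying an element value of 100 by F_0_765
+ * gives vrshrn_n_s32(100 * 6270, DESCALE_P1) = (627000 + 1024) >> 11 = 306,
+ * approximating 100 * 0.765366865 scaled up by 2^PASS1_BITS (306.15).
+ */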
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/simd/arm/jidctfst-neon.c b/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000..a91be53
--- /dev/null
+++ b/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not-so-accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
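+/* Multiplication by these constants uses vqdmulh_lane_s16(), which computes
+ * (2 * a * b) >> 16, i.e. a * b with b treated as a Q15 fixed-point value.
+ * The integer parts of the original ifast constants are then reconstructed
+ * with one or two extra vector additions, e.g.:
+ *    1.414213562 * x ~= vqdmulh_lane_s16(x, consts, 1) + x
+ *    2.613125930 * x ~= vqdmulh_lane_s16(x, consts, 3) + x + x
+ */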
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
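+  /* Lane 0 of the 64-bit view of the bitmap covers the AC coefficients of
+   * columns 0-3; lane 1 covers columns 4-7.
+   */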
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
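+    /* (z10 is negated here so that the multiplication by the negative
+     * constant -2.613125930 below can instead use the positive Q15 constant
+     * F_0_613 plus two additions.)
+     */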
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8 (and remove the PASS1_BITS scaling),
+   * narrowing to 8-bit.
+   */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
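+  /* (The saturating narrow above clamps the results to [-128, 127]; adding
+   * CENTERJSAMPLE and reinterpreting as unsigned yields samples in the range
+   * [0, 255].)
+   */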
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store the output pixel block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000..043b652
--- /dev/null
+++ b/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
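+/* The table above is loaded as three int16x4_t vectors, so consts.val[i],
+ * lane j refers to table entry 4 * i + j.  Unlike the ifast IDCT, these Q13
+ * (CONST_BITS = 13) constants are used with widening multiplies (vmull/vmlal)
+ * so that full 32-bit intermediate precision is retained until the final
+ * descale.
+ */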
+
+
+/* Forward declarations of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_islow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
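+  /* Each workspace holds one transposed 4x8 half of the coefficient block (8
+   * rows of 4 16-bit values) between the passes: workspace_l feeds output
+   * rows 0-3 in the second pass, and workspace_r feeds output rows 4-7.
+   */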
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {
+    bitmap = vorr_s16(bitmap, row0);
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function, or some other optimized variant, needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_islow()) can
+ * be found in jidctint.c.  Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
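+  /* (The left shift by CONST_BITS aligns these terms with the Q13-scaled
+   * products computed above.)
+   */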
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
 * coefficients.  (To process the full 8x8 DCT block, this function, or some
+ * other optimized variant, needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values is all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT
+ * (jpeg_idct_islow()) can be found in jidctint.c.  Algorithmic changes made
+ * here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
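+  /* (vaddhn/vsubhn drop the low 16 bits of each 32-bit result, performing an
+   * implicit >> 16; the vqrshrn_n_s16() calls below shift by the remaining
+   * DESCALE_P2 - 16 bits.)
+   */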
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/simd/arm/jidctred-neon.c b/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000..be9627e
--- /dev/null
+++ b/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
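+
+/* Each constant above follows the same fixed-point rule and could be
+ * generated with a hypothetical helper like the following (an illustrative
+ * sketch only; the table below hard-codes the results):
+ *
+ *   static int16_t fix(double c) {
+ *     return (int16_t)(c * (1 << CONST_BITS) + 0.5);
+ *   }
+ *
+ *   fix(0.720959822) == 5906     fix(1.272758580) == 10426
+ *   fix(0.850430095) == 6967     fix(3.624509785) == 29692
+ */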
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
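+  /* (cols_1155 now holds column 1 duplicated across its low half and column 5
+   * duplicated across its high half; likewise cols_3377 holds columns 3 and
+   * 7.)
+   */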
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
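+  /* (Lanes 0 and 1 hold rows 0 and 1 of output column 0; lanes 4 and 5 hold
+   * rows 0 and 1 of output column 1, hence the lane indices below.)
+   */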
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
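+  /* (Reinterpreting the bitmap as two 64-bit lanes lets a single scalar
+   * compare test all four columns in each half of the block at once.)
+   */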
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* Some AC coefficients are non-zero in both halves; a full IDCT
+     * calculation is required.
+     */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/simd/arm/jquanti-neon.c b/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000..a7eb6f1
--- /dev/null
+++ b/simd/arm/jquanti-neon.c
@@ -0,0 +1,190 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
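+
+/* As an illustrative scalar sketch of the same operation (the actual scalar
+ * version is convsamp() in jcdctmgr.c):
+ *
+ *   for (int r = 0; r < DCTSIZE; r++)
+ *     for (int c = 0; c < DCTSIZE; c++)
+ *       workspace[r * DCTSIZE + c] =
+ *         (DCTELEM)sample_data[r][start_col + c] - CENTERJSAMPLE;
+ */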
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
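+
+/* Per coefficient, the vector code below is roughly equivalent to this
+ * illustrative scalar sketch (see quantize() in jcdctmgr.c for the actual
+ * scalar implementation):
+ *
+ *   temp = abs(coef) + corr;             // pre-bias so truncation rounds
+ *   temp = (temp * recip) >> 16;         // multiply by scaled reciprocal
+ *   temp >>= shift;                      // complete the division
+ *   coef = (coef < 0) ? -temp : temp;    // restore the original sign
+ */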
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals of quantization values. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
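+    /* (e.g., a stored shift value of 3 becomes a left shift by -3, which the
+     * hardware performs as a logical right shift by 3.)
+     */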
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign to original product. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
diff --git a/simd/arm/jsimd_neon.S b/simd/arm/jsimd_neon.S
deleted file mode 100644
index 30f9cc6..0000000
--- a/simd/arm/jsimd_neon.S
+++ /dev/null
@@ -1,2878 +0,0 @@
-/*
- * Armv7 Neon optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- *                          All Rights Reserved.
- * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT  1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
-    .private_extern _\fname
-    .globl _\fname
-_\fname:
-#else
-    .global \fname
-#ifdef __ELF__
-    .hidden \fname
-    .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16         \x0, \x1
-    vtrn.16         \x2, \x3
-    vtrn.32         \x0, \x2
-    vtrn.32         \x1, \x3
-.endm
-
-
-#define CENTERJSAMPLE  128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- *                       JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
-  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
-  JLONG   q1, q2, q3, q4, q5, q6, q7; \
-  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
-  \
-  /* 1-D iDCT input data */ \
-  row0 = xrow0; \
-  row1 = xrow1; \
-  row2 = xrow2; \
-  row3 = xrow3; \
-  row4 = xrow4; \
-  row5 = xrow5; \
-  row6 = xrow6; \
-  row7 = xrow7; \
-  \
-  q5 = row7 + row3; \
-  q4 = row5 + row1; \
-  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
-       MULTIPLY(q4, FIX_1_175875602); \
-  q7 = MULTIPLY(q5, FIX_1_175875602) + \
-       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
-  q2 = MULTIPLY(row2, FIX_0_541196100) + \
-       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
-  q4 = q6; \
-  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
-  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
-        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
-  /* now we can use q1 (reloadable constants have been used up) */ \
-  q1 = q3 + q2; \
-  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
-        MULTIPLY(row1, -FIX_0_899976223); \
-  q5 = q7; \
-  q1 = q1 + q6; \
-  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
-        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
-  \
-  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
-  tmp11_plus_tmp2 = q1; \
-  row1 = 0; \
-  \
-  q1 = q1 - q6; \
-  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
-        MULTIPLY(row3, -FIX_2_562915447); \
-  q1 = q1 - q6; \
-  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
-       MULTIPLY(row6, FIX_0_541196100); \
-  q3 = q3 - q2; \
-  \
-  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
-  tmp11_minus_tmp2 = q1; \
-  \
-  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
-  q2 = q1 + q6; \
-  q1 = q1 - q6; \
-  \
-  /* pick up the results */ \
-  tmp0  = q4; \
-  tmp1  = q5; \
-  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
-  tmp3  = q7; \
-  tmp10 = q2; \
-  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
-  tmp12 = q3; \
-  tmp13 = q1; \
-}
-
-#define XFIX_0_899976223                    d0[0]
-#define XFIX_0_541196100                    d0[1]
-#define XFIX_2_562915447                    d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
-#define XFIX_1_175875602                    d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
-  .short FIX_0_899976223                    /* d0[0] */
-  .short FIX_0_541196100                    /* d0[1] */
-  .short FIX_2_562915447                    /* d0[2] */
-  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-  .short FIX_1_175875602                    /* d1[3] */
-  /* reloadable constants */
-  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    ROW0L           .req d16
-    ROW0R           .req d17
-    ROW1L           .req d18
-    ROW1R           .req d19
-    ROW2L           .req d20
-    ROW2R           .req d21
-    ROW3L           .req d22
-    ROW3R           .req d23
-    ROW4L           .req d24
-    ROW4R           .req d25
-    ROW5L           .req d26
-    ROW5R           .req d27
-    ROW6L           .req d28
-    ROW6R           .req d29
-    ROW7L           .req d30
-    ROW7R           .req d31
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_islow_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
-    add             ip, ip, #16
-    vmul.s16        q15, q15, q3
-    vpush           {d8 - d15}                    /* save Neon registers */
-    /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4, ROW7L, ROW3L
-    vadd.s16        d5, ROW5L, ROW1L
-    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d5, XFIX_1_175875602
-    vmull.s16       q7, d4, XFIX_1_175875602
-      /* Check for the zero coefficients in the right 4x8 half */
-      push            {r4, r5}
-    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW4L
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0, r4, r5
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-      orr             r0, r0, r4
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q2
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      orr             r0, r0, r4
-    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0, r0, r4
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-      orr             r0, r0, r5
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6, ROW6L, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-      orr             r0, r0, r4
-    vrshrn.s32      ROW6L, q1, #11
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q5
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW4L
-      orr             r0, r0, r4
-    vrshrn.s32      ROW2L, q1, #11
-      orr             r0, r0, r5
-    vrshrn.s32      ROW5L, q3, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0, r0, r4
-    vadd.s32        q2, q5, q6
-      orrs            r0, r0, r5
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-      orr             r0, r4, r5
-    vsub.s32        q3, q1, q4
-      pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2, #11
-    vrshrn.s32      ROW3L, q5, #11
-    vrshrn.s32      ROW0L, q6, #11
-    vrshrn.s32      ROW4L, q3, #11
-
-      beq             3f  /* Go to do some special handling for the sparse
-                             right 4x8 half */
-
-    /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]  /* reload constants */
-    vadd.s16        d10, ROW7R, ROW3R
-    vadd.s16        d8, ROW5R, ROW1R
-      /* Transpose left 4x8 half */
-      vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d8, XFIX_1_175875602
-      vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7, d10, XFIX_1_175875602
-    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
-      vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3, ROW0R, ROW4R
-    vmull.s16       q2, ROW2R, XFIX_0_541196100
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-      vtrn.16         ROW4L, ROW5L
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
-      vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
-      vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1, #11
-      vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vrshrn.s32      ROW6R, q1, #11
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1, #11
-    vrshrn.s32      ROW5R, q3, #11
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vrshrn.s32      ROW7R, q2, #11
-    vrshrn.s32      ROW3R, q5, #11
-    vrshrn.s32      ROW0R, q6, #11
-    vrshrn.s32      ROW4R, q3, #11
-    /* Transpose right 4x8 half */
-    vtrn.16         ROW6R, ROW7R
-    vtrn.16         ROW2R, ROW3R
-    vtrn.16         ROW0R, ROW1R
-    vtrn.16         ROW4R, ROW5R
-    vtrn.32         ROW1R, ROW3R
-    vtrn.32         ROW4R, ROW6R
-    vtrn.32         ROW0R, ROW2R
-    vtrn.32         ROW5R, ROW7R
-
-1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5R, XFIX_1_175875602
-    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmull.s16       q7, ROW7R, XFIX_1_175875602
-    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-
-2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16, q8, #2
-    vqrshrn.s16     d17, q9, #2
-    vqrshrn.s16     d18, q10, #2
-    vqrshrn.s16     d19, q11, #2
-    vpop            {d8 - d15}                    /* restore Neon registers */
-    vqrshrn.s16     d20, q12, #2
-      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8, q9
-    vqrshrn.s16     d21, q13, #2
-    vqrshrn.s16     d22, q14, #2
-      vmov.u8         q0, #(CENTERJSAMPLE)
-    vqrshrn.s16     d23, q15, #2
-      vtrn.8          d16, d17
-      vtrn.8          d18, d19
-      vadd.u8         q8, q8, q0
-      vadd.u8         q9, q9, q0
-      vtrn.16         q10, q11
-        /* Store results to the output buffer */
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d16}, [TMP1]
-      vtrn.8          d20, d21
-        vst1.8          {d17}, [TMP2]
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d18}, [TMP1]
-      vadd.u8         q10, q10, q0
-        vst1.8          {d19}, [TMP2]
-        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        add             TMP3, TMP3, OUTPUT_COL
-        add             TMP4, TMP4, OUTPUT_COL
-      vtrn.8          d22, d23
-        vst1.8          {d20}, [TMP1]
-      vadd.u8         q11, q11, q0
-        vst1.8          {d21}, [TMP2]
-        vst1.8          {d22}, [TMP3]
-        vst1.8          {d23}, [TMP4]
-    bx              lr
-
-3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
-    /* Transpose left 4x8 half */
-    vtrn.16         ROW6L, ROW7L
-    vtrn.16         ROW2L, ROW3L
-    vtrn.16         ROW0L, ROW1L
-    vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
-    vtrn.32         ROW1L, ROW3L
-    vtrn.32         ROW4L, ROW6L
-    vtrn.32         ROW0L, ROW2L
-    vtrn.32         ROW5L, ROW7L
-
-    cmp             r0, #0
-    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
-                           pass */
-
-    /* Only row 0 is non-zero for the right 4x8 half  */
-    vdup.s16        ROW1R, ROW0R[1]
-    vdup.s16        ROW2R, ROW0R[2]
-    vdup.s16        ROW3R, ROW0R[3]
-    vdup.s16        ROW4R, ROW0R[0]
-    vdup.s16        ROW5R, ROW0R[1]
-    vdup.s16        ROW6R, ROW0R[2]
-    vdup.s16        ROW7R, ROW0R[3]
-    vdup.s16        ROW0R, ROW0R[0]
-    b               1b  /* Go to 'normal' second pass */
-
-4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vshll.s16       q3, ROW0L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW0L, #13
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5L, XFIX_1_175875602
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW7L, XFIX_1_175875602
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW6L, XFIX_0_541196100
-    vshll.s16       q3, ROW4L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW4L, #13
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-    b               2b                            /* Go to epilogue */
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-    .unreq          ROW0L
-    .unreq          ROW0R
-    .unreq          ROW1L
-    .unreq          ROW1R
-    .unreq          ROW2L
-    .unreq          ROW2R
-    .unreq          ROW3L
-    .unreq          ROW3R
-    .unreq          ROW4L
-    .unreq          ROW4R
-    .unreq          ROW5L
-    .unreq          ROW5R
-    .unreq          ROW6L
-    .unreq          ROW6R
-    .unreq          ROW7L
-    .unreq          ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, less accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same
- * calculations and produces exactly the same output as IJG's original
- * 'jpeg_idct_ifast' function from jidctfst.c.
- *
- * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
- * the Arm Neon case some extra additions are required because the VQDMULH
- * instruction can't handle constants larger than 1. Expressions like
- * "x * 1.082392200" therefore have to be converted to
- * "x * 0.082392200 + x", which introduces an extra addition. Overall,
- * there are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH
- * and 35 VADD/VSUB instructions.
- */
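-
-/* A minimal scalar C sketch of the conversion described above, assuming
- * the usual Q15 semantics of VQDMULH (roughly (a * b) >> 15): the integer
- * part of each constant is peeled off and re-added. The 2700 below is an
- * illustrative approximation of 0.082392200 * 2^15, not the exact constant
- * used by this file.
- *
- *   static int16_t mul_1_082392200(int16_t x)
- *   {
- *     int16_t frac = (int16_t)(((int32_t)x * 2700) >> 15);
- *     return (int16_t)(x + frac);   // ~= x * 1.082392200
- *   }
- */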
-
-#define XFIX_1_082392200  d0[0]
-#define XFIX_1_414213562  d0[1]
-#define XFIX_1_847759065  d0[2]
-#define XFIX_2_613125930  d0[3]
-
-.balign 16
-jsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_ifast_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64]  /* load constants */
-    vmul.s16        q15, q15, q3
-    vpush           {d8 - d13}       /* save Neon registers */
-    /* 1-D IDCT, pass 1 */
-    vsub.s16        q2, q10, q14
-    vadd.s16        q14, q10, q14
-    vsub.s16        q1, q11, q13
-    vadd.s16        q13, q11, q13
-    vsub.s16        q5, q9, q15
-    vadd.s16        q15, q9, q15
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vadd.s16        q10, q10, q2
-      /* Transpose */
-      vtrn.16         q8, q9
-    vsub.s16        q11, q12, q1
-      vtrn.16         q14, q15
-    vadd.s16        q12, q12, q1
-      vtrn.16         q10, q11
-      vtrn.16         q12, q13
-      vtrn.32         q9, q11
-      vtrn.32         q12, q14
-      vtrn.32         q8, q10
-      vtrn.32         q13, q15
-      vswp            d28, d21
-      vswp            d26, d19
-    /* 1-D IDCT, pass 2 */
-    vsub.s16        q2, q10, q14
-      vswp            d30, d23
-    vadd.s16        q14, q10, q14
-      vswp            d24, d17
-    vsub.s16        q1, q11, q13
-    vadd.s16        q13, q11, q13
-    vsub.s16        q5, q9, q15
-    vadd.s16        q15, q9, q15
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vpop            {d8 - d13}    /* restore Neon registers */
-    vadd.s16        q10, q10, q2
-    vsub.s16        q11, q12, q1
-    vadd.s16        q12, q12, q1
-    /* Descale to 8-bit and range limit */
-    vmov.u8         q0, #0x80
-    vqshrn.s16      d16, q8, #5
-    vqshrn.s16      d17, q9, #5
-    vqshrn.s16      d18, q10, #5
-    vqshrn.s16      d19, q11, #5
-    vqshrn.s16      d20, q12, #5
-    vqshrn.s16      d21, q13, #5
-    vqshrn.s16      d22, q14, #5
-    vqshrn.s16      d23, q15, #5
-    vadd.u8         q8, q8, q0
-    vadd.u8         q9, q9, q0
-    vadd.u8         q10, q10, q0
-    vadd.u8         q11, q11, q0
-    /* Transpose the final 8-bit samples */
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.8          d16, d17
-    vtrn.8          d18, d19
-      /* Store results to the output buffer */
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d16}, [TMP1]
-      vst1.8          {d17}, [TMP2]
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d18}, [TMP1]
-    vtrn.8          d20, d21
-      vst1.8          {d19}, [TMP2]
-      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      add             TMP3, TMP3, OUTPUT_COL
-      add             TMP4, TMP4, OUTPUT_COL
-      vst1.8          {d20}, [TMP1]
-    vtrn.8          d22, d23
-      vst1.8          {d21}, [TMP2]
-      vst1.8          {d22}, [TMP3]
-      vst1.8          {d23}, [TMP4]
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
- *       requires far fewer arithmetic operations and hence should be
- *       faster. The primary purpose of this particular Neon-optimized
- *       function is bit-exact compatibility with jpeg-6b.
- *
- * TODO: Slightly better instruction scheduling can be achieved by expanding
- *       the idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
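-
-/* This code path is reached through libjpeg's normal scaled-decoding API;
- * a minimal usage sketch (error handling omitted):
- *
- *   struct jpeg_decompress_struct cinfo;
- *   ...
- *   jpeg_read_header(&cinfo, TRUE);
- *   cinfo.scale_num = 1;
- *   cinfo.scale_denom = 2;   // decode each 8x8 block to 4x4 pixels
- *   jpeg_start_decompress(&cinfo);
- */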
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
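-
-/* The values above are the usual libjpeg 13-bit fixed-point fractions;
- * a sketch of how they are derived:
- *
- *   #define FIX(x)  ((short)((x) * (1 << CONST_BITS) + 0.5))
- *
- * e.g. FIX(0.211164243) == 1730, matching FIX_0_211164243 above.
- */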
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* d0[0] */
-  .short -FIX_0_765366865       /* d0[1] */
-  .short -FIX_0_211164243       /* d0[2] */
-  .short FIX_1_451774981        /* d0[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* d2[0] */
-  .short FIX_2_562915447        /* d2[1] */
-  .short 1 << (CONST_BITS + 1)  /* d2[2] */
-  .short 0                      /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4, d2[2]
-    vmlal.s16       q14, \x8, d0[0]
-    vmlal.s16       q14, \x14, d0[1]
-
-    vmull.s16       q13, \x16, d1[2]
-    vmlal.s16       q13, \x12, d1[3]
-    vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6, d2[1]
-
-    vmull.s16       q15, \x4, d2[2]
-    vmlsl.s16       q15, \x8, d0[0]
-    vmlsl.s16       q15, \x14, d0[1]
-
-    vmull.s16       q12, \x16, d0[2]
-    vmlal.s16       q12, \x12, d0[3]
-    vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6, d1[1]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
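-  /* VRSHRN can only encode a narrowing right shift of 1..16, so larger
-   * shifts are done as a rounding shift (VRSHR) followed by a narrowing
-   * move (VMOVN). */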
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y29, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y29, q14, #\shift
-  .endif
-
-    vadd.s32        q10, q15, q12
-    vsub.s32        q15, q15, q12
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q15, q15, #\shift
-    vmovn.s32       \y27, q10
-    vmovn.s32       \y28, q15
-  .else
-    vrshrn.s32      \y27, q10, #\shift
-    vrshrn.s32      \y28, q15, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    vpush           {d8 - d15}
-
-    /* Load constants (d3 is just used for padding) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | d8      | d9
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | d14     | d15
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q3, q3, q10
-    vmul.s16        q4, q4, q11
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    vmul.s16        q6, q6, q13
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q7, q7, q14
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
-    transpose_4x4   d5, d7, d9, d11
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
-    transpose_4x4   d26, d27, d28, d29
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vadd.s16        q14, q14, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q14
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use far fewer instructions on little-endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses.
-     */
-    vst1.32         {d26[0]}, [TMP1]!
-    vst1.32         {d27[0]}, [TMP3]!
-    vst1.32         {d26[1]}, [TMP2]!
-    vst1.32         {d27[1]}, [TMP4]!
-#else
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[0]}, [TMP3]!
-    vst1.8          {d26[1]}, [TMP1]!
-    vst1.8          {d27[1]}, [TMP3]!
-    vst1.8          {d26[2]}, [TMP1]!
-    vst1.8          {d27[2]}, [TMP3]!
-    vst1.8          {d26[3]}, [TMP1]!
-    vst1.8          {d27[3]}, [TMP3]!
-
-    vst1.8          {d26[4]}, [TMP2]!
-    vst1.8          {d27[4]}, [TMP4]!
-    vst1.8          {d26[5]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP4]!
-    vst1.8          {d26[6]}, [TMP2]!
-    vst1.8          {d27[6]}, [TMP4]!
-    vst1.8          {d26[7]}, [TMP2]!
-    vst1.8          {d27[7]}, [TMP4]!
-#endif
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
- *       requires far fewer arithmetic operations and hence should be
- *       faster. The primary purpose of this particular Neon-optimized
- *       function is bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* d0[0] */
-  .short FIX_0_850430095   /* d0[1] */
-  .short -FIX_1_272758580  /* d0[2] */
-  .short FIX_3_624509785   /* d0[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16       q14, \x4, #15
-    vmull.s16       q13, \x6, d0[3]
-    vmlal.s16       q13, \x10, d0[2]
-    vmlal.s16       q13, \x12, d0[1]
-    vmlal.s16       q13, \x16, d0[0]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y27, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y27, q14, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req ip
-
-    vpush           {d8 - d15}
-
-    /* Load constants */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    vld1.16         {d0}, [TMP2, :64]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | -       | -
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | -       | -
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* Dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vmul.s16        q3, q3, q10
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
-    vmul.s16        q6, q6, q13
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-#if 0
-    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9, d11
-#else
-    vmull.s16       q13, d6, d0[3]
-    vmlal.s16       q13, d10, d0[2]
-    vmlal.s16       q13, d12, d0[1]
-    vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7, d0[3]
-    vmlal.s16       q12, d11, d0[2]
-    vmlal.s16       q12, d13, d0[1]
-    vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4, #15
-    vshll.s16       q15, d5, #15
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-    vrshrn.s32      d4, q10, #13
-    vrshrn.s32      d6, q14, #13
-    vadd.s32        q10, q15, q12
-    vsub.s32        q14, q15, q12
-    vrshrn.s32      d5, q10, #13
-    vrshrn.s32      d7, q14, #13
-    vtrn.16         q2, q3
-    vtrn.32         q3, q5
-#endif
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q13
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[4]}, [TMP1]!
-    vst1.8          {d26[1]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP2]!
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
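-
-/* Scalar reference for the fixed-point arithmetic used below, assuming the
- * usual JFIF coefficients (the constants are the coefficients scaled by
- * 2^14 or 2^15, matching the .short tables emitted per function):
- *
- *   // coefficients: 1.40200 (R), 0.34414/0.71414 (G), 1.77200 (B)
- *   int cb = u - 128, cr = v - 128;
- *   r = y + ((22971 * cr) >> 14);
- *   g = y - ((11277 * cb + 23401 * cr) >> 15);
- *   b = y + ((29033 * cb) >> 14);
- */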
-
-
-.macro do_load size
-  .if \size == 8
-    vld1.8          {d4}, [U, :64]!
-    vld1.8          {d5}, [V, :64]!
-    vld1.8          {d0}, [Y, :64]!
-    pld             [U, #64]
-    pld             [V, #64]
-    pld             [Y, #64]
-  .elseif \size == 4
-    vld1.8          {d4[0]}, [U]!
-    vld1.8          {d4[1]}, [U]!
-    vld1.8          {d4[2]}, [U]!
-    vld1.8          {d4[3]}, [U]!
-    vld1.8          {d5[0]}, [V]!
-    vld1.8          {d5[1]}, [V]!
-    vld1.8          {d5[2]}, [V]!
-    vld1.8          {d5[3]}, [V]!
-    vld1.8          {d0[0]}, [Y]!
-    vld1.8          {d0[1]}, [Y]!
-    vld1.8          {d0[2]}, [Y]!
-    vld1.8          {d0[3]}, [Y]!
-  .elseif \size == 2
-    vld1.8          {d4[4]}, [U]!
-    vld1.8          {d4[5]}, [U]!
-    vld1.8          {d5[4]}, [V]!
-    vld1.8          {d5[5]}, [V]!
-    vld1.8          {d0[4]}, [Y]!
-    vld1.8          {d0[5]}, [Y]!
-  .elseif \size == 1
-    vld1.8          {d4[6]}, [U]!
-    vld1.8          {d5[6]}, [V]!
-    vld1.8          {d0[6]}, [Y]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vst3.8        {d10, d11, d12}, [RGB]!
-    .elseif \size == 4
-      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
-      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
-      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vst4.8        {d10, d11, d12, d13}, [RGB]!
-    .elseif \size == 4
-      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 16
-    .if \size == 8
-      vst1.16       {q15}, [RGB]!
-    .elseif \size == 4
-      vst1.16       {d30}, [RGB]!
-    .elseif \size == 2
-      vst1.16       {d31[0]}, [RGB]!
-      vst1.16       {d31[1]}, [RGB]!
-    .elseif \size == 1
-      vst1.16       {d31[2]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    vrshrn.s32      d20, q10, #15
-    vrshrn.s32      d21, q11, #15
-    vrshrn.s32      d24, q12, #14
-    vrshrn.s32      d25, q13, #14
-    vrshrn.s32      d28, q14, #14
-    vrshrn.s32      d29, q15, #14
-    vaddw.u8        q11, q10, d0
-    vaddw.u8        q12, q12, d0
-    vaddw.u8        q14, q14, d0
-  .if \bpp != 16
-    vqmovun.s16     d1\g_offs, q11
-    vqmovun.s16     d1\r_offs, q12
-    vqmovun.s16     d1\b_offs, q14
-  .else  /* rgb565 */
-    vqshlu.s16      q13, q11, #8
-    vqshlu.s16      q15, q12, #8
-    vqshlu.s16      q14, q14, #8
-    vsri.u16        q15, q13, #5
-    vsri.u16        q15, q14, #11
-  .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-                                       /* "do_yuv_to_rgb_stage2" and "store" */
-                                       vrshrn.s32      d20, q10, #15
-    /* "load" and "do_yuv_to_rgb_stage1" */
-    pld             [U, #64]
-                                       vrshrn.s32      d21, q11, #15
-    pld             [V, #64]
-                                       vrshrn.s32      d24, q12, #14
-                                       vrshrn.s32      d25, q13, #14
-    vld1.8          {d4}, [U, :64]!
-                                       vrshrn.s32      d28, q14, #14
-    vld1.8          {d5}, [V, :64]!
-                                       vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
-                                       vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-                                       vaddw.u8        q12, q12, d0
-                                       vaddw.u8        q14, q14, d0
-  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
-                                       vqmovun.s16     d1\g_offs, q11
-    pld             [Y, #64]
-                                       vqmovun.s16     d1\r_offs, q12
-    vld1.8          {d0}, [Y, :64]!
-                                       vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-                                       do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-  .else  /**************************** rgb565 ********************************/
-                                       vqshlu.s16      q13, q11, #8
-    pld             [Y, #64]
-                                       vqshlu.s16      q15, q12, #8
-                                       vqshlu.s16      q14, q14, #8
-    vld1.8          {d0}, [Y, :64]!
-    vmull.s16       q11, d7, d1[1]
-    vmlal.s16       q11, d9, d1[2]
-                                       vsri.u16        q15, q13, #5
-    vmull.s16       q12, d8, d1[0]
-                                       vsri.u16        q15, q14, #11
-    vmull.s16       q13, d9, d1[0]
-    vmull.s16       q14, d6, d1[3]
-                                       do_store        \bpp, 8
-    vmull.s16       q15, d7, d1[3]
-  .endif
-.endm
-
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, so work around that by using adr. This
- * requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    INPUT_ROW       .req r2
-    OUTPUT_BUF      .req r3
-    NUM_ROWS        .req r4
-
-    INPUT_BUF0      .req r5
-    INPUT_BUF1      .req r6
-    INPUT_BUF2      .req INPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adr             ip, jsimd_ycc_\colorid\()_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save Arm registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #4]
-    ldr             INPUT_BUF2, [INPUT_BUF, #8]
-    .unreq          INPUT_BUF
-
-    /* Save Neon registers */
-    vpush           {d8 - d15}
-
-    /* Initially set d10, d11, d12, d13 to 0xFF */
-    vmov.u8         q5, #255
-    vmov.u8         q6, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_yuv_to_rgb_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
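-
-/* Scalar reference for the fixed-point arithmetic used below (the constants
- * are the usual JFIF coefficients scaled by 2^16, matching the .short
- * tables emitted per function):
- *
- *   // coefficients: 0.29900/0.58700/0.11400 (Y),
- *   // -0.16874/-0.33126/0.50000 (Cb), 0.50000/-0.41869/-0.08131 (Cr)
- *   y  = (19595 * r + 38470 * g + 7471 * b) >> 16;
- *   cb = ((32768 * b - 11059 * r - 21709 * g) >> 16) + 128;
- *   cr = ((32768 * r - 27439 * g - 5329 * b) >> 16) + 128;
- */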
-
-.macro do_store size
-  .if \size == 8
-    vst1.8          {d20}, [Y]!
-    vst1.8          {d21}, [U]!
-    vst1.8          {d22}, [V]!
-  .elseif \size == 4
-    vst1.8          {d20[0]}, [Y]!
-    vst1.8          {d20[1]}, [Y]!
-    vst1.8          {d20[2]}, [Y]!
-    vst1.8          {d20[3]}, [Y]!
-    vst1.8          {d21[0]}, [U]!
-    vst1.8          {d21[1]}, [U]!
-    vst1.8          {d21[2]}, [U]!
-    vst1.8          {d21[3]}, [U]!
-    vst1.8          {d22[0]}, [V]!
-    vst1.8          {d22[1]}, [V]!
-    vst1.8          {d22[2]}, [V]!
-    vst1.8          {d22[3]}, [V]!
-  .elseif \size == 2
-    vst1.8          {d20[4]}, [Y]!
-    vst1.8          {d20[5]}, [Y]!
-    vst1.8          {d21[4]}, [U]!
-    vst1.8          {d21[5]}, [U]!
-    vst1.8          {d22[4]}, [V]!
-    vst1.8          {d22[5]}, [V]!
-  .elseif \size == 1
-    vst1.8          {d20[6]}, [Y]!
-    vst1.8          {d21[6]}, [U]!
-    vst1.8          {d22[6]}, [V]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_load bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vld3.8        {d10, d11, d12}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
-      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
-      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vld4.8        {d10, d11, d12, d13}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vrev64.32       q9, q1
-    vrev64.32       q13, q1
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
-    vrshrn.u32      d20, q7, #16
-    vrshrn.u32      d21, q8, #16
-    vshrn.u32       d22, q9, #16
-    vshrn.u32       d23, q13, #16
-    vshrn.u32       d24, q14, #16
-    vshrn.u32       d25, q15, #16
-    vmovn.u16       d20, q10       /* d20 = y */
-    vmovn.u16       d21, q11       /* d21 = u */
-    vmovn.u16       d22, q12       /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
-    do_rgb_to_yuv_stage1
-    do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32      d20, q7, #16
-      vrshrn.u32      d21, q8, #16
-      vshrn.u32       d22, q9, #16
-    vrev64.32       q9, q1
-      vshrn.u32       d23, q13, #16
-    vrev64.32       q13, q1
-      vshrn.u32       d24, q14, #16
-      vshrn.u32       d25, q15, #16
-    do_load         \bpp, 8
-      vmovn.u16       d20, q10     /* d20 = y */
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-      vmovn.u16       d21, q11     /* d21 = u */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-      vmovn.u16       d22, q12     /* d22 = v */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-      vst1.8          {d20}, [Y]!
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-      vst1.8          {d21}, [U]!
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-      vst1.8          {d22}, [V]!
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
-  .short 19595, 38470, 7471,  11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128,   32767, 128
-  .short 32767, 128,   32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_ROW      .req r3
-    NUM_ROWS        .req r4
-
-    OUTPUT_BUF0     .req r5
-    OUTPUT_BUF1     .req r6
-    OUTPUT_BUF2     .req OUTPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d0, d1, d2, d3 */
-    adr             ip, jsimd_\colorid\()_ycc_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save Arm registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
-    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
-    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
-    .unreq          OUTPUT_BUF
-
-    /* Save Neon registers */
-    vpush           {d8 - d15}
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
-    add             OUTPUT_ROW, OUTPUT_ROW, #1
-    ldr             RGB, [INPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         \bpp, 8
-    do_rgb_to_yuv_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_rgb_to_yuv_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_rgb_to_yuv_stage2
-    do_store        8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         \bpp, 4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         \bpp, 2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         \bpp, 1
-5:
-    do_rgb_to_yuv
-    tst             N, #4
-    beq             6f
-    do_store        4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          OUTPUT_ROW
-    .unreq          INPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          OUTPUT_BUF0
-    .unreq          OUTPUT_BUF1
-    .unreq          OUTPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: this can be combined with 'jsimd_fdct_ifast_neon' to get rid of
- *       the VST1.16 instructions.
- */
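-
-/* Equivalent scalar loop (CENTERJSAMPLE == 128 for 8-bit samples):
- *
- *   for (row = 0; row < DCTSIZE; row++)
- *     for (col = 0; col < DCTSIZE; col++)
- *       workspace[row * DCTSIZE + col] =
- *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
- */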
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req r0
-    START_COL       .req r1
-    WORKSPACE       .req r2
-    TMP1            .req r3
-    TMP2            .req r4
-    TMP3            .req r5
-    TMP4            .req ip
-
-    push            {r4, r5}
-    vmov.u8         d0, #128
-
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d16}, [TMP1]
-    vsubl.u8        q8, d16, d0
-    vld1.8          {d18}, [TMP2]
-    vsubl.u8        q9, d18, d0
-    vld1.8          {d20}, [TMP3]
-    vsubl.u8        q10, d20, d0
-    vld1.8          {d22}, [TMP4]
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    vsubl.u8        q11, d22, d0
-    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d24}, [TMP1]
-    vsubl.u8        q12, d24, d0
-    vld1.8          {d26}, [TMP2]
-    vsubl.u8        q13, d26, d0
-    vld1.8          {d28}, [TMP3]
-    vsubl.u8        q14, d28, d0
-    vld1.8          {d30}, [TMP4]
-    vsubl.u8        q15, d30, d0
-    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
-    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
-    pop             {r4, r5}
-    bx              lr
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, less accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c.
- *
- * TODO: this can be combined with 'jsimd_convsamp_neon' to get rid of a
- *       bunch of VLD1.16 instructions.
- */
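-
-/* As in jfdctfst.c, the output is scaled by the AAN per-coefficient
- * factors, which the quantization stage folds into its divisors; roughly:
- *
- *   static const double aanscalefactor[8] = {
- *     1.0, 1.387039845, 1.306562965, 1.175875602,
- *     1.0, 0.785694958, 0.541196100, 0.275899379
- *   };
- *   // divisor[row][col] ~= quantval[row][col] *
- *   //                      aanscalefactor[row] * aanscalefactor[col]
- */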
-
-#define XFIX_0_382683433  d0[0]
-#define XFIX_0_541196100  d0[1]
-#define XFIX_0_707106781  d0[2]
-#define XFIX_1_306562965  d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req r0
-    TMP             .req ip
-
-    vpush           {d8 - d15}
-
-    /* Load constants */
-    adr             TMP, jsimd_fdct_ifast_neon_consts
-    vld1.16         {d0}, [TMP, :64]
-
-    /* Load all DATA into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | q8
-     *   1 | d18     | d19    | q9
-     *   2 | d20     | d21    | q10
-     *   3 | d22     | d23    | q11
-     *   4 | d24     | d25    | q12
-     *   5 | d26     | d27    | q13
-     *   6 | d28     | d29    | q14
-     *   7 | d30     | d31    | q15
-     */
-
-    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
-    sub             DATA, DATA, #(128 - 32)
-
-    mov             TMP, #2
-1:
-    /* Transpose */
-    vtrn.16         q12, q13
-    vtrn.16         q10, q11
-    vtrn.16         q8, q9
-    vtrn.16         q14, q15
-    vtrn.32         q9, q11
-    vtrn.32         q13, q15
-    vtrn.32         q8, q10
-    vtrn.32         q12, q14
-    vswp            d30, d23
-    vswp            d24, d17
-    vswp            d26, d19
-      /* 1-D FDCT */
-      vadd.s16        q2, q11, q12
-    vswp            d28, d21
-      vsub.s16        q12, q11, q12
-      vsub.s16        q6, q10, q13
-      vadd.s16        q10, q10, q13
-      vsub.s16        q7, q9, q14
-      vadd.s16        q9, q9, q14
-      vsub.s16        q1, q8, q15
-      vadd.s16        q8, q8, q15
-      vsub.s16        q4, q9, q10
-      vsub.s16        q5, q8, q2
-      vadd.s16        q3, q9, q10
-      vadd.s16        q4, q4, q5
-      vadd.s16        q2, q8, q2
-      vqdmulh.s16     q4, q4, XFIX_0_707106781
-      vadd.s16        q11, q12, q6
-      vadd.s16        q8, q2, q3
-      vsub.s16        q12, q2, q3
-      vadd.s16        q3, q6, q7
-      vadd.s16        q7, q7, q1
-      vqdmulh.s16     q3, q3, XFIX_0_707106781
-      vsub.s16        q6, q11, q7
-      vadd.s16        q10, q5, q4
-      vqdmulh.s16     q6, q6, XFIX_0_382683433
-      vsub.s16        q14, q5, q4
-      vqdmulh.s16     q11, q11, XFIX_0_541196100
-      vqdmulh.s16     q5, q7, XFIX_1_306562965
-      vadd.s16        q4, q1, q3
-      vsub.s16        q3, q1, q3
-      vadd.s16        q7, q7, q6
-      vadd.s16        q11, q11, q6
-      vadd.s16        q7, q7, q5
-      vadd.s16        q13, q3, q11
-      vsub.s16        q11, q3, q11
-      vadd.s16        q9, q4, q7
-      vsub.s16        q15, q4, q7
-    subs            TMP, TMP, #1
-    bne             1b
-
-    /* store results */
-    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DATA
-    .unreq          TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- *                     DCTELEM *workspace);
- *
- * Note: the code uses 2-stage pipelining in order to improve instruction
- *       scheduling and eliminate stalls (this provides ~15% better
- *       performance for this function on both Arm Cortex-A8 and
- *       Arm Cortex-A9 when compared to the non-pipelined variant).
- *       The instructions that belong to the second stage use different
- *       indentation for better readability.
- */
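-
-/* Scalar sketch of the reciprocal-based division performed below (the
- * divisors table packs reciprocal, correction, and shift sub-tables, as
- * set up in the register allocation that follows):
- *
- *   int sign = w >> 15;                          // 0 or -1
- *   unsigned a = (unsigned)(w < 0 ? -w : w) + correction[i];
- *   unsigned q = ((a * reciprocal[i]) >> 16) >> shift[i];
- *   coef_block[i] = (JCOEF)((q ^ sign) - sign);  // reapply the sign
- */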
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req r0
-    DIVISORS        .req r1
-    WORKSPACE       .req r2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req r3
-    SHIFT           .req ip
-    LOOP_COUNT      .req r4
-
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-    vabs.s16        q12, q0
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-
-    push            {r4, r5}
-    mov             LOOP_COUNT, #3
-1:
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-      veor.u16        q14, q14, q2  /* restore sign */
-    vabs.s16        q12, q0
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-      veor.u16        q15, q15, q3
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-      vsub.u16        q14, q14, q2
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-      vsub.u16        q15, q15, q3
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    bne             1b
-    pop             {r4, r5}
-
-      veor.u16        q14, q14, q2  /* restore sign */
-      veor.u16        q15, q15, q3
-      vsub.u16        q14, q14, q2
-      vsub.u16        q15, q15, q3
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
-    bx              lr  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
- *                                JDIMENSION downsampled_width,
- *                                JSAMPARRAY input_data,
- *                                JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- *       this code, which could potentially be eliminated to gain up to a
- *       few tens of percent in performance on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded into q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 from q0 and q1.
- * Register d28 is used for the multiplication by 3, and register q15 is
- * used for adding the +1 bias.
- */
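-
-/*
- * For reference, this vectorizes the h2v1 fancy (triangular) upsampling rule
- * from jdsample.c (illustrative C):
- *
- *   out[2 * i]     = (3 * in[i] + in[i - 1] + 1) >> 2;
- *   out[2 * i + 1] = (3 * in[i] + in[i + 1] + 2) >> 2;
- */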
-.macro upsample16 OUTPTR, INPTR
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vmov            q1, q0        /* backup source pixels to q1 */
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to the
- * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
- * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
- * instruction is not needed. This unrolling also allows loads and stores to
- * be reordered, which hides multiplication latency and reduces stalls.
- */
-.macro upsample32 OUTPTR, INPTR
-    /* even 16 pixels group */
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-      /* odd 16 pixels group */
-      vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-      vmovl.u8        q8, d2
-      vext.8          q2, q0, q1, #15
-      vmovl.u8        q9, d3
-      vaddw.u8        q10, q15, d4
-      vaddw.u8        q11, q15, d5
-      vmlal.u8        q8, d4, d28
-      vmlal.u8        q9, d5, d28
-      vmlal.u8        q10, d2, d28
-      vmlal.u8        q11, d3, d28
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-      vrshrn.u16      d6, q8, #2
-      vrshrn.u16      d7, q9, #2
-      vshrn.u16       d8, q10, #2
-      vshrn.u16       d9, q11, #2
-      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
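-
-/*
- * Boundary rule (illustrative C): per h2v1_fancy_upsample() in jdsample.c,
- * the first and last output pixels are copies of the first and last input
- * pixels:
- *
- *   out[0]             = in[0];
- *   out[2 * width - 1] = in[width - 1];
- */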
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
-    /* special case for the first and last pixels */
-    sub             \WIDTH, \WIDTH, #1
-    add             \OUTPTR, \OUTPTR, #1
-    ldrb            \TMP1, [\INPTR, \WIDTH]
-    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
-    ldrb            \TMP1, [\INPTR], #1
-    strb            \TMP1, [\OUTPTR, #-1]
-    vmov.8          d3[7], \TMP1
-
-    subs            \WIDTH, \WIDTH, #32
-    blt             5f
-0:  /* process 32 pixels per iteration */
-    upsample32      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #32
-    bge             0b
-5:
-    adds            \WIDTH, \WIDTH, #16
-    blt             1f
-0:  /* process 16 pixels if needed */
-    upsample16      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #16
-1:
-    adds            \WIDTH, \WIDTH, #16
-    beq             9f
-
-    /* load the remaining 1-15 pixels */
-    add             \INPTR, \INPTR, \WIDTH
-    tst             \WIDTH, #1
-    beq             2f
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vext.8          d0, d0, d0, #6
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vrev64.32       d0, d0
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[3]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[2]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #8
-    beq             2f
-    vmov            d1, d0
-    sub             \INPTR, \INPTR, #8
-    vld1.8          {d0}, [\INPTR]
-2:  /* upsample the remaining pixels */
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vrshrn.u16      d10, q8, #2
-    vrshrn.u16      d12, q9, #2
-    vshrn.u16       d11, q10, #2
-    vshrn.u16       d13, q11, #2
-    vzip.8          d10, d11
-    vzip.8          d12, d13
-    /* store the remaining pixels */
-    tst             \WIDTH, #8
-    beq             2f
-    vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5, q6
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10, d11
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-    vst1.8          {d10[2]}, [\OUTPTR]!
-    vst1.8          {d10[3]}, [\OUTPTR]!
-    vext.8          d10, d10, d10, #4
-2:
-    tst             \WIDTH, #1
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-asm_function jsimd_h2v1_fancy_upsample_neon
-
-    MAX_V_SAMP_FACTOR .req r0
-    DOWNSAMPLED_WIDTH .req r1
-    INPUT_DATA        .req r2
-    OUTPUT_DATA_PTR   .req r3
-    OUTPUT_DATA       .req OUTPUT_DATA_PTR
-
-    OUTPTR            .req r4
-    INPTR             .req r5
-    WIDTH             .req ip
-    TMP               .req lr
-
-    push            {r4, r5, r6, lr}
-    vpush           {d8 - d15}
-
-    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
-    cmp             MAX_V_SAMP_FACTOR, #0
-    ble             99f
-
-    /* initialize constants */
-    vmov.u8         d28, #3
-    vmov.u16        q15, #1
-11:
-    ldr             INPTR, [INPUT_DATA], #4
-    ldr             OUTPTR, [OUTPUT_DATA], #4
-    mov             WIDTH, DOWNSAMPLED_WIDTH
-    upsample_row    OUTPTR, INPTR, WIDTH, TMP
-    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
-    bgt             11b
-
-99:
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, pc}
-
-    .unreq          MAX_V_SAMP_FACTOR
-    .unreq          DOWNSAMPLED_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA_PTR
-    .unreq          OUTPUT_DATA
-
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          WIDTH
-    .unreq          TMP
-
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET *)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- *                             JCOEFPTR block, int last_dc_val,
- *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
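-/*
- * The three macros below mirror the PUT_BITS()/EMIT_BYTE()/CHECKBUF15()
- * macros in jchuff.c.  Roughly (illustrative C):
- *
- *   // put_bits
- *   put_bits  += size;
- *   put_buffer = (put_buffer << size) | code;
- *
- *   // checkbuf15: flush two bytes once at least 16 bits have accumulated
- *   if (put_bits >= 16) {
- *     // emit_byte (performed twice)
- *     put_bits -= 8;
- *     c = (put_buffer >> put_bits) & 0xFF;
- *     *++buffer = c;
- *     if (c == 0xFF)
- *       *++buffer = 0;  // JPEG byte stuffing
- *   }
- */
-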
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-    sub             \PUT_BITS, \PUT_BITS, #0x8
-    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
-    uxtb            \TMP, \TMP
-    strb            \TMP, [\BUFFER, #1]!
-    cmp             \TMP, #0xff
-    /*it eq*/
-    strbeq          \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
-    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
-    add             \PUT_BITS, \SIZE
-    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
-    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-  cmp               \PUT_BITS, #0x10
-  blt               15f
-    eor               \ZERO, \ZERO, \ZERO
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
-  .byte 0x01
-  .byte 0x02
-  .byte 0x04
-  .byte 0x08
-  .byte 0x10
-  .byte 0x20
-  .byte 0x40
-  .byte 0x80
-
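-/*
- * The bytes above are per-lane masks (1 << lane).  After the 64 coefficients
- * are compared against zero (vceq), ANDing with these masks and folding with
- * vpadd collapses the results into a 64-bit bitmap, roughly (illustrative C):
- *
- *   for (i = 0; i < 64; i++)
- *     bitmap |= (uint64_t)(coef[order[i]] == 0) << i;
- *
- * mvn then inverts the bitmap (set bits = nonzero coefficients), and rbit
- * bit-reverses it so that clz can locate the next nonzero coefficient.
- */
-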
-asm_function jsimd_huff_encode_one_block_neon
-    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-    add             r7, sp, #0x1c
-    sub             r4, sp, #0x40
-    bfc             r4, #0, #5
-    mov             sp, r4           /* align sp on 32 bytes */
-    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vst1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             sp, #0x140       /* reserve 320 bytes */
-    str             r0, [sp, #0x18]  /* working state --> sp + 0x18 */
-    add             r4, sp, #0x20    /* r4 = t1 */
-    ldr             lr, [r7, #0x8]   /* lr = dctbl */
-    sub             r10, r1, #0x1    /* r10=buffer-- */
-    ldrsh           r1, [r2]
-    mov             r9, #0x10
-    mov             r8, #0x1
-    adr             r5, jsimd_huff_encode_one_block_neon_consts
-    /* prepare data */
-    vld1.8          {d26}, [r5, :64]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    vdup.16         q14, r9
-    vdup.16         q15, r8
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    sub             r1, r1, r3
-    add             r9, r2, #0x22
-    add             r8, r2, #0x18
-    add             r3, r2, #0x36
-    vmov.16         d0[0], r1
-    vld1.16         {d2[0]}, [r9, :16]
-    vld1.16         {d4[0]}, [r8, :16]
-    vld1.16         {d6[0]}, [r3, :16]
-    add             r1, r2, #0x2
-    add             r9, r2, #0x30
-    add             r8, r2, #0x26
-    add             r3, r2, #0x28
-    vld1.16         {d0[1]}, [r1, :16]
-    vld1.16         {d2[1]}, [r9, :16]
-    vld1.16         {d4[1]}, [r8, :16]
-    vld1.16         {d6[1]}, [r3, :16]
-    add             r1, r2, #0x10
-    add             r9, r2, #0x40
-    add             r8, r2, #0x34
-    add             r3, r2, #0x1a
-    vld1.16         {d0[2]}, [r1, :16]
-    vld1.16         {d2[2]}, [r9, :16]
-    vld1.16         {d4[2]}, [r8, :16]
-    vld1.16         {d6[2]}, [r3, :16]
-    add             r1, r2, #0x20
-    add             r9, r2, #0x32
-    add             r8, r2, #0x42
-    add             r3, r2, #0xc
-    vld1.16         {d0[3]}, [r1, :16]
-    vld1.16         {d2[3]}, [r9, :16]
-    vld1.16         {d4[3]}, [r8, :16]
-    vld1.16         {d6[3]}, [r3, :16]
-    add             r1, r2, #0x12
-    add             r9, r2, #0x24
-    add             r8, r2, #0x50
-    add             r3, r2, #0xe
-    vld1.16         {d1[0]}, [r1, :16]
-    vld1.16         {d3[0]}, [r9, :16]
-    vld1.16         {d5[0]}, [r8, :16]
-    vld1.16         {d7[0]}, [r3, :16]
-    add             r1, r2, #0x4
-    add             r9, r2, #0x16
-    add             r8, r2, #0x60
-    add             r3, r2, #0x1c
-    vld1.16         {d1[1]}, [r1, :16]
-    vld1.16         {d3[1]}, [r9, :16]
-    vld1.16         {d5[1]}, [r8, :16]
-    vld1.16         {d7[1]}, [r3, :16]
-    add             r1, r2, #0x6
-    add             r9, r2, #0x8
-    add             r8, r2, #0x52
-    add             r3, r2, #0x2a
-    vld1.16         {d1[2]}, [r1, :16]
-    vld1.16         {d3[2]}, [r9, :16]
-    vld1.16         {d5[2]}, [r8, :16]
-    vld1.16         {d7[2]}, [r3, :16]
-    add             r1, r2, #0x14
-    add             r9, r2, #0xa
-    add             r8, r2, #0x44
-    add             r3, r2, #0x38
-    vld1.16         {d1[3]}, [r1, :16]
-    vld1.16         {d3[3]}, [r9, :16]
-    vld1.16         {d5[3]}, [r8, :16]
-    vld1.16         {d7[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q0
-    vcgt.s16        q9, q9, q1
-    vcgt.s16        q10, q10, q2
-    vcgt.s16        q11, q11, q3
-    vabs.s16        q0, q0
-    vabs.s16        q1, q1
-    vabs.s16        q2, q2
-    vabs.s16        q3, q3
-    veor            q8, q8, q0
-    veor            q9, q9, q1
-    veor            q10, q10, q2
-    veor            q11, q11, q3
-    add             r9, r4, #0x20
-    add             r8, r4, #0x80
-    add             r3, r4, #0xa0
-    vclz.i16        q0, q0
-    vclz.i16        q1, q1
-    vclz.i16        q2, q2
-    vclz.i16        q3, q3
-    vsub.i16        q0, q14, q0
-    vsub.i16        q1, q14, q1
-    vsub.i16        q2, q14, q2
-    vsub.i16        q3, q14, q3
-    vst1.16         {d0, d1, d2, d3}, [r4, :256]
-    vst1.16         {d4, d5, d6, d7}, [r9, :256]
-    vshl.s16        q0, q15, q0
-    vshl.s16        q1, q15, q1
-    vshl.s16        q2, q15, q2
-    vshl.s16        q3, q15, q3
-    vsub.i16        q0, q0, q15
-    vsub.i16        q1, q1, q15
-    vsub.i16        q2, q2, q15
-    vsub.i16        q3, q3, q15
-    vand            q8, q8, q0
-    vand            q9, q9, q1
-    vand            q10, q10, q2
-    vand            q11, q11, q3
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    add             r1, r2, #0x46
-    add             r9, r2, #0x3a
-    add             r8, r2, #0x74
-    add             r3, r2, #0x6a
-    vld1.16         {d8[0]}, [r1, :16]
-    vld1.16         {d10[0]}, [r9, :16]
-    vld1.16         {d12[0]}, [r8, :16]
-    vld1.16         {d14[0]}, [r3, :16]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    add             r1, r2, #0x54
-    add             r9, r2, #0x2c
-    add             r8, r2, #0x76
-    add             r3, r2, #0x78
-    vld1.16         {d8[1]}, [r1, :16]
-    vld1.16         {d10[1]}, [r9, :16]
-    vld1.16         {d12[1]}, [r8, :16]
-    vld1.16         {d14[1]}, [r3, :16]
-    add             r1, r2, #0x62
-    add             r9, r2, #0x1e
-    add             r8, r2, #0x68
-    add             r3, r2, #0x7a
-    vld1.16         {d8[2]}, [r1, :16]
-    vld1.16         {d10[2]}, [r9, :16]
-    vld1.16         {d12[2]}, [r8, :16]
-    vld1.16         {d14[2]}, [r3, :16]
-    add             r1, r2, #0x70
-    add             r9, r2, #0x2e
-    add             r8, r2, #0x5a
-    add             r3, r2, #0x6c
-    vld1.16         {d8[3]}, [r1, :16]
-    vld1.16         {d10[3]}, [r9, :16]
-    vld1.16         {d12[3]}, [r8, :16]
-    vld1.16         {d14[3]}, [r3, :16]
-    add             r1, r2, #0x72
-    add             r9, r2, #0x3c
-    add             r8, r2, #0x4c
-    add             r3, r2, #0x5e
-    vld1.16         {d9[0]}, [r1, :16]
-    vld1.16         {d11[0]}, [r9, :16]
-    vld1.16         {d13[0]}, [r8, :16]
-    vld1.16         {d15[0]}, [r3, :16]
-    add             r1, r2, #0x64
-    add             r9, r2, #0x4a
-    add             r8, r2, #0x3e
-    add             r3, r2, #0x6e
-    vld1.16         {d9[1]}, [r1, :16]
-    vld1.16         {d11[1]}, [r9, :16]
-    vld1.16         {d13[1]}, [r8, :16]
-    vld1.16         {d15[1]}, [r3, :16]
-    add             r1, r2, #0x56
-    add             r9, r2, #0x58
-    add             r8, r2, #0x4e
-    add             r3, r2, #0x7c
-    vld1.16         {d9[2]}, [r1, :16]
-    vld1.16         {d11[2]}, [r9, :16]
-    vld1.16         {d13[2]}, [r8, :16]
-    vld1.16         {d15[2]}, [r3, :16]
-    add             r1, r2, #0x48
-    add             r9, r2, #0x66
-    add             r8, r2, #0x5c
-    add             r3, r2, #0x7e
-    vld1.16         {d9[3]}, [r1, :16]
-    vld1.16         {d11[3]}, [r9, :16]
-    vld1.16         {d13[3]}, [r8, :16]
-    vld1.16         {d15[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q4
-    vcgt.s16        q9, q9, q5
-    vcgt.s16        q10, q10, q6
-    vcgt.s16        q11, q11, q7
-    vabs.s16        q4, q4
-    vabs.s16        q5, q5
-    vabs.s16        q6, q6
-    vabs.s16        q7, q7
-    veor            q8, q8, q4
-    veor            q9, q9, q5
-    veor            q10, q10, q6
-    veor            q11, q11, q7
-    add             r1, r4, #0x40
-    add             r9, r4, #0x60
-    add             r8, r4, #0xc0
-    add             r3, r4, #0xe0
-    vclz.i16        q4, q4
-    vclz.i16        q5, q5
-    vclz.i16        q6, q6
-    vclz.i16        q7, q7
-    vsub.i16        q4, q14, q4
-    vsub.i16        q5, q14, q5
-    vsub.i16        q6, q14, q6
-    vsub.i16        q7, q14, q7
-    vst1.16         {d8, d9, d10, d11}, [r1, :256]
-    vst1.16         {d12, d13, d14, d15}, [r9, :256]
-    vshl.s16        q4, q15, q4
-    vshl.s16        q5, q15, q5
-    vshl.s16        q6, q15, q6
-    vshl.s16        q7, q15, q7
-    vsub.i16        q4, q4, q15
-    vsub.i16        q5, q5, q15
-    vsub.i16        q6, q6, q15
-    vsub.i16        q7, q7, q15
-    vand            q8, q8, q4
-    vand            q9, q9, q5
-    vand            q10, q10, q6
-    vand            q11, q11, q7
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    ldr             r12, [r7, #0xc]       /* r12 = actbl */
-    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
-    mov             r9, r12               /* r9 = actbl */
-    add             r6, r4, #0x80         /* r6 = t2 */
-    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
-    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
-    ldrh            r2, [r6, #-128]       /* r2  = nbits */
-    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG)1)<<nbits) - 1; */
-    ldr             r0, [lr, r2, lsl #2]
-    ldrb            r5, [r1, r2]
-    put_bits        r11, r4, r0, r5
-    checkbuf15      r10, r11, r4, r5, r0
-    put_bits        r11, r4, r3, r2
-    checkbuf15      r10, r11, r4, r5, r0
-    mov             lr, r6                /* lr = t2 */
-    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
-    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
-    veor            q8, q8, q8
-    vceq.i16        q0, q0, q8
-    vceq.i16        q1, q1, q8
-    vceq.i16        q2, q2, q8
-    vceq.i16        q3, q3, q8
-    vceq.i16        q4, q4, q8
-    vceq.i16        q5, q5, q8
-    vceq.i16        q6, q6, q8
-    vceq.i16        q7, q7, q8
-    vmovn.i16       d0, q0
-    vmovn.i16       d2, q1
-    vmovn.i16       d4, q2
-    vmovn.i16       d6, q3
-    vmovn.i16       d8, q4
-    vmovn.i16       d10, q5
-    vmovn.i16       d12, q6
-    vmovn.i16       d14, q7
-    vand            d0, d0, d26
-    vand            d2, d2, d26
-    vand            d4, d4, d26
-    vand            d6, d6, d26
-    vand            d8, d8, d26
-    vand            d10, d10, d26
-    vand            d12, d12, d26
-    vand            d14, d14, d26
-    vpadd.i8        d0, d0, d2
-    vpadd.i8        d4, d4, d6
-    vpadd.i8        d8, d8, d10
-    vpadd.i8        d12, d12, d14
-    vpadd.i8        d0, d0, d4
-    vpadd.i8        d8, d8, d12
-    vpadd.i8        d0, d0, d8
-    vmov.32         r1, d0[1]
-    vmov.32         r8, d0[0]
-    mvn             r1, r1
-    mvn             r8, r8
-    lsrs            r1, r1, #0x1
-    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
-    rbit            r1, r1            /* r1 = index1 */
-    rbit            r8, r8            /* r8 = index0 */
-    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
-    str             r1, [sp, #0x14]   /* index1 --> sp + 0x14 */
-    cmp             r8, #0x0
-    beq             6f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r12, sp, #0x20   /* r12 = t1 */
-    ldr             r8, [sp, #0x14]  /* r8 = index1 */
-    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
-    cmp             r8, #0x0
-    beq             6f
-    clz             r2, r8
-    sub             r12, r12, lr
-    lsl             r8, r8, r2
-    add             r2, r2, r12, lsr #1
-    add             lr, lr, r2, lsl #1
-    b               7f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-7:
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r0, sp, #0x20
-    add             r0, #0xfe
-    cmp             lr, r0
-    bhs             1f
-    ldr             r1, [r9]
-    ldrb            r0, [r5]
-    put_bits        r11, r4, r1, r0
-    checkbuf15      r10, r11, r4, r0, r1
-1:
-    ldr             r12, [sp, #0x18]
-    str             r11, [r12, #0x8]
-    str             r4, [r12, #0xc]
-    add             r0, r10, #0x1
-    add             r4, sp, #0x140
-    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vld1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             r4, r7, #0x1c
-    mov             sp, r4
-    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
diff --git a/simd/arm/neon-compat.h.in b/simd/arm/neon-compat.h.in
new file mode 100644
index 0000000..7a03d81
--- /dev/null
+++ b/simd/arm/neon-compat.h.in
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
diff --git a/simd/gas-preprocessor.in b/simd/gas-preprocessor.in
deleted file mode 100755
index 560f788..0000000
--- a/simd/gas-preprocessor.in
+++ /dev/null
@@ -1 +0,0 @@
-gas-preprocessor.pl @CMAKE_ASM_COMPILER@ ${1+"$@"}
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index 79f0ca5..278cf5e 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,134 +16,255 @@
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
 
 %include "jsimdext.inc"
 
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
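+
+; For reference, these must stay in sync with the corresponding C structures;
+; c_derived_tbl is declared in jchuff.h as (abridged):
+;
+;   typedef struct {
+;     unsigned int ehufco[256];   /* code for each symbol */
+;     char ehufsi[256];           /* length of code for each symbol */
+;   } c_derived_tbl;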
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
     GLOBAL_DATA(jconst_huff_encode_one_block)
 
 EXTN(jconst_huff_encode_one_block):
 
-%include "jpeg_nbits_table.inc"
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
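+
+; The bytes emitted before the jpeg_nbits_table label mirror the table so
+; that it can also be indexed with a one's-complemented (negative) value:
+; for v >= 0, jpeg_nbits_table[v] and jpeg_nbits_table[~v] (offset -v - 1)
+; both yield nbits(v) -- 0 for v == 0, floor(log2(v)) + 1 otherwise.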
 
     alignz      32
 
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
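+
+; MASK_BITS(n) resolves to jpeg_mask_bits[n] = (1 << n) - 1, expressed
+; relative to the nbits table so that, when PIC is defined, both tables are
+; reachable from the single base register loaded by GET_SYM below.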
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        32
 
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
 
-%macro EMIT_BYTE 0
-    sub         put_bits, 8             ; put_bits -= 8;
-    mov         edx, put_buffer
-    mov         ecx, put_bits
-    shr         edx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-    mov         byte [eax], dl          ; *buffer++ = c;
-    add         eax, 1
-    cmp         dl, 0xFF                ; need to stuff a zero byte?
-    jne         %%.EMIT_BYTE_END
-    mov         byte [eax], 0           ; *buffer++ = 0;
-    add         eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
 
-%macro PUT_BITS 1
-    add         put_bits, ecx           ; put_bits += size;
-    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
-    or          put_buffer, %1
-%endmacro
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
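+;
+; A rough C sketch of the fast path, matching the comments below (put_buffer
+; accumulates bits at the low end; free_bits counts the unused bits and is
+; <= 0 when this macro is invoked):
+;
+;   fits       = nbits + free_bits;    /* leading bits of code that fit */
+;   overflow   = -free_bits;           /* trailing bits that do not */
+;   out        = (put_buffer << fits) | (code >> overflow);
+;   put_buffer = code;                 /* its low overflow bits are pending */
+;   free_bits  = 64 - overflow;
+;   /* store out as 8 big-endian bytes, stuffing 0x00 after each 0xFF */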
 
-%macro CHECKBUF15 0
-    cmp         put_bits, 16            ; if (put_bits > 15) {
-    jl          %%.CHECKBUF15_END
-    mov         eax, POINTER [esp+buffer]
-    EMIT_BYTE
-    EMIT_BYTE
-    mov         POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
-    PUT_BITS    %1
-    CHECKBUF15
-%endmacro
-
-%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor        xmm4, xmm4              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm5, xmm5              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm6, xmm6              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm7, xmm7              ; __m128i neg = _mm_setzero_si128();
-    pinsrw      %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw      %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw      %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw      %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw      %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw      %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw      %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw      %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw      %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw      %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw      %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw      %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw      %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw      %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw      %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw      %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw      %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw      %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw      %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw      %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw      %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw      %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw      %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw      %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw      %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw      %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw      %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw      %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw      %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw      %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw      %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw      %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw      %37, ecx, 7             ; xmm_shadow[31] = block[jno31];
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
 %endif
-    pcmpgtw     xmm4, %34               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm5, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm6, %36               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm7, %37               ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw       %34, xmm4               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %35, xmm5               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %36, xmm6               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %37, xmm7               ; x1 = _mm_add_epi16(x1, neg);
-    pxor        %34, xmm4               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %35, xmm5               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %36, xmm6               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %37, xmm7               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        xmm4, %34               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm5, %35               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm6, %36               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm7, %37               ; neg = _mm_xor_si128(neg, x1);
-    movdqa      XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34          ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35    ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36   ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37   ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa      XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4         ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5   ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
+%endmacro
+
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
+%endmacro
+
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
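+;
+; In the PIC case, this amounts to the usual call-based trick for reading the
+; instruction pointer, roughly:
+;
+;   call .geteip          ; pushes the address of .ref
+; .ref:
+;   add  %1, %2 - .ref    ; %1 = runtime address of %2
+;   ...
+; .geteip:
+;   mov  %1, [esp]        ; read the pushed return address (= .ref)
+;   ret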
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
+%else
+    %3          0
+    %4
+%endif
 %endmacro
 
 ;
@@ -153,272 +275,487 @@
 ;                                  JCOEFPTR block, int last_dc_val,
 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 ;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data at lower addresses is no longer
+; used once esp has passed over it, so this is signal-safe.)  Aligning to 128
+; bytes allows us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
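+;
+; For instance, the clearing pattern used later in this file looks like this
+; (register names illustrative):
+;
+;   xor         nbits, nbits   ; break the input dependency on the old value
+;   tzcnt       nbits, index   ; executes as bsf on CPUs without BMI1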
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
 
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
 
-%define pad         6 * SIZEOF_DWORD    ; Align to 16 bytes
-%define t1          pad
-%define t2          t1 + (DCTSIZE2 * SIZEOF_WORD)
-%define block       t2 + (DCTSIZE2 * SIZEOF_WORD)
-%define actbl       block + SIZEOF_DWORD
-%define buffer      actbl + SIZEOF_DWORD
-%define temp        buffer + SIZEOF_DWORD
-%define temp2       temp + SIZEOF_DWORD
-%define temp3       temp2 + SIZEOF_DWORD
-%define temp4       temp3 + SIZEOF_DWORD
-%define temp5       temp4 + SIZEOF_DWORD
-%define gotptr      temp5 + SIZEOF_DWORD  ; void *gotptr
-%define put_buffer  ebx
-%define put_bits    edi
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
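+;
+; ("xx" marks the DC coefficient, which is handled separately.  The
+; right-hand matrix lists block[jpeg_natural_order[i]] for i = 1..63, i.e.
+; the zigzag scan order in which the rows are gathered below.)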
 
     align       32
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
-    push        ebp
-    mov         eax, esp                     ; eax = original ebp
-    sub         esp, byte 4
-    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [esp], eax
-    mov         ebp, esp                     ; ebp = aligned ebp
-    sub         esp, temp5+9*SIZEOF_DWORD-pad
-    push        ebx
-    push        ecx
-;   push        edx                     ; need not be preserved
-    push        esi
-    push        edi
-    push        ebp
 
-    mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
-    push        esi                        ; esi is now scratch
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
 
-    get_GOT     edx                        ; get GOT address
-    movpic      POINTER [esp+gotptr], edx  ; save GOT address
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
 
-    mov         ecx, POINTER [eax+28]
-    mov         edx, POINTER [eax+16]
-    mov         esi, POINTER [eax+12]
-    mov         POINTER [esp+actbl], ecx
-    mov         POINTER [esp+block], edx
-    mov         POINTER [esp+buffer], esi
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
 
-    ; Encode the DC coefficient difference per section F.1.2.1
-    mov         esi, POINTER [esp+block]  ; block
-    movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, dword [eax+20]
-    mov         esi, ecx
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    ; This is a well-known technique for obtaining the absolute value
-    ; with out a branch.  It is derived from an assembly language technique
-    ; presented in "How to Optimize for the Pentium Processors",
-    ; Copyright (c) 1996, 1997 by Agner Fog.
-    mov         edx, ecx
-    sar         edx, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-    xor         ecx, edx                ; temp ^= temp3;
-    sub         ecx, edx                ; temp -= temp3;
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
 
-    ; For a negative input, want temp2 = bitwise complement of abs(input)
-    ; This code assumes we are on a two's complement machine
-    add         esi, edx                ; temp2 += temp3;
-    mov         dword [esp+temp], esi   ; backup temp2 in temp
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
 
-    ; Find the number of bits needed for the magnitude of the coefficient
-    movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
-    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
 
-    ; Emit the Huffman-coded symbol for the number of bits
-    mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
-    mov         eax,  INT [ebp + edx * 4]     ; code = dctbl->ehufco[nbits];
-    movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
-    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    mov         ecx, dword [esp+temp2]        ; restore nbits
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;
+%undef index_temp
+%define free_bits  edi
+%endmacro
 
-    ; Mask off any extra bits in code
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
 
-    ; Emit that number of bits of the value, if positive,
-    ; or the complement of its magnitude, if negative.
-    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
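+
+; GET_SYM leaves a 32-bit bitmap in "index": each pmovmskb above sets bit i
+; wherever the corresponding t[i] is zero, and GET_SYM_AFTER inverts the
+; result, so a set bit marks a nonzero coefficient.  Roughly, in C (a
+; sketch, not the exact SIMD data flow):
+;
+;   uint32_t index = 0;
+;   for (int i = 0; i < 32; i++)
+;     index |= (uint32_t)(t[i] == 0) << i;
+;   index = ~index;                  /* bit i == 1  <=>  t[i] != 0 */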
 
-    ; Prepare data
-    xor         ecx, ecx
-    mov         esi, POINTER [esp+block]
-    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                   xmm0, xmm1, xmm2, xmm3
-    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                   xmm0, xmm1, xmm2, xmm3
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is non-negative
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
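+                                                          ;Z:     (a negative difference is coded as
+                                                          ;Z:     value - 1; its low nbits equal the
+                                                          ;Z:     one's complement of the magnitude)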
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
 
-    pxor        xmm7, xmm7
-    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    lea         esi, [esp+t1]
-    mov         ebp, POINTER [esp+actbl]  ; ebp = actbl
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
 
-.BLOOP:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP
-.ERLOOP:
-    movsx       eax, word [esi]                                  ; temp = t1[k];
-    movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
-    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp         .BEGIN
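+
+; Bit-packing scheme: codes are shifted into the low end of the 64-bit MMX
+; accumulator (put_buffer) while free_bits counts the capacity remaining.
+; When free_bits drops to <= 0, EMIT_QWORD (defined earlier in this file)
+; flushes eight bytes to the output buffer, applying JPEG 0xFF byte
+; stuffing, and replenishes free_bits.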
 
-    jmp         .BLOOP
-.ELOOP:
-    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]
-    sub         eax, esi
-    shr         eax, 1
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    shr         edx, cl                 ; index >>= r;
-    add         ecx, eax
-    lea         esi, [esi+ecx*2]        ; k += r;
-    mov         dword [esp+temp3], edx
-    jmp         .BRLOOP2
-.BLOOP2:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP2:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP2
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP2
-.ERLOOP2:
-    movsx       eax, word [esi]         ; temp = t1[k];
-    bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
-    inc         eax
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
 
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    jmp         .BLOOP2
-.ELOOP2:
-    ; If the last coef(s) were zero, emit an end-of-block code
-    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-    cmp         edx, esi                            ; if (r > 0) {
-    je          .EFN
-    mov         eax,  INT [ebp]                     ; code = actbl->ehufco[0];
-    movzx       ecx, byte [ebp + 1024]              ; size = actbl->ehufsi[0];
-    EMIT_BITS   eax
-.EFN:
-    mov         eax, [esp+buffer]
-    pop         esi
-    ; Save put_buffer & put_bits
-    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged.  Correct in .BRLOOP1.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
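+
+; In C, the .BLOOP1/.ERLOOP1/.BEGIN cycle implements roughly the following
+; (PUT_BITS is hypothetical shorthand for the put_buffer/free_bits
+; bookkeeping above; the DC symbol enters at .BEGIN, and runs longer than
+; 15 zeros are split off as ZRL symbols in .BRLOOP1):
+;
+;   do {
+;     index >>= size;  t += size;    /* skip the zeros, land on nonzero */
+;     temp = *t;                     /* negatives already stored as v - 1 */
+;     nbits = JPEG_NBITS(temp);
+;     temp &= (1 << nbits) - 1;
+;     size = size * 16 + nbits;      /* Huffman symbol (run << 4) | nbits */
+;     PUT_BITS(actbl->ehufco[size - 16], actbl->ehufsi[size - 16]);
+;     PUT_BITS(temp, nbits);
+;     size = trailing_zeros(index) + 1;
+;   } while (index != 0);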
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle         .ERLOOP2                                  ;       goto .ERLOOP2;
+    jmp         .BRLOOP2                                  ; } while (1);
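+
+; .BRLOOP1/.BRLOOP2 in C: emit one ZRL (run of 16 zeros) symbol per 16
+; skipped positions until the remaining run fits in a regular symbol:
+;
+;   while (size > 16) {              /* size = zero run length + 1 */
+;     size -= 16;
+;     PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]);
+;   }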
 
-    pop         ebp
-    pop         edi
-    pop         esi
-;   pop         edx                     ; need not be preserved
-    pop         ecx
-    pop         ebx
-    mov         esp, ebp                ; esp <- aligned ebp
-    pop         esp                     ; esp <- original ebp
-    pop         ebp
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
     ret
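+
+; C sketch of .ELOOP2 through .EFN above:
+;
+;   if (/* last nonzero coefficient was not block[63] */)
+;     PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0]);    /* end-of-block */
+;   state->cur.put_buffer.simd = put_buffer;
+;   state->cur.free_bits = free_bits;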
 
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
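+; Conceptually, each EMIT_QWORD above performs the following flush (a
+; sketch; the real macro, defined earlier in this file, is heavily
+; interleaved with the tail instructions passed as arguments):
+;
+;   for (i = 7; i >= 0; i--) {
+;     uint8_t b = (uint8_t)(put_buffer >> (i * 8));
+;     *buffer++ = b;
+;     if (b == 0xFF) *buffer++ = 0;  /* JPEG byte stuffing */
+;   }
+;   free_bits += 64;
+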
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
     align       32
diff --git a/simd/jsimd.h b/simd/jsimd.h
index fdcc61e..64747c6 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -6,7 +6,8 @@
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -121,6 +122,8 @@
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -128,6 +131,8 @@
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+#endif
+
 EXTERN(void) jsimd_rgb_ycc_convert_dspr2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -263,6 +268,28 @@
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_dspr2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -285,6 +312,28 @@
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_altivec
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -401,6 +450,8 @@
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
@@ -408,6 +459,8 @@
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
 
+#endif
+
 EXTERN(void) jsimd_ycc_rgb_convert_dspr2
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
@@ -562,6 +615,13 @@
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
 EXTERN(void) jsimd_h2v1_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
@@ -608,6 +668,12 @@
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
@@ -616,6 +682,9 @@
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
@@ -762,6 +831,50 @@
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
 EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf, JSAMPLE *range);
@@ -806,6 +919,50 @@
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf, JSAMPLE *range);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
 EXTERN(void) jsimd_h2v1_merged_upsample_altivec
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf);
@@ -909,6 +1066,8 @@
 
 EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
 
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
 EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
 
 /* Floating Point Forward DCT */
@@ -1040,6 +1199,10 @@
   (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
    const int *idct_coefs);
 
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
 EXTERN(void) jsimd_idct_ifast_altivec
   (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
    JDIMENSION output_col);
@@ -1069,15 +1232,27 @@
   (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
   (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
+#endif
+
 /* Progressive Huffman encoding */
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *values, size_t *zerobits);
 
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
deleted file mode 100644
index 6cdeb5e..0000000
--- a/simd/loongson/jccolext-mmi.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           ZhangLixia  <zhanglixia-hf@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jccolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
-                               JSAMPIMAGE output_buf, JDIMENSION output_row,
-                               int num_rows)
-{
-  JSAMPROW inptr, outptr0, outptr1, outptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 wk[7];
-  __m64 Y_BG, Cb_RG, Cr_BG;
-
-  while (--num_rows >= 0) {
-    inptr = *input_buf++;
-    outptr0 = output_buf[0][output_row];
-    outptr1 = output_buf[1][output_row];
-    outptr2 = output_buf[2][output_row];
-    output_row++;
-
-    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
-         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
-
-#if RGB_PIXELSIZE == 3
-
-      if (num_cols < 8) {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %3\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "xor    $12, $12, $12\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lbu    $12, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "xor    $11, $11, $11\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lhu    $11, 0($13)\r\n"
-            "sll    $12, $12, 16\r\n"
-            "or     $12, $12, $11\r\n"
-
-            "2:     \r\n"
-            "dmtc1  $12, %0\r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 4\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lwu    $14, 0($13)\r\n"
-            "dmtc1  $14, %1\r\n"
-            "dsll32 $12, $12, 0\r\n"
-            "or     $12, $12, $14\r\n"
-            "dmtc1  $12, %0\r\n"
-
-            "3:     \r\n"
-            "li     $8, 8\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 4f\r\n"
-            "nop    \r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "li     $9, 8\r\n"
-            "j      5f\r\n"
-            "nop    \r\n"
-
-            "4:     \r\n"
-            "li     $8, 16\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 5f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "5:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
-            : "r" (col), "r" (num_rows), "r" (inptr)
-            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
-              "$14", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmG = _mm_load_si64((__m64 *)&inptr[8]);
-          mmF = _mm_load_si64((__m64 *)&inptr[16]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmD = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmG);
-      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
-
-      mmD = _mm_unpacklo_pi8(mmD, mmF);
-      mmG = _mm_unpackhi_pi8(mmG, mmF);
-
-      mmE = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmD);
-      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
-
-      mmE = _mm_unpacklo_pi8(mmE, mmG);
-      mmD = _mm_unpackhi_pi8(mmD, mmG);
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmB = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmB = _mm_loadhi_pi8_f(mmB);
-
-      mmF = mmD;
-      mmD = _mm_loadlo_pi8_f(mmD);
-      mmF = _mm_loadhi_pi8_f(mmF);
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-      if (num_cols < 8) {
-        col = num_cols;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "lwc1   %0, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0($13)\r\n"
-
-            "2:     \r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "mov.s  %3, %1\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "3:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
-            : "r" (col), "r" (inptr)
-            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmF = _mm_load_si64((__m64 *)&inptr[8]);
-          mmD = _mm_load_si64((__m64 *)&inptr[16]);
-          mmC = _mm_load_si64((__m64 *)&inptr[24]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
-          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmB = mmA;
-      mmA = _mm_unpacklo_pi8(mmA, mmF);
-      mmB = _mm_unpackhi_pi8(mmB, mmF);
-
-      mmG = mmD;
-      mmD = _mm_unpacklo_pi8(mmD, mmC);
-      mmG = _mm_unpackhi_pi8(mmG, mmC);
-
-      mmE = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmD);
-      mmE = _mm_unpackhi_pi16(mmE, mmD);
-
-      mmH = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmG);
-      mmH = _mm_unpackhi_pi16(mmH, mmG);
-
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmD = mmB;
-      mmB = _mm_loadlo_pi8_f(mmB);
-      mmD = _mm_loadhi_pi8_f(mmD);
-
-      mmG = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmG = _mm_loadhi_pi8_f(mmG);
-
-      mmF = mmH;
-      mmF = _mm_unpacklo_pi8(mmF, mmH);
-      mmH = _mm_unpackhi_pi8(mmH, mmH);
-      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
-      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
-
-#endif
-
-      wk[0] = mm0;
-      wk[1] = mm1;
-      wk[2] = mm4;
-      wk[3] = mm5;
-
-      mm6 = mm1;
-      mm1 = _mm_unpacklo_pi16(mm1, mm3);
-      mm6 = _mm_unpackhi_pi16(mm6, mm3);
-      mm7 = mm1;
-      mm4 = mm6;
-      mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[4] = mm1;
-      wk[5] = mm6;
-
-      mm1 = _mm_loadlo_pi16_f(mm5);
-      mm6 = _mm_loadhi_pi16_f(mm5);
-      mm1 = _mm_srli_pi32(mm1, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm5 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm7 = _mm_add_pi32(mm7, mm5);
-      mm4 = _mm_add_pi32(mm4, mm5);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm4);
-
-      mm1 = wk[2];
-      mm6 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm2);
-      mm6 = _mm_unpackhi_pi16(mm6, mm2);
-      mm5 = mm0;
-      mm4 = mm6;
-      mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[6] = mm0;
-      wk[7] = mm6;
-      mm0 = _mm_loadlo_pi16_f(mm1);
-      mm6 = _mm_loadhi_pi16_f(mm1);
-      mm0 = _mm_srli_pi32(mm0, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm4 = _mm_add_pi32(mm4, mm1);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm5 = _mm_packs_pi32(mm5, mm4);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm5  = _mm_or_si64(mm5, mm7);
-      Cb_RG = mm5;
-
-      mm0 = wk[3];
-      mm6 = wk[2];
-      mm1 = wk[1];
-
-      mm4 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm3);
-      mm4 = _mm_unpackhi_pi16(mm4, mm3);
-      mm7 = mm0;
-      mm5 = mm4;
-      mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm3 = PD_ONEHALF;
-      mm0 = _mm_add_pi32(mm0, wk[4]);
-      mm4 = _mm_add_pi32(mm4, wk[5]);
-      mm0 = _mm_add_pi32(mm0, mm3);
-      mm4 = _mm_add_pi32(mm4, mm3);
-      mm0 = _mm_srli_pi32(mm0, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm0 = _mm_packs_pi32(mm0, mm4);
-
-      mm3 = _mm_loadlo_pi16_f(mm1);
-      mm4 = _mm_loadhi_pi16_f(mm1);
-      mm3 = _mm_srli_pi32(mm3, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm3);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm5);
-
-      mm3 = wk[0];
-      mm4 = mm6;
-      mm6 = _mm_unpacklo_pi16(mm6, mm2);
-      mm4 = _mm_unpackhi_pi16(mm4, mm2);
-      mm1 = mm6;
-      mm5 = mm4;
-      mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm2 = PD_ONEHALF;
-      mm6 = _mm_add_pi32(mm6, wk[6]);
-      mm4 = _mm_add_pi32(mm4, wk[7]);
-      mm6 = _mm_add_pi32(mm6, mm2);
-      mm4 = _mm_add_pi32(mm4, mm2);
-      mm6 = _mm_srli_pi32(mm6, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm6 = _mm_packs_pi32(mm6, mm4);
-
-      mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
-      mm6 = _mm_or_si64(mm6, mm0);
-      Y_BG = mm6;
-
-      mm2 = _mm_loadlo_pi16_f(mm3);
-      mm4 = _mm_loadhi_pi16_f(mm3);
-      mm2 = _mm_srli_pi32(mm2, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm0 = PD_ONEHALFM1_CJ;
-      mm1 = _mm_add_pi32(mm1, mm2);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm1 = _mm_add_pi32(mm1, mm0);
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm1 = _mm_srli_pi32(mm1, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm1 = _mm_packs_pi32(mm1, mm5);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm1 = _mm_or_si64(mm1, mm7);
-      Cr_BG = mm1;
-
-      _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
-      _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
-      _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
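
For reference, the deleted routine implements the standard libjpeg
RGB-to-YCbCr equations in 16.16 fixed point (the PW_F0299_F0337-style
constants pack pairs of coefficients for pmaddwd).  A scalar sketch of
the same math, with hypothetical names:

    /* Floating-point form of the conversion that the deleted MMI code
       performs in fixed point (CENTERJSAMPLE == 128). */
    static void rgb_to_ycc(double r, double g, double b,
                           double *y, double *cb, double *cr)
    {
      *y  =  0.29900 * r + 0.58700 * g + 0.11400 * b;
      *cb = -0.16874 * r - 0.33126 * g + 0.50000 * b + 128.0;
      *cr =  0.50000 * r - 0.41869 * g - 0.08131 * b + 128.0;
    }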
diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c
deleted file mode 100644
index 2f2d851..0000000
--- a/simd/loongson/jcsample-mmi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA DOWNSAMPLING */
-
-#include "jsimd_mmi.h"
-#include "jcsample.h"
-
-
-void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_in_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  int inrow, outrow, outcol, bias;
-  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
-  JSAMPROW inptr0, inptr1, outptr;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
-
-  expand_right_edge(input_data, max_v_samp_factor, image_width,
-                    output_cols * 2);
-
-  bias = (1 << 17) + 1;                      /* 0x00020001 (bias pattern) */
-  mm7 = _mm_set1_pi32(bias);                 /* mm7={1, 2, 1, 2} */
-  mm6 = _mm_cmpeq_pi16(mm6, mm6);
-  mm6 = _mm_srli_pi16(mm6, BYTE_BIT);        /* mm6={0xFF 0x00 0xFF 0x00 ..} */
-
-  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
-       inrow += 2, outrow++) {
-
-    inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow + 1];
-    outptr = output_data[outrow];
-
-    for (outcol = output_cols; outcol > 0;
-         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
-
-      mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
-      mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
-      mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
-      mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
-
-      mm4 = mm0;
-      mm5 = mm1;
-      mm0 = _mm_and_si64(mm0, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm1 = _mm_and_si64(mm1, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm0 = _mm_add_pi16(mm0, mm4);
-      mm1 = _mm_add_pi16(mm1, mm5);
-
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_and_si64(mm2, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm3 = _mm_and_si64(mm3, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm2 = _mm_add_pi16(mm2, mm4);
-      mm3 = _mm_add_pi16(mm3, mm5);
-
-      mm0 = _mm_add_pi16(mm0, mm1);
-      mm2 = _mm_add_pi16(mm2, mm3);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm2 = _mm_add_pi16(mm2, mm7);
-      mm0 = _mm_srli_pi16(mm0, 2);
-      mm2 = _mm_srli_pi16(mm2, 2);
-
-      mm0 = _mm_packs_pu16(mm0, mm2);
-
-      _mm_store_si64((__m64 *)&outptr[0], mm0);
-    }
-  }
-}
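
jsimd_h2v2_downsample_mmi averages each 2x2 block of input pixels, and the (1 << 17) + 1 constant replicates the 16-bit bias pattern {1, 2, 1, 2} across output columns so that successive outputs round in opposite directions and the quantization error does not drift. A scalar model of the same arithmetic (an assumed reference, not library code):

static void h2v2_downsample_ref(const unsigned char *in0,
                                const unsigned char *in1,
                                unsigned char *out, int out_cols)
{
  for (int c = 0; c < out_cols; c++) {
    int sum = in0[2 * c] + in0[2 * c + 1] + in1[2 * c] + in1[2 * c + 1];
    int bias = 1 + (c & 1);                 /* the 1,2,1,2 bias pattern */
    out[c] = (unsigned char)((sum + bias) >> 2);
  }
}
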
diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c
deleted file mode 100644
index 560d9b0..0000000
--- a/simd/loongson/jdcolext-mmi.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdcolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
-                               JDIMENSION input_row, JSAMPARRAY output_buf,
-                               int num_rows)
-{
-  JSAMPROW outptr, inptr0, inptr1, inptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 mm8, wk[2];
-
-  while (--num_rows >= 0) {
-    inptr0 = input_buf[0][input_row];
-    inptr1 = input_buf[1][input_row];
-    inptr2 = input_buf[2][input_row];
-    input_row++;
-    outptr = *output_buf++;
-
-    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
-         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
-
-      mm5 = _mm_load_si64((__m64 *)inptr1);
-      mm1 = _mm_load_si64((__m64 *)inptr2);
-      mm8 = _mm_load_si64((__m64 *)inptr0);
-      mm4 = 0;
-      mm7 = 0;
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm7 = _mm_cmpeq_pi16(mm7, mm7);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm7 = _mm_slli_pi16(mm7, 7);      /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
-      mm0 = mm4;                        /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
-
-      mm4 = _mm_and_si64(mm4, mm5);           /* mm4=Cb(0246)=CbE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);     /* mm5=Cb(1357)=CbO */
-      mm0 = _mm_and_si64(mm0, mm1);           /* mm0=Cr(0246)=CrE */
-      mm1 = _mm_srli_pi16(mm1, BYTE_BIT);     /* mm1=Cr(1357)=CrO */
-      mm4 = _mm_add_pi16(mm4, mm7);
-      mm5 = _mm_add_pi16(mm5, mm7);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm1 = _mm_add_pi16(mm1, mm7);
-
-      /* (Original)
-       * R = Y                + 1.40200 * Cr
-       * G = Y - 0.34414 * Cb - 0.71414 * Cr
-       * B = Y + 1.77200 * Cb
-       *
-       * (This implementation)
-       * R = Y                + 0.40200 * Cr + Cr
-       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-       * B = Y - 0.22800 * Cb + Cb + Cb
-       */
-
-      mm2 = mm4;                              /* mm2 = CbE */
-      mm3 = mm5;                              /* mm3 = CbO */
-      mm4 = _mm_add_pi16(mm4, mm4);           /* mm4 = 2*CbE */
-      mm5 = _mm_add_pi16(mm5, mm5);           /* mm5 = 2*CbO */
-      mm6 = mm0;                              /* mm6 = CrE */
-      mm7 = mm1;                              /* mm7 = CrO */
-      mm0 = _mm_add_pi16(mm0, mm0);           /* mm0 = 2*CrE */
-      mm1 = _mm_add_pi16(mm1, mm1);           /* mm1 = 2*CrO */
-
-      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);   /* mm4=(2*CbE * -FIX(0.22800)) */
-      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);   /* mm5=(2*CbO * -FIX(0.22800)) */
-      mm0 = _mm_mulhi_pi16(mm0, PW_F0402);    /* mm0=(2*CrE * FIX(0.40200)) */
-      mm1 = _mm_mulhi_pi16(mm1, PW_F0402);    /* mm1=(2*CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, PW_ONE);
-      mm5 = _mm_add_pi16(mm5, PW_ONE);
-      mm4 = _mm_srai_pi16(mm4, 1);            /* mm4=(CbE * -FIX(0.22800)) */
-      mm5 = _mm_srai_pi16(mm5, 1);            /* mm5=(CbO * -FIX(0.22800)) */
-      mm0 = _mm_add_pi16(mm0, PW_ONE);
-      mm1 = _mm_add_pi16(mm1, PW_ONE);
-      mm0 = _mm_srai_pi16(mm0, 1);            /* mm0=(CrE * FIX(0.40200)) */
-      mm1 = _mm_srai_pi16(mm1, 1);            /* mm1=(CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, mm2);
-      mm5 = _mm_add_pi16(mm5, mm3);
-      mm4 = _mm_add_pi16(mm4, mm2);       /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
-      mm5 = _mm_add_pi16(mm5, mm3);       /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
-      mm0 = _mm_add_pi16(mm0, mm6);       /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
-      mm1 = _mm_add_pi16(mm1, mm7);       /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
-
-      wk[0] = mm4;                            /* wk(0)=(B-Y)E */
-      wk[1] = mm5;                            /* wk(1)=(B-Y)O */
-
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_unpacklo_pi16(mm2, mm6);
-      mm4 = _mm_unpackhi_pi16(mm4, mm6);
-      mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
-      mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
-      mm3 = _mm_unpacklo_pi16(mm3, mm7);
-      mm5 = _mm_unpackhi_pi16(mm5, mm7);
-      mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
-      mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
-
-      mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
-      mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
-      mm2 = _mm_srai_pi32(mm2, SCALEBITS);
-      mm4 = _mm_srai_pi32(mm4, SCALEBITS);
-      mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
-      mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
-      mm3 = _mm_srai_pi32(mm3, SCALEBITS);
-      mm5 = _mm_srai_pi32(mm5, SCALEBITS);
-
-      mm2 = _mm_packs_pi32(mm2, mm4);  /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
-      mm3 = _mm_packs_pi32(mm3, mm5);  /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
-      mm2 = _mm_sub_pi16(mm2, mm6);  /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
-      mm3 = _mm_sub_pi16(mm3, mm7);  /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
-
-      mm5 = mm8;                              /* mm5=Y(01234567) */
-
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);    /* mm4={0xFF 0x00 0xFF 0x00 ..} */
-      mm4 = _mm_and_si64(mm4, mm5);          /* mm4=Y(0246)=YE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);    /* mm5=Y(1357)=YO */
-
-      mm0 = _mm_add_pi16(mm0, mm4);      /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
-      mm1 = _mm_add_pi16(mm1, mm5);      /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
-      mm0 = _mm_packs_pu16(mm0, mm0);    /* mm0=(R0 R2 R4 R6 ** ** ** **) */
-      mm1 = _mm_packs_pu16(mm1, mm1);    /* mm1=(R1 R3 R5 R7 ** ** ** **) */
-
-      mm2 = _mm_add_pi16(mm2, mm4);      /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
-      mm3 = _mm_add_pi16(mm3, mm5);      /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
-      mm2 = _mm_packs_pu16(mm2, mm2);    /* mm2=(G0 G2 G4 G6 ** ** ** **) */
-      mm3 = _mm_packs_pu16(mm3, mm3);    /* mm3=(G1 G3 G5 G7 ** ** ** **) */
-
-      mm4 = _mm_add_pi16(mm4, wk[0]);    /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
-      mm5 = _mm_add_pi16(mm5, wk[1]);    /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
-      mm4 = _mm_packs_pu16(mm4, mm4);    /* mm4=(B0 B2 B4 B6 ** ** ** **) */
-      mm5 = _mm_packs_pu16(mm5, mm5);    /* mm5=(B1 B3 B5 B7 ** ** ** **) */
-
-#if RGB_PIXELSIZE == 3
-
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmB);     /* mmE=(20 01 22 03 24 05 26 07) */
-      mmD = _mm_unpacklo_pi8(mmD, mmF);     /* mmD=(11 21 13 23 15 25 17 27) */
-
-      mmG = mmA;
-      mmH = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 01 02 12 22 03) */
-      mmG = _mm_unpackhi_pi16(mmG, mmE);    /* mmG=(04 14 24 05 06 16 26 07) */
-
-      mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
-
-      mmC = mmD;
-      mmB = mmD;
-      mmD = _mm_unpacklo_pi16(mmD, mmH);    /* mmD=(11 21 02 12 13 23 04 14) */
-      mmC = _mm_unpackhi_pi16(mmC, mmH);    /* mmC=(15 25 06 16 17 27 -- --) */
-
-      mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
-
-      mmF = mmE;
-      mmE = _mm_unpacklo_pi16(mmE, mmB);    /* mmE=(22 03 13 23 24 05 15 25) */
-      mmF = _mm_unpackhi_pi16(mmF, mmB);    /* mmF=(26 07 17 27 -- -- -- --) */
-
-      mmA = _mm_unpacklo_pi32(mmA, mmD);    /* mmA=(00 10 20 01 11 21 02 12) */
-      mmE = _mm_unpacklo_pi32(mmE, mmG);    /* mmE=(22 03 13 23 04 14 24 05) */
-      mmC = _mm_unpacklo_pi32(mmC, mmF);    /* mmC=(15 25 06 16 26 07 17 27) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmE);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li      $8, 16\r\n"
-            "move    $9, %4\r\n"
-            "mov.s   $f4, %1\r\n"
-            "mov.s   $f6, %3\r\n"
-            "move    $10, %5\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %2\r\n"
-            "subu    $9, $9, 16\r\n"
-            "daddu   $10, $10, 16\r\n"
-            "b       2f\r\n"
-            "nop     \r\n"
-
-            "1:      \r\n"
-            "li      $8, 8\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "subu    $9, $9, 8\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 4\r\n"               /* st4 */
-            "mfc1    $11, $f4\r\n"
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "swl     $11, 3($10)\r\n"
-            "swr     $11, 0($10)\r\n"
-            "li      $8, 32\r\n"
-            "mtc1    $8, $f6\r\n"
-            "dsrl    $f4, $f4, $f6\r\n"
-            "mfc1    $11, $f4\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 4\r\n"
-
-            "3:      \r\n"
-            "li      $8, 2\r\n"               /* st2 */
-            "bltu    $9, $8, 4f\r\n"
-            "nop     \r\n"
-            "ush     $11, 0($10)\r\n"
-            "srl     $11, 16\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 2\r\n"
-
-            "4:      \r\n"
-            "li      $8, 1\r\n"               /* st1 */
-            "bltu    $9, $8, 5f\r\n"
-            "nop     \r\n"
-            "sb      $11, 0($10)\r\n"
-
-            "5:      \r\n"
-            "nop     \r\n"                    /* end */
-            : "=m" (*outptr)
-            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
-           );
-      }
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-#ifdef RGBX_FILLER_0XFF
-      mm6 = _mm_cmpeq_pi8(mm6, mm6);
-      mm7 = _mm_cmpeq_pi8(mm7, mm7);
-#else
-      mm6 = _mm_xor_si64(mm6, mm6);
-      mm7 = _mm_xor_si64(mm7, mm7);
-#endif
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
-      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
-
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmG);     /* mmE=(20 30 22 32 24 34 26 36) */
-      mmB = _mm_unpacklo_pi8(mmB, mmD);     /* mmB=(01 11 03 13 05 15 07 17) */
-      mmF = _mm_unpacklo_pi8(mmF, mmH);     /* mmF=(21 31 23 33 25 35 27 37) */
-
-      mmC = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 30 02 12 22 32) */
-      mmC = _mm_unpackhi_pi16(mmC, mmE);    /* mmC=(04 14 24 34 06 16 26 36) */
-      mmG = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmF);    /* mmB=(01 11 21 31 03 13 23 33) */
-      mmG = _mm_unpackhi_pi16(mmG, mmF);    /* mmG=(05 15 25 35 07 17 27 37) */
-
-      mmD = mmA;
-      mmA = _mm_unpacklo_pi32(mmA, mmB);    /* mmA=(00 10 20 30 01 11 21 31) */
-      mmD = _mm_unpackhi_pi32(mmD, mmB);    /* mmD=(02 12 22 32 03 13 23 33) */
-      mmH = mmC;
-      mmC = _mm_unpacklo_pi32(mmC, mmG);    /* mmC=(04 14 24 34 05 15 25 35) */
-      mmH = _mm_unpackhi_pi32(mmH, mmG);    /* mmH=(06 16 26 36 07 17 27 37) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmD);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        _mm_store_si64((__m64 *)(outptr + 24), mmH);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols;
-        asm(".set noreorder\r\n"              /* st16 */
-
-            "li      $8, 4\r\n"
-            "move    $9, %6\r\n"
-            "move    $10, %7\r\n"
-            "mov.s   $f4, %2\r\n"
-            "mov.s   $f6, %4\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "mov.s   $f6, %5\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 16\r\n"
-
-            "1:      \r\n"
-            "li      $8, 2\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "mov.s   $f4, $f6\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 1\r\n"               /* st4 */
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "gsswlc1 $f4, 3($10)\r\n"
-            "gsswrc1 $f4, 0($10)\r\n"
-
-            "3:      \r\n"
-            "li      %1, 0\r\n"               /* end */
-            : "=m" (*outptr), "=r" (col)
-            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
-              "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "memory"
-           );
-      }
-
-#endif
-
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
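
The "(This implementation)" comment in the file above is the key to its fixed-point arithmetic: each constant larger than 1.0 is split into an integer part (added as whole samples) and a fraction below 0.5, and each fractional product is computed with a 16-bit high-half multiply on a pre-doubled input followed by a rounded halving, so nothing is lost to the truncating mulhi. A scalar sketch, assuming the same Q16 convention as PW_F0402 (illustrative only):

/* q16: fixed-point constant scaled by 65536; the fractions used here are
 * all below 0.5, so each one fits in a signed 16-bit word. */
static int mul_frac_rounded(int samp, short q16)
{
  int hi = ((2 * samp) * (int)q16) >> 16;   /* models _mm_mulhi_pi16 */
  return (hi + 1) >> 1;                     /* rounded samp * q16 / 65536 */
}

static int r_minus_y(int cr)                /* cr is centered on zero */
{
  return cr + mul_frac_rounded(cr, 26345);  /* 26345 == FIX(0.40200) */
}
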
diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c
deleted file mode 100644
index 00a6265..0000000
--- a/simd/loongson/jdsample-mmi.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_mmi.h"
-
-
-enum const_index {
-  index_PW_THREE,
-  index_PW_SEVEN,
-  index_PW_EIGHT,
-};
-
-static uint64_t const_value[] = {
-  _uint64_set_pi16(3, 3, 3, 3),
-  _uint64_set_pi16(7, 7, 7, 7),
-  _uint64_set_pi16(8, 8, 8, 8),
-};
-
-#define PW_THREE  get_const_value(index_PW_THREE)
-#define PW_SEVEN  get_const_value(index_PW_SEVEN)
-#define PW_EIGHT  get_const_value(index_PW_EIGHT)
-
-
-#define PROCESS_ROW(r) { \
-  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
-  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
-  \
-  mm0 = mm7; \
-  mm4 = mm3; \
-  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                   /* mm0=( 1 2 3 -) */ \
-  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
-  mm5 = mm7; \
-  mm6 = mm3; \
-  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
-  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                   /* mm6=( - 4 5 6) */ \
-  \
-  mm0 = _mm_or_si64(mm0, mm4);                /* mm0=( 1 2 3 4) */ \
-  mm5 = _mm_or_si64(mm5, mm6);                /* mm5=( 3 4 5 6) */ \
-  \
-  mm1 = mm7; \
-  mm2 = mm3; \
-  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);     /* mm1=( - 0 1 2) */ \
-  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);     /* mm2=( 5 6 7 -) */ \
-  mm4 = mm3; \
-  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
-  \
-  mm1 = _mm_or_si64(mm1, wk[r]);              /* mm1=(-1 0 1 2) */ \
-  mm2 = _mm_or_si64(mm2, wk[r + 2]);          /* mm2=( 5 6 7 8) */ \
-  \
-  wk[r] = mm4; \
-  \
-  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
-  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
-  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
-  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
-  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
-  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
-  \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  mm5 = _mm_add_pi16(mm5, mm3); \
-  mm1 = _mm_srli_pi16(mm1, 4);                /* mm1=OutrLE=( 0  2  4  6) */ \
-  mm5 = _mm_srli_pi16(mm5, 4);                /* mm5=OutrHE=( 8 10 12 14) */ \
-  mm0 = _mm_add_pi16(mm0, mm7); \
-  mm2 = _mm_add_pi16(mm2, mm3); \
-  mm0 = _mm_srli_pi16(mm0, 4);                /* mm0=OutrLO=( 1  3  5  7) */ \
-  mm2 = _mm_srli_pi16(mm2, 4);                /* mm2=OutrHO=( 9 11 13 15) */ \
-  \
-  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
-  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
-  mm1 = _mm_or_si64(mm1, mm0);     /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
-  mm5 = _mm_or_si64(mm5, mm2);     /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
-  \
-  _mm_store_si64((__m64 *)outptr##r, mm1); \
-  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
-}
-
-void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
-  int inrow, outrow, incol, tmp, tmp1;
-  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
-  __m64 wk[4], mm_tmp;
-
-  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
-    inptr_1 = input_data[inrow - 1];
-    inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow + 1];
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-
-    if (downsampled_width & 7) {
-      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
-      tmp1 = downsampled_width * sizeof(JSAMPLE);
-      asm("daddu  $8, %3, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %3, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %4, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %4, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %5, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %5, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
-          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
-          : "$8", "$9"
-         );
-    }
-
-    /* process the first column block */
-    mm0 = _mm_load_si64((__m64 *)inptr0);     /* mm0 = row[ 0][0] */
-    mm1 = _mm_load_si64((__m64 *)inptr_1);    /* mm1 = row[-1][0] */
-    mm2 = _mm_load_si64((__m64 *)inptr1);     /* mm2 = row[+1][0] */
-
-    mm3 = _mm_xor_si64(mm3, mm3);             /* mm3 = (all 0's) */
-    mm4 = mm0;
-    mm0 = _mm_unpacklo_pi8(mm0, mm3);         /* mm0 = row[ 0][0]( 0 1 2 3) */
-    mm4 = _mm_unpackhi_pi8(mm4, mm3);         /* mm4 = row[ 0][0]( 4 5 6 7) */
-    mm5 = mm1;
-    mm1 = _mm_unpacklo_pi8(mm1, mm3);         /* mm1 = row[-1][0]( 0 1 2 3) */
-    mm5 = _mm_unpackhi_pi8(mm5, mm3);         /* mm5 = row[-1][0]( 4 5 6 7) */
-    mm6 = mm2;
-    mm2 = _mm_unpacklo_pi8(mm2, mm3);         /* mm2 = row[+1][0]( 0 1 2 3) */
-    mm6 = _mm_unpackhi_pi8(mm6, mm3);         /* mm6 = row[+1][0]( 4 5 6 7) */
-
-    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-    mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-    mm7 = _mm_cmpeq_pi8(mm7, mm7);
-    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-
-    mm1 = _mm_add_pi16(mm1, mm0);             /* mm1=Int0L=( 0 1 2 3) */
-    mm5 = _mm_add_pi16(mm5, mm4);             /* mm5=Int0H=( 4 5 6 7) */
-    mm2 = _mm_add_pi16(mm2, mm0);             /* mm2=Int1L=( 0 1 2 3) */
-    mm6 = _mm_add_pi16(mm6, mm4);             /* mm6=Int1H=( 4 5 6 7) */
-
-    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
-    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
-    _mm_store_si64((__m64 *)outptr1, mm2);
-    _mm_store_si64((__m64 *)outptr1 + 1, mm6);
-
-    mm1 = _mm_and_si64(mm1, mm7);             /* mm1=( 0 - - -) */
-    mm2 = _mm_and_si64(mm2, mm7);             /* mm2=( 0 - - -) */
-
-    wk[0] = mm1;
-    wk[1] = mm2;
-
-    for (incol = downsampled_width; incol > 0;
-         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
-         outptr0 += 16, outptr1 += 16) {
-
-      if (incol > 8) {
-        /* process the next column block */
-        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0 = row[ 0][1] */
-        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1 = row[-1][1] */
-        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2 = row[+1][1] */
-
-        mm3 = _mm_setzero_si64();             /* mm3 = (all 0's) */
-        mm4 = mm0;
-        mm0 = _mm_unpacklo_pi8(mm0, mm3);     /* mm0 = row[ 0][1]( 0 1 2 3) */
-        mm4 = _mm_unpackhi_pi8(mm4, mm3);     /* mm4 = row[ 0][1]( 4 5 6 7) */
-        mm5 = mm1;
-        mm1 = _mm_unpacklo_pi8(mm1, mm3);     /* mm1 = row[-1][1]( 0 1 2 3) */
-        mm5 = _mm_unpackhi_pi8(mm5, mm3);     /* mm5 = row[-1][1]( 4 5 6 7) */
-        mm6 = mm2;
-        mm2 = _mm_unpacklo_pi8(mm2, mm3);     /* mm2 = row[+1][1]( 0 1 2 3) */
-        mm6 = _mm_unpackhi_pi8(mm6, mm3);     /* mm6 = row[+1][1]( 4 5 6 7) */
-
-        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-        mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-        mm1 = _mm_add_pi16(mm1, mm0);         /* mm1 = Int0L = ( 0 1 2 3) */
-        mm5 = _mm_add_pi16(mm5, mm4);         /* mm5 = Int0H = ( 4 5 6 7) */
-        mm2 = _mm_add_pi16(mm2, mm0);         /* mm2 = Int1L = ( 0 1 2 3) */
-        mm6 = _mm_add_pi16(mm6, mm4);         /* mm6 = Int1H = ( 4 5 6 7) */
-
-        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
-        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
-        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
-        _mm_store_si64((__m64 *)outptr1 + 3, mm6);
-
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
-        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      } else {
-        /* process the last column block */
-        mm1 = _mm_cmpeq_pi8(mm1, mm1);
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-        mm2 = mm1;
-
-        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
-        mm1 = _mm_and_si64(mm1, mm_tmp);      /* mm1=( - - - 7) */
-        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
-        mm2 = _mm_and_si64(mm2, mm_tmp);      /* mm2=( - - - 7) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      }
-
-      /* process the upper row */
-      PROCESS_ROW(0)
-
-      /* process the lower row */
-      PROCESS_ROW(1)
-    }
-  }
-}
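
jsimd_h2v2_fancy_upsample_mmi implements the usual 3:1 triangle filter in two passes: the column loop first forms Int = 3*row[0] + row[+/-1], and PROCESS_ROW then applies the same weights horizontally with biases 8 and 7, so paired outputs round in complementary directions. A scalar sketch of the horizontal pass (assumed reference code, not part of the library):

/* Int[] holds the vertically filtered values (0..1020); edge samples are
 * replicated, which is what the wk[] bookkeeping above implements. */
static void fancy_h_pass_ref(const int *Int, unsigned char *out, int w)
{
  for (int i = 0; i < w; i++) {
    int left  = Int[i > 0 ? i - 1 : 0];
    int right = Int[i < w - 1 ? i + 1 : w - 1];
    out[2 * i]     = (unsigned char)((3 * Int[i] + left  + 8) >> 4);
    out[2 * i + 1] = (unsigned char)((3 * Int[i] + right + 7) >> 4);
  }
}
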
diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c
deleted file mode 100644
index f9a3f81..0000000
--- a/simd/loongson/jquanti-mmi.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * Copyright (C) 2018, D. R. Commander.  All Rights Reserved.
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define DO_QUANT() { \
-  mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
-  mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
-  \
-  mm0 = mm2; \
-  mm1 = mm3; \
-  \
-  mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1));   /* -1 if value < 0, */ \
-                                              /* 0 otherwise */ \
-  mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
-  corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
-  \
-  mm0 = _mm_add_pi16(mm0, corr0);             /* correction + roundfactor */ \
-  mm1 = _mm_add_pi16(mm1, corr1); \
-  \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
-  recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, recip0); \
-  mm1 = _mm_mulhi_pi16(mm1, recip1); \
-  \
-  mm0 = _mm_add_pi16(mm0, mm4);  /* reciprocal is always negative */ \
-  mm1 = _mm_add_pi16(mm1, mm5);  /* (MSB=1), so we always need to add the */ \
-                                 /* initial value (input value is never */ \
-                                 /* negative as we inverted it at the */ \
-                                 /* start of this routine) */ \
-  \
-  scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
-  scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
-  \
-  mm6 = scale0; \
-  mm7 = scale1; \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, mm6); \
-  mm1 = _mm_mulhi_pi16(mm1, mm7); \
-  \
-  mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1));   /* determine if scale... */ \
-                                              /* is negative */ \
-  mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
-  \
-  mm6 = _mm_and_si64(mm6, mm4);               /* and add input if it is */ \
-  mm7 = _mm_and_si64(mm7, mm5); \
-  mm0 = _mm_add_pi16(mm0, mm6); \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  \
-  mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1));   /* then check if... */ \
-  mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1));   /* negative input */ \
-  \
-  mm4 = _mm_and_si64(mm4, scale0);            /* and add scale if it is */ \
-  mm5 = _mm_and_si64(mm5, scale1); \
-  mm0 = _mm_add_pi16(mm0, mm4); \
-  mm1 = _mm_add_pi16(mm1, mm5); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
-  _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
-  \
-  workspace += DCTSIZE; \
-  divisors += DCTSIZE; \
-  output_ptr += DCTSIZE; \
-}
-
-
-void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
-                        DCTELEM *workspace)
-{
-  JCOEFPTR output_ptr = coef_block;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 corr0, corr1, recip0, recip1, scale0, scale1;
-
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-}
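
DO_QUANT avoids a per-coefficient division: it folds the sign out, adds a precomputed correction, multiplies by a 16-bit reciprocal and a scale (both read from the divisors table), and folds the sign back in, so the whole quantizer runs on shifts, adds and high-half multiplies. The net effect approximates a rounded divide; a scalar model of that contract (assumed, for illustration; the real tables handle the rounding and the reciprocal's sign bit exactly as the comments above describe):

static short quantize_ref(short coef, unsigned short divisor)
{
  int sign = coef >> 15;                 /* -1 if negative, else 0 */
  int mag = (coef ^ sign) - sign;        /* |coef|, same xor/sub trick */
  mag = (mag + divisor / 2) / divisor;   /* round-to-nearest divide */
  return (short)((mag ^ sign) - sign);   /* restore the sign */
}
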
diff --git a/simd/mips64/jccolext-mmi.c b/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000..558eb2a
--- /dev/null
+++ b/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+      cb = _mm_or_si64(cbe, cbo);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);
+      cr = _mm_or_si64(cre, cro);
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);
+      _mm_store_si64((__m64 *)&outptr1[0], cb);
+      _mm_store_si64((__m64 *)&outptr2[0], cr);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
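
The rewritten equation in the comment above, Y = 0.29900 R + 0.33700 G + 0.11400 B + 0.25000 G, exists so that each _mm_madd_pi16 can consume an interleaved (R,G) or (B,G) pair with a single constant pair, while the exact +/-0.5 coefficients in Cb and Cr reduce to a shift (the halfr*/halfb* values). A scalar sketch of the Y path, assuming SCALEBITS == 16 as in the x86 ports (illustrative only):

#define SCALE_BITS  16
#define ONE_HALF    (1 << (SCALE_BITS - 1))

static unsigned char rgb_to_y_ref(int r, int g, int b)
{
  /* 19595 = FIX(0.29900), 22086 = FIX(0.33700),
   * 7471 = FIX(0.11400), 16384 = FIX(0.25000); they sum to 65536 */
  int y = 19595 * r + 22086 * g +        /* madd pair 1: (R, G) */
           7471 * b + 16384 * g +        /* madd pair 2: (B, G) */
          ONE_HALF;                      /* PD_ONEHALF rounding */
  return (unsigned char)(y >> SCALE_BITS);
}
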
diff --git a/simd/loongson/jccolor-mmi.c b/simd/mips64/jccolor-mmi.c
similarity index 100%
rename from simd/loongson/jccolor-mmi.c
rename to simd/mips64/jccolor-mmi.c
diff --git a/simd/mips64/jcgray-mmi.c b/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000..9c7b833
--- /dev/null
+++ b/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
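
The stanzas above are the compile-time template mechanism shared with the x86 SIMD code: jcgryext-mmi.c is compiled once per pixel layout, with RGB_RED/RGB_GREEN/RGB_BLUE/RGB_PIXELSIZE selecting the channel offsets and the entry point renamed for each colorspace extension. An include file is used rather than a function-like macro because the body runs to hundreds of lines, inline asm included. A toy single-file model of the same idea, with hypothetical names:

/* Body "template" as a macro; the real code plays the same trick with
 * repeated #include of jcgryext-mmi.c. Coefficients sum to 65536. */
#define DEFINE_GRAY_FN(NAME, R_OFF, G_OFF, B_OFF, PIXSIZE)                 \
  static void NAME(const unsigned char *in, unsigned char *out, int n)     \
  {                                                                        \
    for (int i = 0; i < n; i++, in += (PIXSIZE))                           \
      out[i] = (unsigned char)((19595 * in[(R_OFF)] + 38470 * in[(G_OFF)]  \
                                + 7471 * in[(B_OFF)] + 32768) >> 16);      \
  }

DEFINE_GRAY_FN(gray_from_rgb,  0, 1, 2, 3)   /* like EXT_RGB  */
DEFINE_GRAY_FN(gray_from_bgrx, 2, 1, 0, 4)   /* like EXT_BGRX */
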
diff --git a/simd/mips64/jcgryext-mmi.c b/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000..08a83d6
--- /dev/null
+++ b/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
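+
+      /* Splitting Y keeps every multiplier within the signed 16-bit range
+       * required by _mm_madd_pi16(): FIX(0.58700) = 38470 exceeds 32767,
+       * while FIX(0.33700) = 22086 and FIX(0.25000) = 16384 both fit.  The
+       * extra 0.25 * G term rides in the PW_F0114_F0250 constant below; a
+       * scalar sketch of the whole computation follows this function. */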
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
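+
+#if 0
+/* Scalar model of the luma computation above (illustrative sketch only,
+ * not compiled; the helper name is hypothetical, and SCALEBITS == 16 is
+ * assumed, so FIX(x) == (int)(x * 65536 + 0.5)). */
+static JSAMPLE rgb_to_gray_ref(int r, int g, int b)
+{
+  /* 19595 + 22086 + 7471 + 16384 == 65536, so white maps to white */
+  long y = 19595L * r +           /* FIX(0.29900) */
+           22086L * g +           /* FIX(0.33700) */
+            7471L * b +           /* FIX(0.11400) */
+           16384L * g +           /* FIX(0.25000) */
+           32768L;                /* 1 << (SCALEBITS - 1) */
+  return (JSAMPLE)(y >> 16);
+}
+#endif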
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/mips64/jcsample-mmi.c b/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000..0354dac
--- /dev/null
+++ b/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
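+  /* The alternating {1, 2, 1, 2} word bias reproduces the rounding scheme
+   * of the scalar code in jcsample.c: rounding up and down on alternate
+   * output columns avoids a systematic bias toward larger values. */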
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
+
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
+
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
+
+      avg = _mm_packs_pu16(thisavg, nextavg);
+
+      _mm_store_si64((__m64 *)&outptr[0], avg);
+    }
+  }
+}
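+
+#if 0
+/* Scalar model of one output pixel of the downsampler above (illustrative
+ * sketch only, not compiled; the helper name is hypothetical): average a
+ * 2x2 input block with the alternating {1, 2} rounding bias, where outcol
+ * is the output column index. */
+static JSAMPLE h2v2_avg_ref(int s00, int s01, int s10, int s11, int outcol)
+{
+  int bias = (outcol & 1) ? 2 : 1;
+  return (JSAMPLE)((s00 + s01 + s10 + s11 + bias) >> 2);
+}
+#endif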
diff --git a/simd/loongson/jcsample.h b/simd/mips64/jcsample.h
similarity index 90%
rename from simd/loongson/jcsample.h
rename to simd/mips64/jcsample.h
index 2ac4816..bd07fcc 100644
--- a/simd/loongson/jcsample.h
+++ b/simd/mips64/jcsample.h
@@ -20,7 +20,7 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/simd/mips64/jdcolext-mmi.c b/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000..3b5b2f2
--- /dev/null
+++ b/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
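+
+      /* FIX(1.40200) = 91881 and FIX(1.77200) = 116130 exceed the signed
+       * 16-bit range, so each is split into a small fractional multiplier
+       * plus whole additions of Cr or Cb.  _mm_mulhi_pi16() yields the high
+       * word of the 32-bit product, i.e. (x * c) >> 16; multiplying by
+       * 2*Cb/2*Cr first and then computing (product + 1) >> 1 keeps one
+       * extra bit of precision and rounds the halved result. */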
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800)) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800)) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
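+        /* Tail store: write the remaining col = num_cols * 3 bytes with
+         * successively smaller stores (16, 8, 4, 2, then 1 byte). */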
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
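+
+#if 0
+/* Scalar model of the equations above (illustrative sketch only, not
+ * compiled; the helper name is hypothetical, SCALEBITS == 16 is assumed,
+ * chroma is already centered to [-128, 127], and clamping of the results
+ * to [0, 255] is omitted). */
+static void ycc_to_rgb_ref(int y, int cb, int cr, int *r, int *g, int *b)
+{
+  const int ONE_HALF = 1 << 15;
+
+  *r = y + ((91881 * cr + ONE_HALF) >> 16);              /* FIX(1.40200) */
+  *g = y + ((-22554 * cb - 46802 * cr + ONE_HALF) >> 16);
+                                        /* -FIX(0.34414) - FIX(0.71414) */
+  *b = y + ((116130 * cb + ONE_HALF) >> 16);             /* FIX(1.77200) */
+}
+#endif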
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/loongson/jdcolor-mmi.c b/simd/mips64/jdcolor-mmi.c
similarity index 100%
rename from simd/loongson/jdcolor-mmi.c
rename to simd/mips64/jdcolor-mmi.c
diff --git a/simd/mips64/jdmerge-mmi.c b/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000..0a39bd5
--- /dev/null
+++ b/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
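+/* With RGBX_FILLER_0XFF defined, the padding byte of 4-byte pixel formats
+ * is written as 0xFF (opaque, if a consumer treats it as alpha) rather
+ * than 0x00. */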
+
+
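+/* jdmrgext-mmi.c is a template: it is included once with the default RGB
+ * layout from jmorecfg.h and once per extended colorspace below, with the
+ * RGB_* macros and the function names redefined before each inclusion, so
+ * a single source file yields specialized upsamplers for every supported
+ * pixel format. */
+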
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/simd/mips64/jdmrgext-mmi.c b/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000..be09ff2
--- /dev/null
+++ b/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
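+    /* h2v1: each chroma sample covers two horizontally adjacent luma
+     * samples, so one 8-sample Cb/Cr load pairs with two 8-sample Y loads
+     * and yields 16 output pixels per iteration. */
+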
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800)) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800)) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 6 + 3;
+      else
+        col = num_cols * 6;
+
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 2 + 1;
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800)) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
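+  /* h2v2 merged upsampling is performed as two h2v1 passes: the luma row
+   * pointer is temporarily redirected to each of the two full-resolution Y
+   * rows in the group, and the saved pointers are restored afterward. */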
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
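+
+/* The h2v2 wrapper above reuses the h2v1 kernel: a 2x2-subsampled row group
+ * shares one chroma row between two luma rows, so it temporarily points
+ * input_buf[0][in_row_group_ctr] at each full-resolution luma row in turn
+ * (and output_buf[0] at the second output row) before restoring the saved
+ * pointers. */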
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/mips64/jdsample-mmi.c b/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000..8ae94e7
--- /dev/null
+++ b/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(2, 2, 2, 2),
+  _uint64_set_pi16(3, 3, 3, 3),
+  _uint64_set_pi16(7, 7, 7, 7),
+  _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  \
+  wk[row] = samp7XXX; \
+  \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
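+
+/* PROCESS_ROW is, in effect, a vectorized form of the scalar triangular
+ * filter used by the C fancy upsampler (cf. jdsample.c):
+ *
+ *   out[2 * i]     = (3 * s[i] + s[i - 1] + bias1) >> shift;
+ *   out[2 * i + 1] = (3 * s[i] + s[i + 1] + bias2) >> shift;
+ *
+ * It consumes 8 input samples (samp0123/samp4567) and emits 16 output
+ * samples per invocation; wk[] carries the edge samples between adjacent
+ * 8-sample blocks.  h2v1 calls it with (bias1, bias2, shift) = (1, 2, 2) on
+ * raw samples, h2v2 with (8, 7, 4) on the 4x-scaled vertical column sums.
+ */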
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
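+
+    /* The asm above duplicates the last input sample one slot past the end
+     * of each row so that the s[i + 1] filter tap and the full 8-sample
+     * vector loads stay within meaningful data when the width is not a
+     * multiple of 8 (the sample rows are assumed to have writable padding
+     * past downsampled_width, as the other SIMD implementations also
+     * assume). */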
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][0]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/simd/mips64/jfdctfst-mmi.c b/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000..f7caf09
--- /dev/null
+++ b/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
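+
+/* There is no single widening multiply, so the mullo/mulhi pair above
+ * reassembles the full 32-bit products.  Per 16-bit lane, DO_FDCT_MULTIPLY
+ * is equivalent to the scalar
+ *
+ *   out[i] = (int16_t)(((int32_t)in[i] * multiplier) >> CONST_BITS);
+ *
+ * with _mm_packs_pi32() providing signed saturation on the way back down to
+ * 16 bits.
+ */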
+
+#define DO_FDCT_COMMON() { \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
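+
+/* DO_FDCT_COMMON is the shared even/odd butterfly network of the AAN
+ * (Arai/Agui/Nakajima) 8-point DCT; each __m64 lane carries the same
+ * butterfly stage for one of four rows/columns processed in parallel.  As
+ * with the scalar jfdctfst.c, the outputs remain scaled, and the remaining
+ * descaling is folded into the quantization divisors.
+ */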
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/simd/loongson/jfdctint-mmi.c b/simd/mips64/jfdctint-mmi.c
similarity index 100%
rename from simd/loongson/jfdctint-mmi.c
rename to simd/mips64/jfdctint-mmi.c
diff --git a/simd/mips64/jidctfst-mmi.c b/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000..503bb35
--- /dev/null
+++ b/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(3) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
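+
+/* _mm_mulhi_pi16() keeps only the high 16 bits of each 32-bit product, i.e.
+ * it implicitly divides by 65536.  Pre-shifting the operand left by
+ * PRE_MULTIPLY_SCALE_BITS (2) and storing the constants shifted left by
+ * CONST_SHIFT (6) restores the CONST_BITS (8) fixed-point scale:
+ *
+ *   ((v << 2) * (FIX(x) << 6)) >> 16  ==  (v * FIX(x)) >> 8  ==  v * x
+ */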
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
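+
+/* __m64 and __m32 are carried in floating-point registers on Loongson (they
+ * are typedefs of double and float in loongson-mmintrin.h), so the zero
+ * tests above reinterpret the bits through a pointer cast rather than
+ * comparing the values directly.
+ */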
+
+
+#define DO_IDCT_COMMON() { \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation, using three word-sized subtractions) \
+   * tmp12 = (-2.613125930 + 3) * z10 - 3 * z10 + z5; \
+   *       = 0.386874070 * z10 - 3 * z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
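+
+/* Note the `goto nextcolumn##iter` above: a cheap 32-bit probe of rows 1-2
+ * gates a full check of all AC rows, and if every AC coefficient in the
+ * 4-column slice is zero, the pass degenerates to replicating the
+ * dequantized DC value and jumps to the matching label that the caller
+ * places after each DO_IDCT_PASS1() invocation (see jsimd_idct_ifast_mmi()
+ * below).
+ */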
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/simd/loongson/jidctint-mmi.c b/simd/mips64/jidctint-mmi.c
similarity index 100%
rename from simd/loongson/jidctint-mmi.c
rename to simd/mips64/jidctint-mmi.c
diff --git a/simd/mips64/jquanti-mmi.c b/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000..339002f
--- /dev/null
+++ b/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  \
+  /* and add scale if it is */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
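+
+/* DO_QUANT quantizes 8 coefficients at a time, replacing the per-coefficient
+ * division of the C quantizer with fixed-point multiplies.  Roughly, with
+ * t = abs(v) + correction:
+ *
+ *   output = sign(v) * (((((t * reciprocal) >> 16) + t) * scale) >> 16);
+ *
+ * divisors[] holds three DCTSIZE2-element tables (reciprocals, corrections,
+ * scales), and the masked additions above compensate for _mm_mulhi_pi16()
+ * being a signed multiply where an unsigned high product is needed.
+ */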
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/simd/loongson/jsimd.c b/simd/mips64/jsimd.c
similarity index 66%
rename from simd/loongson/jsimd.c
rename to simd/mips64/jsimd.c
index e8b1832..e8f1af5 100644
--- a/simd/loongson/jsimd.c
+++ b/simd/mips64/jsimd.c
@@ -1,11 +1,11 @@
 /*
- * jsimd_loongson.c
+ * jsimd_mips64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,7 +13,7 @@
  *
  * This file contains the interface between the "normal" portions
  * of the library and the SIMD implementations when running on a
- * Loongson architecture.
+ * 64-bit MIPS architecture.
  */
 
 #define JPEG_INTERNALS
@@ -24,8 +24,76 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>  /* for malloc(), free() */
+
 static unsigned int simd_support = ~0;
 
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
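+
+/* check_feature() matches `feature` as a whitespace-delimited word on the
+ * "ASEs implemented" line of /proc/cpuinfo, which (depending on the kernel
+ * and CPU) looks something like:
+ *
+ *   ASEs implemented : loongson-mmi loongson-cam loongson-ext
+ */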
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
 /*
  * Check what SIMD accelerations are supported.
  *
@@ -37,14 +105,32 @@
 #ifndef NO_GETENV
   char *env = NULL;
 #endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
 
+  simd_support = 0;
+
+#if defined(__linux__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
   simd_support |= JSIMD_MMI;
+#endif
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
@@ -73,6 +159,19 @@
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -150,6 +249,37 @@
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -311,6 +441,17 @@
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -327,17 +468,42 @@
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -345,12 +511,74 @@
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -397,6 +625,17 @@
 GLOBAL(int)
 jsimd_can_fdct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -415,6 +654,7 @@
 GLOBAL(void)
 jsimd_fdct_ifast(DCTELEM *data)
 {
+  jsimd_fdct_ifast_mmi(data);
 }
 
 GLOBAL(void)
@@ -537,6 +777,25 @@
 GLOBAL(int)
 jsimd_can_idct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -559,6 +818,7 @@
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
 }
 
 GLOBAL(void)
diff --git a/simd/loongson/jsimd_mmi.h b/simd/mips64/jsimd_mmi.h
similarity index 83%
rename from simd/loongson/jsimd_mmi.h
rename to simd/mips64/jsimd_mmi.h
index 59b2ee0..5e4261c 100644
--- a/simd/loongson/jsimd_mmi.h
+++ b/simd/mips64/jsimd_mmi.h
@@ -1,11 +1,12 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
  *           CaiWanwei   <caiwanwei@loongson.cn>
  *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -32,6 +33,13 @@
 
 
 /* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
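+
+/* The inline assembly in the MMI routines does pointer arithmetic in
+ * integer registers, so it must use the doubleword instructions (daddu/dsll)
+ * under the 64-bit n64 ABI and the word forms (addu/sll) otherwise. */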
 
 #define SIZEOF_MMWORD  8
 #define BYTE_BIT  8
@@ -47,11 +55,13 @@
    ((uint64_t)(uint8_t)f << 16) | \
    ((uint64_t)(uint8_t)g << 8)  | \
    ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
 #define _uint64_set_pi16(a, b, c, d) \
   (((uint64_t)(uint16_t)a << 48) | \
    ((uint64_t)(uint16_t)b << 32) | \
    ((uint64_t)(uint16_t)c << 16) | \
    ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
 #define _uint64_set_pi32(a, b) \
   (((uint64_t)(uint32_t)a << 32) | \
    ((uint64_t)(uint32_t)b))
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/mips64/loongson-mmintrin.h
similarity index 98%
rename from simd/loongson/loongson-mmintrin.h
rename to simd/mips64/loongson-mmintrin.h
index 50d166b..db9b35a 100644
--- a/simd/loongson/loongson-mmintrin.h
+++ b/simd/mips64/loongson-mmintrin.h
@@ -1217,14 +1217,24 @@
 extern __inline void FUNCTION_ATTRIBS
 _mm_store_si64(__m64 *dest, __m64 src)
 {
-  asm("gssdlc1 %1, 7+%0\n\t"
-      "gssdrc1 %1, %0\n\t"
+  asm("sdc1 %1, %0 \n\t"
       : "=m" (*dest)
       : "f" (src)
       : "memory"
      );
 }
 
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
 extern __inline __m64 FUNCTION_ATTRIBS
 _mm_load_si32(const __m32 *src)
 {
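
[Editor's note] The store intrinsics are now split by alignment: _mm_store_si64() uses the plain sdc1 doubleword store, which on MIPS traditionally assumes an 8-byte-aligned address, while the new _mm_storeu_si64() keeps the Loongson gssdlc1/gssdrc1 pair that writes the upper and lower parts of an unaligned doubleword. Callers that can guarantee alignment get the single cheap instruction; the rest use the unaligned variant. A hypothetical caller-side wrapper (not library code):

#include <stdint.h>

/* Hypothetical helper, not part of the library: pick the cheap aligned
 * store when the address allows it, the gssdlc1/gssdrc1 pair otherwise. */
static inline void store_m64_any(__m64 *dest, __m64 src)
{
  if (((uintptr_t)dest & 7) == 0)
    _mm_store_si64(dest, src);    /* aligned: single sdc1 */
  else
    _mm_storeu_si64(dest, src);   /* unaligned: two-instruction pair */
}
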
diff --git a/simd/nasm/jpeg_nbits_table.inc b/simd/nasm/jpeg_nbits_table.inc
deleted file mode 100644
index 2ce6c28..0000000
--- a/simd/nasm/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
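[Editor's note] The deleted file is a precomputed 64 KB table: entry i holds nbits(i), the bit length of i (0 for i = 0, floor(log2 i) + 1 otherwise) for every 16-bit coefficient magnitude, so each value v fills entries 2^(v-1) through 2^v - 1, which is why the rows below come in doubling runs. Its removal suggests the NASM Huffman encoder now derives the bit length another way, typically with a bit-scan instruction; the replacement is not shown in this hunk. A computed equivalent, for reference only:

/* Computed equivalent of the deleted table (illustrative; this diff
 * does not show the actual replacement).  __builtin_clz is a GCC/Clang
 * builtin; BSR/LZCNT on x86 or CLZ on MIPS do the same job in assembly. */
static inline int jpeg_nbits(unsigned int x)  /* 0 <= x < 65536 */
{
  return x ? 32 - __builtin_clz(x) : 0;
}
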
-jpeg_nbits_table db \
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4, \
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
  [... 927 identical deleted lines omitted: each is the same run of sixteen "16" entries ending in a line-continuation backslash ...]
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
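[Editor's note: if, as the all-16 tail suggests, the rows deleted above are the tail of a bit-length ("nbits") lookup table indexed by 16-bit magnitude, the same value can be computed without a 64 KB table by a bit-scan instruction. A hedged sketch, not taken from this patch — the function name is hypothetical and the table's purpose is inferred, not confirmed by this hunk:]

    ; nbits32(x) = number of bits needed to represent x (0 -> 0), i.e. the
    ; value such a table would store at index x.  Assemble with nasm -felf64.
    section .text
    global  nbits32
    nbits32:                        ; unsigned x in edi, result in eax
        xor     eax, eax            ; nbits(0) == 0
        bsr     edi, edi            ; edi = index of highest set bit; ZF if x==0
        jz      .done
        lea     eax, [edi+1]        ; nbits = highest set bit index + 1
    .done:
        ret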
diff --git a/simd/nasm/jsimdext.inc b/simd/nasm/jsimdext.inc
index 9930d80..e8d50b0 100644
--- a/simd/nasm/jsimdext.inc
+++ b/simd/nasm/jsimdext.inc
@@ -2,8 +2,9 @@
 ; jsimdext.inc - common declarations
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, 2019, D. R. Commander.
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
 ; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
 ;
@@ -130,13 +131,53 @@
 ;  Common types
 ;
 %ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
 %define POINTER         qword           ; general pointer type
 %define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
 %define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%else
+%define resp            resq
+%define dp              dq
+%define raxp            rax
+%define rbxp            rbx
+%define rcxp            rcx
+%define rdxp            rdx
+%define rsip            rsi
+%define rdip            rdi
+%define rbpp            rbp
+%define rspp            rsp
+%define r8p             r8
+%define r9p             r9
+%define r10p            r10
+%define r11p            r11
+%define r12p            r12
+%define r13p            r13
+%define r14p            r14
+%define r15p            r15
+%endif
+%endif
+%ifndef raxp
 %define POINTER         dword           ; general pointer type
 %define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
 %define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resd
+%define dp              dd
+; x86_64 ILP32 ABI (x32)
+%define raxp            eax
+%define rbxp            ebx
+%define rcxp            ecx
+%define rdxp            edx
+%define rsip            esi
+%define rdip            edi
+%define rbpp            ebp
+%define rspp            esp
+%define r8p             r8d
+%define r9p             r9d
+%define r10p            r10d
+%define r11p            r11d
+%define r12p            r12d
+%define r13p            r13d
+%define r14p            r14d
+%define r15p            r15d
 %endif
 
 %define INT             dword           ; signed integer type
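
The rNp defines above exist because, under the x32 (ILP32) ABI selected by -mx32, pointers occupy 4 bytes even though the CPU runs in 64-bit mode. A minimal C illustration (not part of the library) of why the SIMD code must narrow its pointer loads:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      /* Two adjacent 32-bit pointer slots, as a JSAMPARRAY is laid out
         under ILP32 (x32) */
      unsigned char rows[8] = { 0x88, 0x77, 0x66, 0x55,   /* rows[0] */
                                0xDD, 0xCC, 0xBB, 0xAA }; /* rows[1] */
      uint32_t narrow;  /* what "mov rdip, ..." (really mov edi, ...) reads */
      uint64_t wide;    /* what an unmodified "mov rdi, ..." would read */

      memcpy(&narrow, rows, sizeof(narrow));
      memcpy(&wide, rows, sizeof(wide));
      printf("32-bit load: 0x%08x\n", (unsigned)narrow);
      printf("64-bit load: 0x%016llx\n", (unsigned long long)wide);
      /* Prints 0x55667788 vs. 0xaabbccdd55667788: the 64-bit load drags in
         the first half of the next row pointer, which is why pointer-sized
         accesses go through the rNp/POINTER abstractions. */
      return 0;
    }
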
diff --git a/simd/powerpc/jcsample.h b/simd/powerpc/jcsample.h
index 2ac4816..bd07fcc 100644
--- a/simd/powerpc/jcsample.h
+++ b/simd/powerpc/jcsample.h
@@ -20,7 +20,7 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/simd/x86_64/jccolext-avx2.asm b/simd/x86_64/jccolext-avx2.asm
index 10d2834..ffb527d 100644
--- a/simd/x86_64/jccolext-avx2.asm
+++ b/simd/x86_64/jccolext-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -77,10 +78,10 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
-    mov         rbx, JSAMPROW [rbx]     ; outptr1
-    mov         rdx, JSAMPROW [rdx]     ; outptr2
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         near .columnloop
diff --git a/simd/x86_64/jccolext-sse2.asm b/simd/x86_64/jccolext-sse2.asm
index 2c914d3..af70ed6 100644
--- a/simd/x86_64/jccolext-sse2.asm
+++ b/simd/x86_64/jccolext-sse2.asm
@@ -2,6 +2,7 @@
 ; jccolext.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,9 +57,9 @@
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -76,10 +77,10 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
-    mov         rbx, JSAMPROW [rbx]     ; outptr1
-    mov         rdx, JSAMPROW [rdx]     ; outptr2
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         near .columnloop
diff --git a/simd/x86_64/jcgryext-avx2.asm b/simd/x86_64/jcgryext-avx2.asm
index 175b60d..ddcc2c0 100644
--- a/simd/x86_64/jcgryext-avx2.asm
+++ b/simd/x86_64/jcgryext-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright (C) 2011, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,7 +58,7 @@
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
 
     pop         rcx
@@ -71,8 +72,8 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         near .columnloop
diff --git a/simd/x86_64/jcgryext-sse2.asm b/simd/x86_64/jcgryext-sse2.asm
index 873be80..f1d399a 100644
--- a/simd/x86_64/jcgryext-sse2.asm
+++ b/simd/x86_64/jcgryext-sse2.asm
@@ -2,6 +2,7 @@
 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
 ; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +57,7 @@
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
 
     pop         rcx
@@ -70,8 +71,8 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         near .columnloop
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index aa78fd5..0072028 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,11 +16,25 @@
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
 
 %include "jsimdext.inc"
 
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
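+
+Roughly, these strucs mirror the C-side layouts the assembly indexes into (a sketch with assumed field types; the authoritative definitions are in jchuff.c and jpeglib.h):
+
+    #include <stddef.h>
+    #include <stdint.h>
+
+    typedef struct {                /* cf. the working_state struc above */
+      uint8_t *next_output_byte;    /* => next byte to write in buffer */
+      size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+      struct {
+        uint64_t put_buffer;        /* current bit accumulation buffer */
+        int free_bits;              /* # of bits available in it */
+        int last_dc_val[4];         /* last DC coef for each component */
+      } cur;
+      void *cinfo;                  /* dump_buffer needs access to this */
+    } working_state_sketch;
+
+    typedef struct {                /* cf. the c_derived_tbl struc above */
+      unsigned int ehufco[256];     /* code for each symbol */
+      char ehufsi[256];             /* length of code for each symbol */
+    } c_derived_tbl_sketch;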
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
@@ -28,134 +43,137 @@
 
 EXTN(jconst_huff_encode_one_block):
 
-%include "jpeg_nbits_table.inc"
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+               dd 0x000f, 0x001f, 0x003f, 0x007f
+               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
 
     alignz      32
 
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+    alignz      32
+
+%define NBITS(x)      nbits_base + x
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
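+
+In scalar terms, the mirrored table and jpeg_mask_bits implement the following (a model, not jchuff.c itself; negative coefficients reach the table already decremented, i.e. in one's-complement form, so no separate absolute-value step is needed):
+
+    /* Equivalent of byte [NBITS(code)] for code in -32768..32767: */
+    static int jpeg_nbits_model(int code)
+    {
+      int mag = code < 0 ? ~code : code;   /* ~code == -code - 1 */
+      int n = 0;
+      while (mag) { n++; mag >>= 1; }
+      return n;
+    }
+
+    /* Equivalent of "and code, dword [MASK_BITS(nbits)]": jpeg_mask_bits[n]
+       is (1 << n) - 1, replacing a variable-count shift with a load. */
+    static int mask_bits_model(int code, int n)
+    {
+      return code & ((1 << n) - 1);
+    }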
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        64
 
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values
+; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
 
-%macro EMIT_BYTE 0
-    sub         put_bits, 8             ; put_bits -= 8;
-    mov         rdx, put_buffer
-    mov         ecx, put_bits
-    shr         rdx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-    mov         byte [buffer], dl       ; *buffer++ = c;
-    add         buffer, 1
-    cmp         dl, 0xFF                ; need to stuff a zero byte?
-    jne         %%.EMIT_BYTE_END
-    mov         byte [buffer], 0        ; *buffer++ = 0;
-    add         buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
 
-%macro PUT_BITS 1
-    add         put_bits, ecx           ; put_bits += size;
-    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
-    or          put_buffer, %1
-%endmacro
-
-%macro CHECKBUF31 0
-    cmp         put_bits, 32            ; if (put_bits > 31) {
-    jl          %%.CHECKBUF31_END
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
-    cmp         put_bits, 48            ; if (put_bits > 47) {
-    jl          %%.CHECKBUF47_END
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
-
-%macro EMIT_BITS 2
-    CHECKBUF47
-    mov         ecx, %2
-    PUT_BITS    %1
-%endmacro
-
-%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor        xmm8, xmm8              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm9, xmm9              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm10, xmm10            ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm11, xmm11            ; __m128i neg = _mm_setzero_si128();
-    pinsrw      %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw      %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw      %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw      %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw      %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw      %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw      %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw      %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw      %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw      %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw      %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw      %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw      %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw      %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw      %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw      %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw      %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw      %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw      %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw      %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw      %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw      %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw      %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw      %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw      %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw      %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw      %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw      %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw      %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw      %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw      %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw      %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw      %37, ebx, 7             ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw     xmm8, %34               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm9, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm10, %36              ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm11, %37              ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw       %34, xmm8               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %35, xmm9               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %36, xmm10              ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %37, xmm11              ; x1 = _mm_add_epi16(x1, neg);
-    pxor        %34, xmm8               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %35, xmm9               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %36, xmm10              ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %37, xmm11              ; x1 = _mm_xor_si128(x1, neg);
-    pxor        xmm8, %34               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm9, %35               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm10, %36              ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm11, %37              ; neg = _mm_xor_si128(neg, x1);
-    movdqa      XMMWORD [t1 + %1 * SIZEOF_WORD], %34           ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa      XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35     ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa      XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36    ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa      XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37    ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa      XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8          ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa      XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9    ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa      XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa      XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+%macro EMIT_QWORD 1-2
+    add         nbitsb, free_bitsb      ; nbits += free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    mov         tempd, code             ; temp = code;
+    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
+    mov         nbitsb, free_bitsb      ; nbits = free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    shr         tempd, nbitsb           ; temp >>= nbits;
+    or          tempq, put_buffer       ; temp |= put_buffer;
+    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
+    bswap       tempq                   ; temp = htonl(temp);
+    mov         put_buffer, codeq       ; put_buffer = code;
+    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+    %2
+    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
+    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
+                                        ; (speculative; will be overwritten if
+                                        ; code contains any 0xFF bytes)
+    add         free_bitsb, 64          ; free_bits += 64;
+    add         bufferp, 8              ; buffer += 8;
+    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
+    jz          %1                      ;   return;
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
+    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempd, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    jmp         %1                      ; return;
 %endmacro
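
In C terms the macro behaves roughly like this sketch (illustrative names; the scalar original is the EMIT_BYTE() macro in jchuff.c):

    #include <stdint.h>
    #include <string.h>

    static uint8_t *emit_qword_model(uint8_t *buffer, uint64_t qword)
    {
      uint8_t b[8];
      int i, has_ff = 0;

      for (i = 0; i < 8; i++)
        b[i] = (uint8_t)(qword >> (56 - 8 * i));  /* MSB first (the bswap) */
      memcpy(buffer, b, 8);                       /* speculative 8-byte store */
      for (i = 0; i < 8; i++)
        has_ff |= (b[i] == 0xFF);
      if (!has_ff)
        return buffer + 8;            /* fast path: no 0xFF, store stands */
      for (i = 0; i < 8; i++) {       /* slow path: restuff byte by byte */
        *buffer++ = b[i];
        if (b[i] == 0xFF)
          *buffer++ = 0x00;
      }
      return buffer;
    }

The speculative store is the point: in the common case (no 0xFF bytes) the whole flush is a single 8-byte write plus one branch.
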
 
 ;
@@ -166,181 +184,399 @@
 ;                                  JCOEFPTR block, int last_dc_val,
 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 ;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8  - dctbl --> code_temp
+; r9  - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
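+
+Regarding the tzcnt note above: in scalar form the bit scan in .BLOOP is just a count of trailing zero bits (a sketch using a GCC/Clang builtin; the explicit xor is only needed at the assembly level):
+
+    #include <stdint.h>
+
+    /* index has one bit set per nonzero coefficient still to encode */
+    static int next_run_length(uint64_t index)
+    {
+      return index ? __builtin_ctzll(index) : 64;
+    }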
 
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13d = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
+%define buffer       rax
+%ifdef WIN64
+%define bufferp      rax
+%else
+%define bufferp      raxp
+%endif
+%define tempq        rbx
+%define tempd        ebx
+%define tempb        bl
+%define temph        bh
+%define nbitsq       rcx
+%define nbits        ecx
+%define nbitsb       cl
+%define block        rdx
+%define nbits_base   rsi
+%define t            rdi
+%define td           edi
+%define codeq        rbp
+%define code         ebp
+%define dctbl        r8
+%define actbl        r9
+%define state        r10
+%define index        r11
+%define indexd       r11d
+%define put_buffer   r12
+%define put_bufferd  r12d
 
-%define t1          rbp - (DCTSIZE2 * SIZEOF_WORD)
-%define t2          t1 - (DCTSIZE2 * SIZEOF_WORD)
-%define put_buffer  r8
-%define put_bits    r9d
-%define buffer      rax
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
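+
+A scalar model of Step 1 (illustrative; jpeg_natural_order is the standard zigzag table from jutils.c):
+
+    #include <stdint.h>
+
+    static const unsigned char jpeg_natural_order[64] = {
+       0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+      12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+      35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+      58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+    };
+
+    /* Store the 63 AC coefficients to t in zigzag order, decrement
+       negatives (one's-complement trick), and build the nonzero bitmap
+       that drives the tzcnt loop. */
+    static uint64_t reorder_model(const int16_t *block, int16_t *t)
+    {
+      uint64_t index = 0;
+      int k;
+
+      for (k = 1; k < 64; k++) {      /* DC (k == 0) is handled separately */
+        int16_t v = block[jpeg_natural_order[k]];
+        if (v < 0)
+          v--;
+        t[k - 1] = v;
+        if (v != 0)
+          index |= 1ULL << (k - 1);
+      }
+      return index;
+    }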
 
     align       32
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
-    push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
-    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [t2]
-    push_xmm    4
-    collect_args 6
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    mov         buffer, rdx
+    mov         block, r8
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
     push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        rsi
+    push        rdi
+    push        r12
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    mov         state, rcx
+    movsx       code, word [block]                        ;Z:     code = block[0];
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         code, r9d                                 ;Z:     code -= last_dc_val;
+    mov         dctbl, POINTER [rsp+6*8+4*8]
+    mov         actbl, POINTER [rsp+6*8+5*8]
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         nbits_base, [rel jpeg_nbits_table]
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD
+    mov         t, rsp
 
-    mov         buffer, r11                  ; r11 is now scratch
+%else
 
-    mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits,    dword [r10+24]  ; put_bits = state->cur.put_bits;
-    push        r10                          ; r10 is now scratch
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
 
-    ; Encode the DC coefficient difference per section F.1.2.1
-    movsx       edi, word [r12]         ; temp = temp2 = block[0] - last_dc_val;
-    sub         edi, r13d               ; r13 is not used anymore
-    mov         ebx, edi
+                                                          ;X: X = code stream
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        r12
+    mov         state, rdi
+    mov         buffer, rsi
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    movsx       codeq, word [block]                       ;Z:     code = block[0];
+    lea         nbits_base, [rel jpeg_nbits_table]
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
 
-    ; This is a well-known technique for obtaining the absolute value
-    ; without a branch.  It is derived from an assembly language technique
-    ; presented in "How to Optimize for the Pentium Processors",
-    ; Copyright (c) 1996, 1997 by Agner Fog.
-    mov         esi, edi
-    sar         esi, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-    xor         edi, esi                ; temp ^= temp3;
-    sub         edi, esi                ; temp -= temp3;
+%endif
 
-    ; For a negative input, want temp2 = bitwise complement of abs(input)
-    ; This code assumes we are on a two's complement machine
-    add         ebx, esi                ; temp2 += temp3;
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
 
-    ; Find the number of bits needed for the magnitude of the coefficient
-    lea         r11, [rel jpeg_nbits_table]
-    movzx       rdi, byte [r11 + rdi]         ; nbits = JPEG_NBITS(temp);
-    ; Emit the Huffman-coded symbol for the number of bits
-    mov         r11d,  INT [r14 + rdi * 4]    ; code = dctbl->ehufco[nbits];
-    movzx       esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
-    EMIT_BITS   r11, esi                      ; EMIT_BITS(code, size)
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
 
-    ; Mask off any extra bits in code
-    mov         esi, 1
-    mov         ecx, edi
-    shl         esi, cl
-    dec         esi
-    and         ebx, esi                ; temp2 &= (((JLONG)1)<<nbits) - 1;
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    ; Emit that number of bits of the value, if positive,
-    ; or the complement of its magnitude, if negative.
-    EMIT_BITS   rbx, edi                ; EMIT_BITS(temp2, nbits)
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
 
-    ; Prepare data
-    xor         ebx, ebx
-    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                   xmm0, xmm1, xmm2, xmm3
-    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                   xmm4, xmm5, xmm6, xmm7
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
+                                                          ;Z:     i.e. if code is positive
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    movsxd      codeq, code                               ;Z:     sign extend code
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
 
-    pxor        xmm8, xmm8
-    pcmpeqw     xmm0, xmm8              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm8              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm8              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm8              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    pcmpeqw     xmm4, xmm8              ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
-    pcmpeqw     xmm5, xmm8              ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
-    pcmpeqw     xmm6, xmm8              ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
-    pcmpeqw     xmm7, xmm8              ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    packsswb    xmm4, xmm5              ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
-    packsswb    xmm6, xmm7              ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
-    pmovmskb    r11d, xmm0              ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    r12d, xmm2              ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    pmovmskb    r13d, xmm4              ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
-    pmovmskb    r14d, xmm6              ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
-    shl         r12, 16
-    shl         r14, 16
-    or          r11, r12
-    or          r13, r14
-    shl         r13, 32
-    or          r11, r13
-    not         r11                     ; index = ~index;
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
-    ;jmp .EFN
+    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
+    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         tempd, 16                                 ;Z:     temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+                                                          ;Z:     temp = dctbl->ehufco[nbits];
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    or          code, tempd                               ;Z:     code |= temp;
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
 
-    mov         r13d,  INT [r15 + 240 * 4]     ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    lea         rsi, [t1]
-.BLOOP:
-    bsf         r12, r11                     ; r = __builtin_ctzl(index);
-    jz          .ELOOP
-    mov         rcx, r12
-    lea         rsi, [rsi+r12*2]             ; k += r;
-    shr         r11, cl                      ; index >>= r;
-    movzx       rdi, word [rsi]              ; temp = t1[k];
-    lea         rbx, [rel jpeg_nbits_table]
-    movzx       rdi, byte [rbx + rdi]        ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
-    cmp         r12, 16                 ; while (r > 15) {
-    jl          .ERLOOP
-    EMIT_BITS   r13, r14d               ; EMIT_BITS(code_0xf0, size_0xf0)
-    sub         r12, 16                 ; r -= 16;
-    jmp         .BRLOOP
-.ERLOOP:
-    ; Emit Huffman symbol for run length / number of bits
-    CHECKBUF31  ; uses rcx, rdx
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    shl         r12, 4                        ; temp3 = (r << 4) + nbits;
-    add         r12, rdi
-    mov         ebx,  INT [r15 + r12 * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
-    PUT_BITS    rbx
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define free_bitsq  rdx
+%define free_bitsd  edx
+%define free_bitsb  dl
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    shl         tempq, 48                                 ;Z:     temp <<= 48;
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
+    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
+    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
 
-    ;EMIT_CODE(code, size)
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    movsx       ebx, word [rsi-DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         rcx, rdi
-    mov         rdx, 1
-    shl         rdx, cl
-    dec         rdx
-    and         rbx, rdx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    PUT_BITS    rbx                     ; PUT_BITS(temp2, nbits)
+    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp  r8d
+    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
+    mov         free_bitsd, [state+working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
+    shl         index, 32                                 ;Z:     index <<= 32;
+    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    or          index, tempq                              ;Z:     index |= temp;
+    not         index                                     ;Z:     index = ~index;
+    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
+    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
+    align       16
+.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
+    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
 
-    shr         r11, 1                  ; index >>= 1;
-    add         rsi, 2                  ; ++k;
-    jmp         .BLOOP
-.ELOOP:
-    ; If the last coef(s) were zero, emit an end-of-block code
-    lea         rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-    cmp         rdi, rsi                      ; if (r > 0) {
-    je          .EFN
-    mov         ebx,  INT [r15]               ; code = actbl->ehufco[0];
-    movzx       r12d, byte [r15 + 1024]       ; size = actbl->ehufsi[0];
-    EMIT_BITS   rbx, r12d
-.EFN:
-    pop         r10
-    ; Save put_buffer & put_bits
-    mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         dword  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    pop         rbx
-    uncollect_args 6
-    pop_xmm     4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+    align       16
+.BRLOOP:                                                  ; do {
+    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
+    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
+    mov         nbits, code_temp                          ;   nbits = code_temp;
+    or          put_buffer, codeq                         ;   put_buffer |= code;
+    cmp         nbits, 16                                 ;   if (nbits <= 16)
+    jle         .ERLOOP                                   ;     break;
+    jmp         .BRLOOP                                   ; } while (1);
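+
+.BRLOOP is the classic ZRL loop from the scalar encoder; schematically (emit_bits_model() is a hypothetical stand-in for the inline put_buffer/free_bits update):
+
+    typedef struct {
+      unsigned int ehufco[256];
+      char ehufsi[256];
+    } tbl_model;
+
+    void emit_bits_model(unsigned int code, int nbits);  /* assumed */
+
+    static int emit_zero_runs(const tbl_model *actbl, int r)
+    {
+      while (r > 15) {               /* run of 16 zeros => ZRL symbol */
+        emit_bits_model(actbl->ehufco[0xF0], actbl->ehufsi[0xF0]);
+        r -= 16;
+      }
+      return r;                      /* remainder goes into run/size */
+    }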
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+    times 5     nop
+.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
+    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
+    or          put_buffer, codeq                         ; put_buffer |= code;
+.BLOOP_COND:                                              ; .BLOOP_COND:
+    test        index, index                              ; if (index != 0)
+    jz          .ELOOP                                    ; {
+.BLOOP:                                                   ;   do {
+    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
+    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
+    inc         nbits                                     ;     ++nbits;
+    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
+    shr         index, nbitsb                             ;     index >>= nbits;
+.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
+    cmp         nbits, 16                                 ;     if (nbits > 16)
+    jg          .BRLOOP                                   ;       goto .BRLOOP;
+.ERLOOP:                                                  ; .ERLOOP:
+    movsx       codeq, word [t]                           ;     code = *t;
+    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
+    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
+    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
+    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+                                                          ;     code_temp = actbl->ehufco[temp-16];
+    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
+    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
+    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+                                                          ;     nbits += actbl->ehufsi[temp-16];
+    or          code, code_temp                           ;     code |= code_temp;
+    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP                                    ;   } while (index != 0);
+.ELOOP:                                                   ; }  /* index != 0 */
+    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
+%else
+    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
+%endif
+    je          .EFN                                      ; {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+.EFN:                                                     ; } }
+    mov         [state + working_state.cur.put_buffer.simd], put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    mov         byte [state + working_state.cur.free_bits], free_bitsb
+                                                          ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    pop         r12
+    pop         rdi
+    pop         rsi
     pop         rbp
+    pop         rbx
+%else
+    pop         r12
+    pop         rbp
+    pop         rbx
+%endif
     ret
 
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP_CODE:
+    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+                                                          ; insert code, flush buffer,
+                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
     align       32
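
Taken as a whole, the routine's bit accounting reduces to this model (assumed names; emit_qword_model() is the flush sketch shown earlier):

    #include <stdint.h>

    uint8_t *emit_qword_model(uint8_t *buffer, uint64_t qword);

    typedef struct {
      uint64_t put_buffer;  /* accumulated bits, left-aligned when flushed */
      int free_bits;        /* 64 minus the number of buffered bits */
    } bitbuf_model;

    static void put_code_model(bitbuf_model *b, uint64_t code, int nbits,
                               uint8_t **bufp)
    {
      b->free_bits -= nbits;
      if (b->free_bits >= 0) {        /* fast path: the code fits */
        b->put_buffer = (b->put_buffer << nbits) | code;
        return;
      }
      /* Slow path (EMIT_QWORD): top the buffer up to exactly 64 bits,
         flush 8 bytes, keep the leftover low bits of code. */
      {
        int fitting = nbits + b->free_bits;   /* free_bits is negative */
        uint64_t q = (b->put_buffer << fitting) | (code >> -b->free_bits);

        *bufp = emit_qword_model(*bufp, q);
        b->put_buffer = code;   /* upper bits are shifted out by later calls */
        b->free_bits += 64;
      }
    }

Keeping free_bits in a single signed byte means one sub per symbol doubles as both the space check and the update, which is what keeps the fast path in .BLOOP so short.
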
diff --git a/simd/x86_64/jcsample-avx2.asm b/simd/x86_64/jcsample-avx2.asm
index d9922bb..b32527a 100644
--- a/simd/x86_64/jcsample-avx2.asm
+++ b/simd/x86_64/jcsample-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -71,7 +72,7 @@
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -107,8 +108,8 @@
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         short .columnloop
@@ -233,7 +234,7 @@
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -269,9 +270,9 @@
     push        rdi
     push        rsi
 
-    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdi, JSAMPROW [rdi]                    ; outptr
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         short .columnloop
diff --git a/simd/x86_64/jcsample-sse2.asm b/simd/x86_64/jcsample-sse2.asm
index 0f107e9..2fcfe45 100644
--- a/simd/x86_64/jcsample-sse2.asm
+++ b/simd/x86_64/jcsample-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +71,7 @@
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -105,8 +106,8 @@
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         short .columnloop
@@ -215,7 +216,7 @@
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -250,9 +251,9 @@
     push        rdi
     push        rsi
 
-    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdi, JSAMPROW [rdi]                    ; outptr
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         short .columnloop
diff --git a/simd/x86_64/jdcolext-avx2.asm b/simd/x86_64/jdcolext-avx2.asm
index 677b8ed..2370fda 100644
--- a/simd/x86_64/jdcolext-avx2.asm
+++ b/simd/x86_64/jdcolext-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,9 +59,9 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -79,10 +80,10 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr0
-    mov         rbx, JSAMPROW [rbx]     ; inptr1
-    mov         rdx, JSAMPROW [rdx]     ; inptr2
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 .columnloop:
 
     vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
diff --git a/simd/x86_64/jdcolext-sse2.asm b/simd/x86_64/jdcolext-sse2.asm
index 071aa62..e07c8d7 100644
--- a/simd/x86_64/jdcolext-sse2.asm
+++ b/simd/x86_64/jdcolext-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -78,10 +79,10 @@
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr0
-    mov         rbx, JSAMPROW [rbx]     ; inptr1
-    mov         rdx, JSAMPROW [rdx]     ; inptr2
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 .columnloop:
 
     movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
diff --git a/simd/x86_64/jdmrgext-avx2.asm b/simd/x86_64/jdmrgext-avx2.asm
index bb733c5..8b264b4 100644
--- a/simd/x86_64/jdmrgext-avx2.asm
+++ b/simd/x86_64/jdmrgext-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,14 +59,14 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
-    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
-    mov         rdi, JSAMPROW [rdi]                      ; outptr
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
 
     pop         rcx                     ; col
 
@@ -514,15 +515,16 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -546,16 +548,16 @@
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
 
     add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
     add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -579,9 +581,10 @@
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
     uncollect_args 4
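
The push/pop sequences that this hunk replaces only build a valid row-pointer table when each JSAMPARRAY entry is 8 bytes, because push always stores a full 64-bit word. With 4-byte entries, the scratch table has to be built with explicitly sized stores at SIZEOF_JSAMPARRAY strides, which is what the sub rsp / mov JSAMPARRAY [...] sequence does (reserving four slots for three pointers presumably keeps rsp 16-byte aligned); the matching add rsp at the end releases the space the pops used to. A rough C analogue of the new layout, with illustrative names and values:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint32_t fake_jsamprow;   /* 4-byte row pointer under ILP32 */

    int main(void)
    {
      /* Stand-in for the "sub rsp, SIZEOF_JSAMPARRAY*4" scratch area. */
      unsigned char scratch[4 * sizeof(fake_jsamprow)];
      fake_jsamprow inptr00 = 0x1000, inptr1 = 0x2000, inptr2 = 0x3000;

      /* Equivalent of "mov JSAMPARRAY [rsp+n*SIZEOF_JSAMPARRAY], r??p":
         each entry lands at a 4-byte stride, not the 8-byte stride that
         push would have produced. */
      memcpy(scratch + 0 * sizeof(fake_jsamprow), &inptr00, sizeof(inptr00));
      memcpy(scratch + 1 * sizeof(fake_jsamprow), &inptr1, sizeof(inptr1));
      memcpy(scratch + 2 * sizeof(fake_jsamprow), &inptr2, sizeof(inptr2));

      /* The merged-upsampling routine then indexes this table like any
         other JSAMPARRAY. */
      fake_jsamprow row;
      memcpy(&row, scratch + 1 * sizeof(fake_jsamprow), sizeof(row));
      printf("entry 1 = %#x\n", row);   /* 0x2000 */
      return 0;
    }
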
diff --git a/simd/x86_64/jdmrgext-sse2.asm b/simd/x86_64/jdmrgext-sse2.asm
index b176a4c..eb3ab9d 100644
--- a/simd/x86_64/jdmrgext-sse2.asm
+++ b/simd/x86_64/jdmrgext-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,14 +58,14 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
-    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
-    mov         rdi, JSAMPROW [rdi]                      ; outptr
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
 
     pop         rcx                     ; col
 
@@ -456,15 +457,16 @@
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -488,16 +490,16 @@
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
 
     add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
     add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -521,9 +523,10 @@
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
     uncollect_args 4
diff --git a/simd/x86_64/jdsample-avx2.asm b/simd/x86_64/jdsample-avx2.asm
index fc274a9..1e4979f 100644
--- a/simd/x86_64/jdsample-avx2.asm
+++ b/simd/x86_64/jdsample-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -76,7 +77,7 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 
     vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
     vpcmpeqb    xmm9, xmm9, xmm9
@@ -90,8 +91,8 @@
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     test        rax, SIZEOF_YMMWORD-1
     jz          short .skip
@@ -235,18 +236,18 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rcx
     push        rdi
     push        rsi
 
-    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 
     vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
     vpcmpeqb    xmm9, xmm9, xmm9
@@ -539,13 +540,13 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
     mov         rax, rdx                ; colctr
 .columnloop:
 
@@ -629,14 +630,14 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]                    ; inptr
-    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
     mov         rax, rdx                               ; colctr
 .columnloop:
 
diff --git a/simd/x86_64/jdsample-sse2.asm b/simd/x86_64/jdsample-sse2.asm
index 20e0767..38dbcee 100644
--- a/simd/x86_64/jdsample-sse2.asm
+++ b/simd/x86_64/jdsample-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -74,14 +75,14 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     test        rax, SIZEOF_XMMWORD-1
     jz          short .skip
@@ -221,18 +222,18 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rcx
     push        rdi
     push        rsi
 
-    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 
     test        rax, SIZEOF_XMMWORD-1
     jz          short .skip
@@ -512,13 +513,13 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
     mov         rax, rdx                ; colctr
 .columnloop:
 
@@ -600,14 +601,14 @@
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]                    ; inptr
-    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
     mov         rax, rdx                               ; colctr
 .columnloop:
 
diff --git a/simd/x86_64/jidctflt-sse2.asm b/simd/x86_64/jidctflt-sse2.asm
index ab95e1a..60bf961 100644
--- a/simd/x86_64/jidctflt-sse2.asm
+++ b/simd/x86_64/jidctflt-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -455,12 +456,12 @@
     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
 
diff --git a/simd/x86_64/jidctfst-sse2.asm b/simd/x86_64/jidctfst-sse2.asm
index a66a681..cb97fdf 100644
--- a/simd/x86_64/jidctfst-sse2.asm
+++ b/simd/x86_64/jidctfst-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -460,21 +461,21 @@
     pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
     pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
 
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
 
diff --git a/simd/x86_64/jidctint-avx2.asm b/simd/x86_64/jidctint-avx2.asm
index 9c38f9e..ca7e317 100644
--- a/simd/x86_64/jidctint-avx2.asm
+++ b/simd/x86_64/jidctint-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -387,23 +388,23 @@
 
     mov         eax, r13d
 
-    mov         rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
 
-    mov         rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
-    mov         rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
 
-    mov         rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
 
diff --git a/simd/x86_64/jidctint-sse2.asm b/simd/x86_64/jidctint-sse2.asm
index 8983bf0..7aa869b 100644
--- a/simd/x86_64/jidctint-sse2.asm
+++ b/simd/x86_64/jidctint-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -817,21 +818,21 @@
     pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
     pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
 
diff --git a/simd/x86_64/jidctred-sse2.asm b/simd/x86_64/jidctred-sse2.asm
index 7fbfcc5..4ece9d8 100644
--- a/simd/x86_64/jidctred-sse2.asm
+++ b/simd/x86_64/jidctred-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -379,12 +380,12 @@
     pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
     pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
     movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-    mov         rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
     movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
@@ -558,8 +559,8 @@
     pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
     pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
     mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
     mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
diff --git a/simd/x86_64/jquantf-sse2.asm b/simd/x86_64/jquantf-sse2.asm
index 83596a9..ab2e395 100644
--- a/simd/x86_64/jquantf-sse2.asm
+++ b/simd/x86_64/jquantf-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,8 +52,8 @@
     mov         rdi, r12
     mov         rcx, DCTSIZE/2
 .convloop:
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
     movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
diff --git a/simd/x86_64/jquanti-avx2.asm b/simd/x86_64/jquanti-avx2.asm
index 5f04d22..70fe811 100644
--- a/simd/x86_64/jquanti-avx2.asm
+++ b/simd/x86_64/jquanti-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2018, D. R. Commander.
 ; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -44,23 +45,23 @@
 
     mov         eax, r11d
 
-    mov         rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
diff --git a/simd/x86_64/jquanti-sse2.asm b/simd/x86_64/jquanti-sse2.asm
index bb6fa69..3ee4420 100644
--- a/simd/x86_64/jquanti-sse2.asm
+++ b/simd/x86_64/jquanti-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,14 +52,14 @@
     mov         rdi, r12
     mov         rcx, DCTSIZE/4
 .convloop:
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
     movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
 
-    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
     movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index 1e5698b..eb76679 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -1031,8 +1031,6 @@
     return 0;
   if (sizeof(JCOEF) != 2)
     return 0;
-  if (SIZEOF_SIZE_T != 8)
-    return 0;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
@@ -1057,8 +1055,6 @@
     return 0;
   if (sizeof(JCOEF) != 2)
     return 0;
-  if (SIZEOF_SIZE_T != 8)
-    return 0;
   if (simd_support & JSIMD_SSE2)
     return 1;
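
The two hunks above drop the SIZEOF_SIZE_T != 8 guard, which had disabled these SSE2 code paths wherever pointers are not 8 bytes wide. With the assembly now loading JSAMPROW pointers at their true width, the check appears unnecessary. For reference, a sketch of how an x32 target identifies itself (__x86_64__ and __ILP32__ are predefined by GCC and Clang for that ABI):

    #include <stdio.h>

    int main(void)
    {
    #if defined(__x86_64__) && defined(__ILP32__)
      puts("x32: x86-64 instruction set with 32-bit pointers");
    #endif
      /* 8 on LP64, 4 on x32; the removed guard keyed off this value. */
      printf("sizeof(size_t) = %zu\n", sizeof(size_t));
      return 0;
    }
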
 
diff --git a/structure.txt b/structure.txt
index c0792a3..15b8d37 100644
--- a/structure.txt
+++ b/structure.txt
@@ -548,13 +548,9 @@
     typedef JSAMPROW *JSAMPARRAY;       ptr to a list of rows
     typedef JSAMPARRAY *JSAMPIMAGE;     ptr to a list of color-component arrays
 
-The basic element type JSAMPLE will typically be one of unsigned char,
-(signed) char, or short.  Short will be used if samples wider than 8 bits are
-to be supported (this is a compile-time option).  Otherwise, unsigned char is
-used if possible.  If the compiler only supports signed chars, then it is
-necessary to mask off the value when reading.  Thus, all reads of JSAMPLE
-values must be coded as "GETJSAMPLE(value)", where the macro will be defined
-as "((value) & 0xFF)" on signed-char machines and "((int) (value))" elsewhere.
+The basic element type JSAMPLE will be one of unsigned char or short.  Short
+will be used if samples wider than 8 bits are to be supported (this is a
+compile-time option).  Otherwise, unsigned char is used.
 
 With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
 simplify correct rounding during downsampling, etc.  The JPEG standard's
@@ -587,7 +583,7 @@
 is helpful when dealing with noninterleaved JPEG files.
 
 In general, a specific sample value is accessed by code such as
-        GETJSAMPLE(image[colorcomponent][row][col])
+        image[colorcomponent][row][col]
 where col is measured from the image left edge, but row is measured from the
 first sample row currently in memory.  Either of the first two indexings can
 be precomputed by copying the relevant pointer.
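
With GETJSAMPLE() retired, a sample access is a plain array expression. A minimal, self-contained illustration built on the typedefs quoted above (the sample values are arbitrary):

    #include <stdio.h>

    typedef unsigned char JSAMPLE;   /* the 8-bit case described above */
    typedef JSAMPLE *JSAMPROW;       /* ptr to a row of samples */
    typedef JSAMPROW *JSAMPARRAY;    /* ptr to a list of rows */
    typedef JSAMPARRAY *JSAMPIMAGE;  /* ptr to a list of component arrays */

    int main(void)
    {
      JSAMPLE row0[4] = { 10, 20, 30, 40 };
      JSAMPROW rows[1] = { row0 };
      JSAMPARRAY component[1] = { rows };
      JSAMPIMAGE image = component;

      /* Formerly GETJSAMPLE(image[0][0][2]); now simply: */
      printf("%d\n", image[0][0][2]);   /* prints 30 */
      return 0;
    }
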
diff --git a/transupp.c b/transupp.c
index 77b33c4..6e86077 100644
--- a/transupp.c
+++ b/transupp.c
@@ -2,7 +2,7 @@
  * transupp.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -89,6 +89,189 @@
 
 
 LOCAL(void)
+dequant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+
+  qtblptr = compptr->quant_table;
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];
+        for (k = 0; k < DCTSIZE2; k++)
+          if (qtblptr->quantval[k] != qtblptr1->quantval[k])
+            ptr[k] *= qtblptr->quantval[k] / qtblptr1->quantval[k];
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+requant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+  JCOEF temp, qval;
+
+  qtblptr = compptr->quant_table;
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];
+        for (k = 0; k < DCTSIZE2; k++) {
+          temp = qtblptr->quantval[k];
+          qval = qtblptr1->quantval[k];
+          if (temp != qval) {
+            temp *= ptr[k];
+            /* The following quantization code is copied from jcdctmgr.c */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a, b)  a /= b
+#else
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
+#endif
+            if (temp < 0) {
+              temp = -temp;
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+              temp = -temp;
+            } else {
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+            }
+            ptr[k] = temp;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/*
+ * Calculate the largest common denominator (that is, the greatest common
+ * divisor) of a and b, using Euclid's algorithm.
+ */
+LOCAL(JCOEF)
+largest_common_denominator(JCOEF a, JCOEF b)
+{
+  JCOEF c;
+
+  do {
+    c = a % b;
+    a = b;
+    b = c;
+  } while (c);
+
+  return a;
+}
+
+
+LOCAL(void)
+adjust_quant(j_decompress_ptr srcinfo, jvirt_barray_ptr *src_coef_arrays,
+             j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+             boolean trim, j_compress_ptr dstinfo)
+{
+  jpeg_component_info *compptr1, *compptr2;
+  JQUANT_TBL *qtblptr1, *qtblptr2, *qtblptr3;
+  int ci, k;
+
+  for (ci = 0; ci < dstinfo->num_components && ci < dropinfo->num_components;
+       ci++) {
+    compptr1 = srcinfo->comp_info + ci;
+    compptr2 = dropinfo->comp_info + ci;
+    qtblptr1 = compptr1->quant_table;
+    qtblptr2 = compptr2->quant_table;
+    for (k = 0; k < DCTSIZE2; k++) {
+      if (qtblptr1->quantval[k] != qtblptr2->quantval[k]) {
+        if (trim)
+          requant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr1);
+        else {
+          qtblptr3 = dstinfo->quant_tbl_ptrs[compptr1->quant_tbl_no];
+          for (k = 0; k < DCTSIZE2; k++)
+            if (qtblptr1->quantval[k] != qtblptr2->quantval[k])
+              qtblptr3->quantval[k] =
+                largest_common_denominator(qtblptr1->quantval[k],
+                                           qtblptr2->quantval[k]);
+          dequant_comp(srcinfo, compptr1, src_coef_arrays[ci], qtblptr3);
+          dequant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr3);
+        }
+        break;
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_drop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Drop (insert) the contents of another image into the source image.  If the
+ * number of components in the drop image is smaller than the number of
+ * components in the destination image, then we fill in the remaining
+ * components with zero.  This allows for dropping the contents of grayscale
+ * images into (arbitrarily sampled) color images.
+ */
+{
+  JDIMENSION comp_width, comp_height;
+  JDIMENSION blk_y, x_drop_blocks, y_drop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = drop_width * compptr->h_samp_factor;
+    comp_height = drop_height * compptr->v_samp_factor;
+    x_drop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_drop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (blk_y = 0; blk_y < comp_height; blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y + y_drop_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (ci < dropinfo->num_components) {
+        src_buffer = (*dropinfo->mem->access_virt_barray)
+          ((j_common_ptr)dropinfo, drop_coef_arrays[ci], blk_y,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          jcopy_block_row(src_buffer[offset_y],
+                          dst_buffer[offset_y] + x_drop_blocks, comp_width);
+        }
+      } else {
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          MEMZERO(dst_buffer[offset_y] + x_drop_blocks,
+                  comp_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
 do_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
         JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
         jvirt_barray_ptr *src_coef_arrays,
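
requant_comp() above dequantizes each stored coefficient with the drop image's quantization value and requantizes it with the source's, using the same round-to-nearest division as jcdctmgr.c. A standalone worked example of that arithmetic; the quant values 4 and 6 and the coefficient -7 are made up for illustration:

    #include <stdio.h>

    /* Rounding division, same pattern as the code above. */
    #define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0

    int main(void)
    {
      int coef = -7;  /* stored coefficient in the drop image */
      int temp = 4;   /* drop image's quant value for this position */
      int qval = 6;   /* source image's quant value for this position */

      temp *= coef;             /* dequantize: -28 is the DCT-domain value */
      if (temp < 0) {
        temp = -temp;           /* 28 */
        temp += qval >> 1;      /* 31, for round-to-nearest */
        DIVIDE_BY(temp, qval);  /* 5 */
        temp = -temp;           /* -5 */
      } else {
        temp += qval >> 1;
        DIVIDE_BY(temp, qval);
      }
      printf("requantized coefficient: %d\n", temp);   /* prints -5 */
      return 0;
    }
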
@@ -125,6 +308,417 @@
 
 
 LOCAL(void)
+do_crop_ext_zero(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: If the destination size is larger than the source, we fill in the
+ * expanded region with zero (neutral gray).  Note that we also have to zero
+ * partial iMCUs at the right and bottom edge of the source image area in this
+ * case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (dstinfo->_jpeg_width > srcinfo->output_width) {
+          if (x_crop_blocks > 0) {
+            MEMZERO(dst_buffer[offset_y], x_crop_blocks * sizeof(JBLOCK));
+          }
+          jcopy_block_row(src_buffer[offset_y],
+                          dst_buffer[offset_y] + x_crop_blocks, comp_width);
+          if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+            MEMZERO(dst_buffer[offset_y] + x_crop_blocks + comp_width,
+                    (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                    sizeof(JBLOCK));
+          }
+        } else {
+          jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                          dst_buffer[offset_y], compptr->width_in_blocks);
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_flat(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source, and we fill in
+ * the expanded region with the DC coefficient of the adjacent block.  Note
+ * that we also have to fill partial iMCUs at the right and bottom edge of the
+ * source image area in this case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JCOEF dc;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_crop_blocks > 0) {
+          MEMZERO(dst_buffer[offset_y], x_crop_blocks * sizeof(JBLOCK));
+          dc = src_buffer[offset_y][0][0];
+          for (dst_blk_x = 0; dst_blk_x < x_crop_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          MEMZERO(dst_buffer[offset_y] + x_crop_blocks + comp_width,
+                  (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                  sizeof(JBLOCK));
+          dc = src_buffer[offset_y][comp_width - 1][0];
+          for (dst_blk_x = x_crop_blocks + comp_width;
+               dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                    jvirt_barray_ptr *src_coef_arrays,
+                    jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source, and we fill in
+ * the expanded region with repeated reflections of the source image.  Note
+ * that we also have to fill partial iMCUs at the right and bottom edge of the
+ * source image area in this case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, src_blk_x;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, k, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) {
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        /* Copy source region */
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (x_crop_blocks > 0) {
+          /* Reflect to left */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks;
+          for (dst_blk_x = x_crop_blocks; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);   /* destination goes left */
+              src_ptr = *src_row_ptr++;     /* source goes right */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          /* Reflect to right */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks + comp_width;
+          for (dst_blk_x = compptr->width_in_blocks - x_crop_blocks - comp_width;
+               dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;     /* destination goes right */
+              src_ptr = *(--src_row_ptr);   /* source goes left */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_wipe(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Wipe - discard image contents of specified region and fill with zero
+ * (neutral gray)
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  int ci, offset_y;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        MEMZERO(buffer[offset_y] + x_wipe_blocks, wipe_width * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flatten(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Flatten - discard image contents of specified region, similarly to wipe,
+ * but fill with the average of adjacent blocks instead of zero.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width, wipe_right;
+  JDIMENSION y_wipe_blocks, wipe_bottom, blk_x;
+  int ci, offset_y, dc_left_value, dc_right_value, average;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_right = wipe_width + x_wipe_blocks;
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        MEMZERO(buffer[offset_y] + x_wipe_blocks, wipe_width * sizeof(JBLOCK));
+        if (x_wipe_blocks > 0) {
+          dc_left_value = buffer[offset_y][x_wipe_blocks - 1][0];
+          if (wipe_right < compptr->width_in_blocks) {
+            dc_right_value = buffer[offset_y][wipe_right][0];
+            average = (dc_left_value + dc_right_value) >> 1;
+          } else {
+            average = dc_left_value;
+          }
+        } else if (wipe_right < compptr->width_in_blocks) {
+          average = buffer[offset_y][wipe_right][0];
+        } else continue;
+        for (blk_x = x_wipe_blocks; blk_x < wipe_right; blk_x++) {
+          buffer[offset_y][blk_x][0] = (JCOEF)average;
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Reflect - discard image contents of specified region, similarly to wipe,
+ * but fill with repeated reflections of the outside region instead of zero.
+ * NB: y_crop_offset is assumed to be zero.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  JDIMENSION src_blk_x, dst_blk_x;
+  int ci, k, offset_y;
+  JBLOCKARRAY buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor;
+    for (y_wipe_blocks = 0; y_wipe_blocks < wipe_bottom;
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_wipe_blocks > 0) {
+          /* Reflect from left */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            for (src_blk_x = x_wipe_blocks;
+                 src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;    /* destination goes right */
+              src_ptr = *(--src_row_ptr);  /* source goes left */
+              /* this unrolled loop doesn't need to know which row it's on... */
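+              /* Mirroring a block about a vertical axis negates its odd
+               * horizontal DCT frequencies, hence the sign flip on odd
+               * columns here and in the right-edge loop below.
+               */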
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else if (compptr->width_in_blocks > x_wipe_blocks + wipe_width) {
+          /* Reflect from right */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks + wipe_width;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) {
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            src_blk_x = compptr->width_in_blocks - x_wipe_blocks - wipe_width;
+            for (; src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);  /* destination goes left */
+              src_ptr = *src_row_ptr++;    /* source goes right */
+              /* this unrolled loop doesn't need to know which row it's on... */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else {
+          MEMZERO(buffer[offset_y] + x_wipe_blocks,
+                  wipe_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
 do_flip_h_no_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
                   JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays)
 /* Horizontal flip; done in-place, so no separate dest array is required.
@@ -783,7 +1377,7 @@
  * The routine returns TRUE if the spec string is valid, FALSE if not.
  *
  * The crop spec string should have the format
- *      <width>[f]x<height>[f]{+-}<xoffset>{+-}<yoffset>
+ *      <width>[{fr}]x<height>[{fr}]{+-}<xoffset>{+-}<yoffset>
  * where width, height, xoffset, and yoffset are unsigned integers.
  * Each of the elements can be omitted to indicate a default value.
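+ * For example (illustrative values), "640x480+16+16" selects a 640x480
+ * region with its upper left corner at (16,16), while "800rx600r+0+0"
+ * requests reflective extension of the image to 800x600.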
  * (A weakness of this style is that it is not possible to omit xoffset
@@ -808,6 +1402,9 @@
     if (*spec == 'f' || *spec == 'F') {
       spec++;
       info->crop_width_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_width_set = JCROP_REFLECT;
     } else
       info->crop_width_set = JCROP_POS;
   }
@@ -819,6 +1416,9 @@
     if (*spec == 'f' || *spec == 'F') {
       spec++;
       info->crop_height_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_height_set = JCROP_REFLECT;
     } else
       info->crop_height_set = JCROP_POS;
   }
@@ -893,10 +1493,10 @@
   jvirt_barray_ptr *coef_arrays;
   boolean need_workspace, transpose_it;
   jpeg_component_info *compptr;
-  JDIMENSION xoffset, yoffset;
+  JDIMENSION xoffset, yoffset, dtemp;
   JDIMENSION width_in_iMCUs, height_in_iMCUs;
   JDIMENSION width_in_blocks, height_in_blocks;
-  int ci, h_samp_factor, v_samp_factor;
+  int itemp, ci, h_samp_factor, v_samp_factor;
 
   /* Determine number of components in output image */
   if (info->force_grayscale &&
@@ -982,39 +1582,129 @@
       info->crop_xoffset = 0;   /* default to +0 */
     if (info->crop_yoffset_set == JCROP_UNSET)
       info->crop_yoffset = 0;   /* default to +0 */
-    if (info->crop_xoffset >= info->output_width ||
-        info->crop_yoffset >= info->output_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
-    if (info->crop_width_set == JCROP_UNSET)
+    if (info->crop_width_set == JCROP_UNSET) {
+      if (info->crop_xoffset >= info->output_width)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
       info->crop_width = info->output_width - info->crop_xoffset;
-    if (info->crop_height_set == JCROP_UNSET)
+    } else {
+      /* Check for crop extension */
+      if (info->crop_width > info->output_width) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_xoffset >= info->crop_width ||
+            info->crop_xoffset > info->crop_width - info->output_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_xoffset >= info->output_width ||
+            info->crop_width <= 0 ||
+            info->crop_xoffset > info->output_width - info->crop_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
+    if (info->crop_height_set == JCROP_UNSET) {
+      if (info->crop_yoffset >= info->output_height)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
       info->crop_height = info->output_height - info->crop_yoffset;
-    /* Ensure parameters are valid */
-    if (info->crop_width <= 0 || info->crop_width > info->output_width ||
-        info->crop_height <= 0 || info->crop_height > info->output_height ||
-        info->crop_xoffset > info->output_width - info->crop_width ||
-        info->crop_yoffset > info->output_height - info->crop_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+    } else {
+      /* Check for crop extension */
+      if (info->crop_height > info->output_height) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_yoffset >= info->crop_height ||
+            info->crop_yoffset > info->crop_height - info->output_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_yoffset >= info->output_height ||
+            info->crop_height <= 0 ||
+            info->crop_yoffset > info->output_height - info->crop_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
     /* Convert negative crop offsets into regular offsets */
-    if (info->crop_xoffset_set == JCROP_NEG)
-      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
-    else
+    if (info->crop_xoffset_set != JCROP_NEG)
       xoffset = info->crop_xoffset;
-    if (info->crop_yoffset_set == JCROP_NEG)
-      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
+    else if (info->crop_width > info->output_width) /* crop extension */
+      xoffset = info->crop_width - info->output_width - info->crop_xoffset;
     else
+      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
+    if (info->crop_yoffset_set != JCROP_NEG)
       yoffset = info->crop_yoffset;
+    else if (info->crop_height > info->output_height) /* crop extension */
+      yoffset = info->crop_height - info->output_height - info->crop_yoffset;
+    else
+      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
     /* Now adjust so that upper left corner falls at an iMCU boundary */
-    if (info->crop_width_set == JCROP_FORCE)
-      info->output_width = info->crop_width;
-    else
-      info->output_width =
-        info->crop_width + (xoffset % info->iMCU_sample_width);
-    if (info->crop_height_set == JCROP_FORCE)
-      info->output_height = info->crop_height;
-    else
-      info->output_height =
-        info->crop_height + (yoffset % info->iMCU_sample_height);
+    switch (info->transform) {
+    case JXFORM_DROP:
+      /* Ensure the effective drop region will not exceed the requested one */
+      itemp = info->iMCU_sample_width;
+      dtemp = itemp - 1 - ((xoffset + itemp - 1) % itemp);
+      xoffset += dtemp;
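+      /* The two lines above round xoffset up to the next iMCU boundary;
+       * e.g. (illustrative values) with iMCU_sample_width == 16 and
+       * xoffset == 5, dtemp = 15 - ((5 + 15) % 16) = 11, so xoffset
+       * becomes 16.  The dtemp samples absorbed by the rounding are then
+       * deducted when computing the effective drop width below.
+       */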
+      if (info->crop_width <= dtemp)
+        info->drop_width = 0;
+      else if (xoffset + info->crop_width - dtemp == info->output_width)
+        /* Matching right edge: include partial iMCU */
+        info->drop_width = (info->crop_width - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_width = (info->crop_width - dtemp) / itemp;
+      itemp = info->iMCU_sample_height;
+      dtemp = itemp - 1 - ((yoffset + itemp - 1) % itemp);
+      yoffset += dtemp;
+      if (info->crop_height <= dtemp)
+        info->drop_height = 0;
+      else if (yoffset + info->crop_height - dtemp == info->output_height)
+        /* Matching bottom edge: include partial iMCU */
+        info->drop_height = (info->crop_height - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_height = (info->crop_height - dtemp) / itemp;
+      /* Check if sampling factors match for dropping */
+      if (info->drop_width != 0 && info->drop_height != 0)
+        for (ci = 0; ci < info->num_components &&
+                     ci < info->drop_ptr->num_components; ci++) {
+          if (info->drop_ptr->comp_info[ci].h_samp_factor *
+              srcinfo->max_h_samp_factor !=
+              srcinfo->comp_info[ci].h_samp_factor *
+              info->drop_ptr->max_h_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].h_samp_factor,
+              info->drop_ptr->max_h_samp_factor,
+              srcinfo->comp_info[ci].h_samp_factor,
+              srcinfo->max_h_samp_factor, 'h');
+          if (info->drop_ptr->comp_info[ci].v_samp_factor *
+              srcinfo->max_v_samp_factor !=
+              srcinfo->comp_info[ci].v_samp_factor *
+              info->drop_ptr->max_v_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].v_samp_factor,
+              info->drop_ptr->max_v_samp_factor,
+              srcinfo->comp_info[ci].v_samp_factor,
+              srcinfo->max_v_samp_factor, 'v');
+        }
+      break;
+    case JXFORM_WIPE:
+      /* Ensure the effective wipe region will cover the requested one */
+      info->drop_width = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_width + (xoffset % info->iMCU_sample_width)),
+         (long)info->iMCU_sample_width);
+      info->drop_height = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_height + (yoffset % info->iMCU_sample_height)),
+         (long)info->iMCU_sample_height);
+      break;
+    default:
+      /* Ensure the effective crop region will cover the requested one */
+      if (info->crop_width_set == JCROP_FORCE ||
+          info->crop_width > info->output_width)
+        info->output_width = info->crop_width;
+      else
+        info->output_width =
+          info->crop_width + (xoffset % info->iMCU_sample_width);
+      if (info->crop_height_set == JCROP_FORCE ||
+          info->crop_height > info->output_height)
+        info->output_height = info->crop_height;
+      else
+        info->output_height =
+          info->crop_height + (yoffset % info->iMCU_sample_height);
+    }
     /* Save x/y offsets measured in iMCUs */
     info->x_crop_offset = xoffset / info->iMCU_sample_width;
     info->y_crop_offset = yoffset / info->iMCU_sample_height;
@@ -1030,7 +1720,9 @@
   transpose_it = FALSE;
   switch (info->transform) {
   case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
+    if (info->x_crop_offset != 0 || info->y_crop_offset != 0 ||
+        info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height)
       need_workspace = TRUE;
     /* No workspace needed if neither cropping nor transforming */
     break;
@@ -1084,6 +1776,10 @@
     need_workspace = TRUE;
     transpose_it = TRUE;
     break;
+  case JXFORM_WIPE:
+    break;
+  case JXFORM_DROP:
+    break;
   }
 
   /* Allocate workspace if needed.
@@ -1187,47 +1883,47 @@
   if (length < 12) return; /* Length of an IFD entry */
 
   /* Discover byte order */
-  if (GETJOCTET(data[0]) == 0x49 && GETJOCTET(data[1]) == 0x49)
+  if (data[0] == 0x49 && data[1] == 0x49)
     is_motorola = FALSE;
-  else if (GETJOCTET(data[0]) == 0x4D && GETJOCTET(data[1]) == 0x4D)
+  else if (data[0] == 0x4D && data[1] == 0x4D)
     is_motorola = TRUE;
   else
     return;
 
   /* Check Tag Mark */
   if (is_motorola) {
-    if (GETJOCTET(data[2]) != 0) return;
-    if (GETJOCTET(data[3]) != 0x2A) return;
+    if (data[2] != 0) return;
+    if (data[3] != 0x2A) return;
   } else {
-    if (GETJOCTET(data[3]) != 0) return;
-    if (GETJOCTET(data[2]) != 0x2A) return;
+    if (data[3] != 0) return;
+    if (data[2] != 0x2A) return;
   }
 
   /* Get first IFD offset (offset to IFD0) */
   if (is_motorola) {
-    if (GETJOCTET(data[4]) != 0) return;
-    if (GETJOCTET(data[5]) != 0) return;
-    firstoffset = GETJOCTET(data[6]);
+    if (data[4] != 0) return;
+    if (data[5] != 0) return;
+    firstoffset = data[6];
     firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[7]);
+    firstoffset += data[7];
   } else {
-    if (GETJOCTET(data[7]) != 0) return;
-    if (GETJOCTET(data[6]) != 0) return;
-    firstoffset = GETJOCTET(data[5]);
+    if (data[7] != 0) return;
+    if (data[6] != 0) return;
+    firstoffset = data[5];
     firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[4]);
+    firstoffset += data[4];
   }
   if (firstoffset > length - 2) return; /* check end of data segment */
 
   /* Get the number of directory entries contained in this IFD */
   if (is_motorola) {
-    number_of_tags = GETJOCTET(data[firstoffset]);
+    number_of_tags = data[firstoffset];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset + 1]);
+    number_of_tags += data[firstoffset + 1];
   } else {
-    number_of_tags = GETJOCTET(data[firstoffset + 1]);
+    number_of_tags = data[firstoffset + 1];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset]);
+    number_of_tags += data[firstoffset];
   }
   if (number_of_tags == 0) return;
   firstoffset += 2;
@@ -1237,13 +1933,13 @@
     if (firstoffset > length - 12) return; /* check end of data segment */
     /* Get Tag number */
     if (is_motorola) {
-      tagnum = GETJOCTET(data[firstoffset]);
+      tagnum = data[firstoffset];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset + 1]);
+      tagnum += data[firstoffset + 1];
     } else {
-      tagnum = GETJOCTET(data[firstoffset + 1]);
+      tagnum = data[firstoffset + 1];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset]);
+      tagnum += data[firstoffset];
     }
     if (tagnum == 0x8769) break; /* found ExifSubIFD offset Tag */
     if (--number_of_tags == 0) return;
@@ -1252,29 +1948,29 @@
 
   /* Get the ExifSubIFD offset */
   if (is_motorola) {
-    if (GETJOCTET(data[firstoffset + 8]) != 0) return;
-    if (GETJOCTET(data[firstoffset + 9]) != 0) return;
-    offset = GETJOCTET(data[firstoffset + 10]);
+    if (data[firstoffset + 8] != 0) return;
+    if (data[firstoffset + 9] != 0) return;
+    offset = data[firstoffset + 10];
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset + 11]);
+    offset += data[firstoffset + 11];
   } else {
-    if (GETJOCTET(data[firstoffset + 11]) != 0) return;
-    if (GETJOCTET(data[firstoffset + 10]) != 0) return;
-    offset = GETJOCTET(data[firstoffset + 9]);
+    if (data[firstoffset + 11] != 0) return;
+    if (data[firstoffset + 10] != 0) return;
+    offset = data[firstoffset + 9];
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset + 8]);
+    offset += data[firstoffset + 8];
   }
   if (offset > length - 2) return; /* check end of data segment */
 
   /* Get the number of directory entries contained in this SubIFD */
   if (is_motorola) {
-    number_of_tags = GETJOCTET(data[offset]);
+    number_of_tags = data[offset];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset + 1]);
+    number_of_tags += data[offset + 1];
   } else {
-    number_of_tags = GETJOCTET(data[offset + 1]);
+    number_of_tags = data[offset + 1];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset]);
+    number_of_tags += data[offset];
   }
   if (number_of_tags < 2) return;
   offset += 2;
@@ -1284,13 +1980,13 @@
     if (offset > length - 12) return; /* check end of data segment */
     /* Get Tag number */
     if (is_motorola) {
-      tagnum = GETJOCTET(data[offset]);
+      tagnum = data[offset];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset + 1]);
+      tagnum += data[offset + 1];
     } else {
-      tagnum = GETJOCTET(data[offset + 1]);
+      tagnum = data[offset + 1];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset]);
+      tagnum += data[offset];
     }
     if (tagnum == 0xA002 || tagnum == 0xA003) {
       if (tagnum == 0xA002)
@@ -1384,7 +2080,7 @@
   dstinfo->jpeg_height = info->output_height;
 #endif
 
-  /* Transpose destination image parameters */
+  /* Transpose destination image parameters, adjust quantization */
   switch (info->transform) {
   case JXFORM_TRANSPOSE:
   case JXFORM_TRANSVERSE:
@@ -1396,6 +2092,12 @@
 #endif
     transpose_critical_parameters(dstinfo);
     break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      adjust_quant(srcinfo, src_coef_arrays,
+                   info->drop_ptr, info->drop_coef_arrays,
+                   info->trim, dstinfo);
+    break;
   default:
 #if JPEG_LIB_VERSION < 80
     dstinfo->image_width = info->output_width;
@@ -1408,12 +2110,12 @@
   if (srcinfo->marker_list != NULL &&
       srcinfo->marker_list->marker == JPEG_APP0 + 1 &&
       srcinfo->marker_list->data_length >= 6 &&
-      GETJOCTET(srcinfo->marker_list->data[0]) == 0x45 &&
-      GETJOCTET(srcinfo->marker_list->data[1]) == 0x78 &&
-      GETJOCTET(srcinfo->marker_list->data[2]) == 0x69 &&
-      GETJOCTET(srcinfo->marker_list->data[3]) == 0x66 &&
-      GETJOCTET(srcinfo->marker_list->data[4]) == 0 &&
-      GETJOCTET(srcinfo->marker_list->data[5]) == 0) {
+      srcinfo->marker_list->data[0] == 0x45 &&
+      srcinfo->marker_list->data[1] == 0x78 &&
+      srcinfo->marker_list->data[2] == 0x69 &&
+      srcinfo->marker_list->data[3] == 0x66 &&
+      srcinfo->marker_list->data[4] == 0 &&
+      srcinfo->marker_list->data[5] == 0) {
     /* Suppress output of JFIF marker */
     dstinfo->write_JFIF_header = FALSE;
     /* Adjust Exif image parameters */
@@ -1462,7 +2164,23 @@
    */
   switch (info->transform) {
   case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
+    if (info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height) {
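+      /* Crop extension: the new area is filled per the width spec, i.e.
+       * 'r' (JCROP_REFLECT) reflects the image, 'f' (JCROP_FORCE) flattens
+       * with nearest-block DC values, and the default fills with zeros.
+       */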
+      if (info->output_width > srcinfo->output_width &&
+          info->crop_width_set == JCROP_REFLECT)
+        do_crop_ext_reflect(srcinfo, dstinfo,
+                            info->x_crop_offset, info->y_crop_offset,
+                            src_coef_arrays, dst_coef_arrays);
+      else if (info->output_width > srcinfo->output_width &&
+               info->crop_width_set == JCROP_FORCE)
+        do_crop_ext_flat(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+      else
+        do_crop_ext_zero(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+    } else if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
       do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
               src_coef_arrays, dst_coef_arrays);
     break;
@@ -1498,6 +2216,30 @@
     do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
                src_coef_arrays, dst_coef_arrays);
     break;
+  case JXFORM_WIPE:
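+    /* Reflection is only usable when the wipe region spans the full image
+     * height and touches the left or right edge (the test below); if the
+     * geometry does not qualify, fall back to an ordinary gray-out wipe.
+     */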
+    if (info->crop_width_set == JCROP_REFLECT &&
+        info->y_crop_offset == 0 && info->drop_height ==
+        (JDIMENSION)jdiv_round_up
+          ((long)info->output_height, (long)info->iMCU_sample_height) &&
+        (info->x_crop_offset == 0 ||
+         info->x_crop_offset + info->drop_width ==
+         (JDIMENSION)jdiv_round_up
+           ((long)info->output_width, (long)info->iMCU_sample_width)))
+      do_reflect(srcinfo, dstinfo, info->x_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else if (info->crop_width_set == JCROP_FORCE)
+      do_flatten(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else
+      do_wipe(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_width, info->drop_height);
+    break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      do_drop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_ptr, info->drop_coef_arrays,
+              info->drop_width, info->drop_height);
+    break;
   }
 }
 
@@ -1604,20 +2346,20 @@
     if (dstinfo->write_JFIF_header &&
         marker->marker == JPEG_APP0 &&
         marker->data_length >= 5 &&
-        GETJOCTET(marker->data[0]) == 0x4A &&
-        GETJOCTET(marker->data[1]) == 0x46 &&
-        GETJOCTET(marker->data[2]) == 0x49 &&
-        GETJOCTET(marker->data[3]) == 0x46 &&
-        GETJOCTET(marker->data[4]) == 0)
+        marker->data[0] == 0x4A &&
+        marker->data[1] == 0x46 &&
+        marker->data[2] == 0x49 &&
+        marker->data[3] == 0x46 &&
+        marker->data[4] == 0)
       continue;                 /* reject duplicate JFIF */
     if (dstinfo->write_Adobe_marker &&
         marker->marker == JPEG_APP0 + 14 &&
         marker->data_length >= 5 &&
-        GETJOCTET(marker->data[0]) == 0x41 &&
-        GETJOCTET(marker->data[1]) == 0x64 &&
-        GETJOCTET(marker->data[2]) == 0x6F &&
-        GETJOCTET(marker->data[3]) == 0x62 &&
-        GETJOCTET(marker->data[4]) == 0x65)
+        marker->data[0] == 0x41 &&
+        marker->data[1] == 0x64 &&
+        marker->data[2] == 0x6F &&
+        marker->data[3] == 0x62 &&
+        marker->data[4] == 0x65)
       continue;                 /* reject duplicate Adobe */
     jpeg_write_marker(dstinfo, marker->marker,
                       marker->data, marker->data_length);
diff --git a/transupp.h b/transupp.h
index 80264cc..ea6be1f 100644
--- a/transupp.h
+++ b/transupp.h
@@ -2,7 +2,7 @@
  * transupp.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -62,6 +62,17 @@
  * output image covers at least the requested region, but may cover more.)
  * The adjustment of the region dimensions may be optionally disabled.
  *
+ * A complementary lossless wipe option is provided to discard (gray out) data
+ * inside a given image region while losslessly preserving what is outside.
+ * A lossless drop option is also provided, which allows another JPEG image to
+ * be inserted ("dropped") into the source image data at a given position,
+ * replacing the existing image data at that position.  Both the source image
+ * and the drop image must have the same subsampling level.  It is best if they
+ * also have the same quantization (quality).  Otherwise, the quantization of
+ * the output image will be adapted to accommodate the higher of the source
+ * image quality and the drop image quality.  The trim option can be used with
+ * the drop option to requantize the drop image to match the source image.
+ *
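+ * A minimal sketch of setting up a drop request, using the names declared
+ * in this header (illustrative only; drop_srcinfo stands for a
+ * decompression object that has already read the drop file's header):
+ *
+ *      jpeg_transform_info info;
+ *      info.transform = JXFORM_DROP;
+ *      info.crop = TRUE;
+ *      jtransform_parse_crop_spec(&info, "+16+16");  /* drop position */
+ *      info.drop_ptr = &drop_srcinfo;
+ *      info.drop_coef_arrays = jpeg_read_coefficients(&drop_srcinfo);
+ *      /* then proceed with jtransform_request_workspace() as usual */
+ *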
  * We also provide a lossless-resize option, which is kind of a lossless-crop
  * operation in the DCT coefficient block domain - it discards higher-order
  * coefficients and losslessly preserves lower-order coefficients of a
@@ -92,20 +103,23 @@
   JXFORM_TRANSVERSE,      /* transpose across UR-to-LL axis */
   JXFORM_ROT_90,          /* 90-degree clockwise rotation */
   JXFORM_ROT_180,         /* 180-degree rotation */
-  JXFORM_ROT_270          /* 270-degree clockwise (or 90 ccw) */
+  JXFORM_ROT_270,         /* 270-degree clockwise (or 90 ccw) */
+  JXFORM_WIPE,            /* wipe */
+  JXFORM_DROP             /* drop */
 } JXFORM_CODE;
 
 /*
  * Codes for crop parameters, which can individually be unspecified,
  * positive or negative for xoffset or yoffset,
- * positive or forced for width or height.
+ * positive, force, or reflect for width or height.
  */
 
 typedef enum {
   JCROP_UNSET,
   JCROP_POS,
   JCROP_NEG,
-  JCROP_FORCE
+  JCROP_FORCE,
+  JCROP_REFLECT
 } JCROP_CODE;
 
 /*
@@ -120,7 +134,7 @@
   boolean perfect;              /* if TRUE, fail if partial MCUs are requested */
   boolean trim;                 /* if TRUE, trim partial MCUs as needed */
   boolean force_grayscale;      /* if TRUE, convert color image to grayscale */
-  boolean crop;                 /* if TRUE, crop source image */
+  boolean crop;                 /* if TRUE, crop or wipe the source image,
+                                   or drop another image into it */
   boolean slow_hflip;  /* For best performance, the JXFORM_FLIP_H transform
                           normally modifies the source coefficients in place.
                           Setting this to TRUE will instead use a slower,
@@ -133,14 +147,18 @@
    * These can be filled in by jtransform_parse_crop_spec().
    */
   JDIMENSION crop_width;        /* Width of selected region */
-  JCROP_CODE crop_width_set;    /* (forced disables adjustment) */
+  JCROP_CODE crop_width_set;    /* (force disables adjustment) */
   JDIMENSION crop_height;       /* Height of selected region */
-  JCROP_CODE crop_height_set;   /* (forced disables adjustment) */
+  JCROP_CODE crop_height_set;   /* (force disables adjustment) */
   JDIMENSION crop_xoffset;      /* X offset of selected region */
   JCROP_CODE crop_xoffset_set;  /* (negative measures from right edge) */
   JDIMENSION crop_yoffset;      /* Y offset of selected region */
   JCROP_CODE crop_yoffset_set;  /* (negative measures from bottom edge) */
 
+  /* Drop parameters: set by caller for drop request */
+  j_decompress_ptr drop_ptr;
+  jvirt_barray_ptr *drop_coef_arrays;
+
   /* Internal workspace: caller should not touch these */
   int num_components;           /* # of components in workspace */
   jvirt_barray_ptr *workspace_coef_arrays; /* workspace for transformations */
@@ -148,6 +166,8 @@
   JDIMENSION output_height;
   JDIMENSION x_crop_offset;     /* destination crop offsets measured in iMCUs */
   JDIMENSION y_crop_offset;
+  JDIMENSION drop_width;        /* drop/wipe dimensions measured in iMCUs */
+  JDIMENSION drop_height;
   int iMCU_sample_width;        /* destination iMCU size */
   int iMCU_sample_height;
 } jpeg_transform_info;
diff --git a/usage.txt b/usage.txt
index ad0aa8d..f7fa3c0 100644
--- a/usage.txt
+++ b/usage.txt
@@ -50,11 +50,9 @@
 This syntax works on all systems, so it is useful for scripts.
 
 The currently supported image file formats are: PPM (PBMPLUS color format),
-PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster Toolkit
-format).  (RLE is supported only if the URT library is available, which it
-isn't on most non-Unix systems.)  cjpeg recognizes the input image format
-automatically, with the exception of some Targa files.  You have to tell djpeg
-which format to generate.
+PGM (PBMPLUS grayscale format), BMP, GIF, and Targa.  cjpeg recognizes the
+input image format automatically, with the exception of some Targa files.  You
+have to tell djpeg which format to generate.
 
 JPEG files are in the de facto standard JFIF file format.  There are other,
 less widely used JPEG-based file formats, but we don't support them.
@@ -76,10 +74,10 @@
 
         -grayscale      Create monochrome JPEG file from color input.
                         Be sure to use this switch when compressing a grayscale
-                        BMP file, because cjpeg isn't bright enough to notice
-                        whether a BMP file uses only shades of gray.  By
-                        saying -grayscale, you'll get a smaller JPEG file that
-                        takes less time to process.
+                        BMP or GIF file, because cjpeg isn't bright enough to
+                        notice whether a BMP or GIF file uses only shades of
+                        gray.  By saying -grayscale, you'll get a smaller JPEG
+                        file that takes less time to process.
 
         -rgb            Create RGB JPEG file.
                         Using this switch suppresses the conversion from RGB
@@ -298,10 +296,17 @@
                         is specified, or if the JPEG file is grayscale;
                         otherwise, 24-bit full-color format is emitted.
 
-        -gif            Select GIF output format.  Since GIF does not support
-                        more than 256 colors, -colors 256 is assumed (unless
-                        you specify a smaller number of colors).  If you
-                        specify -fast, the default number of colors is 216.
+        -gif            Select GIF output format (LZW-compressed).  Since GIF
+                        does not support more than 256 colors, -colors 256 is
+                        assumed (unless you specify a smaller number of
+                        colors).  If you specify -fast, the default number of
+                        colors is 216.
+
+        -gif0           Select GIF output format (uncompressed).  Since GIF
+                        does not support more than 256 colors, -colors 256 is
+                        assumed (unless you specify a smaller number of
+                        colors).  If you specify -fast, the default number of
+                        colors is 216.
 
         -os2            Select BMP output format (OS/2 1.x flavor).  8-bit
                         colormapped format is emitted if -colors or -grayscale
@@ -313,8 +318,6 @@
                         grayscale or if -grayscale is specified; otherwise
                         PPM is emitted.
 
-        -rle            Select RLE output format.  (Requires URT library.)
-
         -targa          Select Targa output format.  Grayscale format is
                         emitted if the JPEG file is grayscale or if
                         -grayscale is specified; otherwise, colormapped format
@@ -421,11 +424,6 @@
 is often a lot more than it is on larger files.  (At present, -optimize
 mode is always selected when generating progressive JPEG files.)
 
-Support for GIF input files was removed in cjpeg v6b due to concerns over
-the Unisys LZW patent.  Although this patent expired in 2006, cjpeg still
-lacks GIF support, for these historical reasons.  (Conversion of GIF files to
-JPEG is usually a bad idea anyway.)
-
 
 HINTS FOR DJPEG
 
@@ -440,10 +438,6 @@
 much lower quality than the default behavior.  "-dither none" may give
 acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
 
-To avoid the Unisys LZW patent (now expired), djpeg produces uncompressed GIF
-files.  These are larger than they should be, but are readable by standard GIF
-decoders.
-
 
 HINTS FOR BOTH PROGRAMS
 
@@ -550,6 +544,43 @@
         -crop WxH+X+Y   Crop to a rectangular region of width W and height H,
                         starting at point X,Y.
 
+If W or H is larger than the width/height of the input image, then the output
+image is expanded in size, and the expanded region is filled in with zeros
+(neutral gray).  Attaching an 'f' character ("flatten") to the width number
+will cause each block in the expanded region to be filled in with the DC
+coefficient of the nearest block in the input image rather than grayed out.
+Attaching an 'r' character ("reflect") to the width number will cause the
+expanded region to be filled in with repeated reflections of the input image
+rather than grayed out.
+
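+For example, a hypothetical invocation (file names are placeholders) that
+expands a 640x480 input to 800x600, filling the new right and bottom
+margins with reflections of the image:
+
+        jpegtran -crop 800rx600r+0+0 in.jpg > out.jpg
+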
+A complementary lossless wipe option is provided to discard (gray out) data
+inside a given image region while losslessly preserving what is outside:
+        -wipe WxH+X+Y   Wipe (gray out) a rectangular region of width W and
+                        height H from the input image, starting at point X,Y.
+
+Attaching an 'f' character ("flatten") to the width number will cause the
+region to be filled with the average of adjacent blocks rather than grayed out.
+If the wipe region and the region outside the wipe region, when adjusted to the
+nearest iMCU boundary, form two horizontally adjacent rectangles, then
+attaching an 'r' character ("reflect") to the width number will cause the wipe
+region to be filled with repeated reflections of the outside region rather than
+grayed out.
+
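+For example, a hypothetical invocation (file names are placeholders) that
+fills a 128x128 region at (256,64) with the average of the adjacent blocks
+rather than gray:
+
+        jpegtran -wipe 128fx128+256+64 in.jpg > out.jpg
+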
+A lossless drop option is also provided, which allows another JPEG image to be
+inserted ("dropped") into the input image data at a given position, replacing
+the existing image data at that position:
+        -drop +X+Y filename     Drop (insert) another image at point X,Y
+
+Both the input image and the drop image must have the same subsampling level.
+It is best if they also have the same quantization (quality).  Otherwise, the
+quantization of the output image will be adapted to accommodate the higher of
+the input image quality and the drop image quality.  The trim option can be
+used with the drop option to requantize the drop image to match the input
+image.  Note that a grayscale image can be dropped into a full-color image or
+vice versa, as long as the full-color image has no vertical subsampling.  If
+the input image is grayscale and the drop image is full-color, then the
+chrominance channels from the drop image will be discarded.
+
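+For example, a hypothetical invocation (file names are placeholders) that
+inserts logo.jpg at (32,32), requantizing it as needed to match in.jpg:
+
+        jpegtran -drop +32+32 logo.jpg -trim in.jpg > out.jpg
+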
 Other not-strictly-lossless transformation switches are:
 
         -grayscale      Force grayscale output.
diff --git a/win/gcc/projectTargets-release.cmake.in b/win/gcc/projectTargets-release.cmake.in
new file mode 100644
index 0000000..1e1a8a3
--- /dev/null
+++ b/win/gcc/projectTargets-release.cmake.in
@@ -0,0 +1,49 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/libjpeg.dll.a"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/libjpeg-62.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg "${_IMPORT_PREFIX}/lib/libjpeg.dll.a" "${_IMPORT_PREFIX}/bin/libjpeg-62.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/libturbojpeg.dll.a"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/libturbojpeg.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg "${_IMPORT_PREFIX}/lib/libturbojpeg.dll.a" "${_IMPORT_PREFIX}/bin/libturbojpeg.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libturbojpeg.a"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg-static "${_IMPORT_PREFIX}/lib/libturbojpeg.a" )
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libjpeg.a"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg-static "${_IMPORT_PREFIX}/lib/libjpeg.a" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/win/jconfig.h.in b/win/jconfig.h.in
index 6db0b34..13cceef 100644
--- a/win/jconfig.h.in
+++ b/win/jconfig.h.in
@@ -18,7 +18,6 @@
 #define HAVE_UNSIGNED_SHORT
 #undef INCOMPLETE_TYPES_BROKEN
 #undef RIGHT_SHIFT_IS_UNSIGNED
-#undef __CHAR_UNSIGNED__
 
 /* Define "boolean" as unsigned char, not int, per Windows custom */
 #ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
diff --git a/win/projectTargets.cmake.in b/win/projectTargets.cmake.in
new file mode 100644
index 0000000..05ab498
--- /dev/null
+++ b/win/projectTargets.cmake.in
@@ -0,0 +1,115 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5)
+   message(FATAL_ERROR "CMake >= 2.6.0 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.6)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_targetsDefined)
+set(_targetsNotDefined)
+set(_expectedTargets)
+foreach(_expectedTarget @CMAKE_PROJECT_NAME@::jpeg @CMAKE_PROJECT_NAME@::turbojpeg @CMAKE_PROJECT_NAME@::turbojpeg-static @CMAKE_PROJECT_NAME@::jpeg-static)
+  list(APPEND _expectedTargets ${_expectedTarget})
+  if(NOT TARGET ${_expectedTarget})
+    list(APPEND _targetsNotDefined ${_expectedTarget})
+  endif()
+  if(TARGET ${_expectedTarget})
+    list(APPEND _targetsDefined ${_expectedTarget})
+  endif()
+endforeach()
+if("${_targetsDefined}" STREQUAL "${_expectedTargets}")
+  unset(_targetsDefined)
+  unset(_targetsNotDefined)
+  unset(_expectedTargets)
+  set(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT "${_targetsDefined}" STREQUAL "")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n")
+endif()
+unset(_targetsDefined)
+unset(_targetsNotDefined)
+unset(_expectedTargets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target @CMAKE_PROJECT_NAME@::jpeg
+add_library(@CMAKE_PROJECT_NAME@::jpeg SHARED IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::turbojpeg
+add_library(@CMAKE_PROJECT_NAME@::turbojpeg SHARED IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::turbojpeg-static
+add_library(@CMAKE_PROJECT_NAME@::turbojpeg-static STATIC IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::jpeg-static
+add_library(@CMAKE_PROJECT_NAME@::jpeg-static STATIC IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Load information for each installed configuration.
+get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+file(GLOB CONFIG_FILES "${_DIR}/@CMAKE_PROJECT_NAME@Targets-*.cmake")
+foreach(f ${CONFIG_FILES})
+  include(${f})
+endforeach()
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(target ${_IMPORT_CHECK_TARGETS} )
+  foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} )
+    if(NOT EXISTS "${file}" )
+      message(FATAL_ERROR "The imported target \"${target}\" references the file
+   \"${file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_IMPORT_CHECK_FILES_FOR_${target})
+endforeach()
+unset(_IMPORT_CHECK_TARGETS)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/win/vc/projectTargets-release.cmake.in b/win/vc/projectTargets-release.cmake.in
new file mode 100644
index 0000000..7abb281
--- /dev/null
+++ b/win/vc/projectTargets-release.cmake.in
@@ -0,0 +1,49 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/jpeg.lib"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/jpeg62.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg "${_IMPORT_PREFIX}/lib/jpeg.lib" "${_IMPORT_PREFIX}/bin/jpeg62.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/turbojpeg.lib"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/turbojpeg.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg "${_IMPORT_PREFIX}/lib/turbojpeg.lib" "${_IMPORT_PREFIX}/bin/turbojpeg.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/turbojpeg-static.lib"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg-static "${_IMPORT_PREFIX}/lib/turbojpeg-static.lib" )
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/jpeg-static.lib"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg-static "${_IMPORT_PREFIX}/lib/jpeg-static.lib" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/wrbmp.c b/wrbmp.c
index 239f64e..408a722 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -141,7 +141,6 @@
     }
   } else if (cinfo->out_color_space == JCS_CMYK) {
     for (col = cinfo->output_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE c = *inptr++, m = *inptr++, y = *inptr++, k = *inptr++;
       cmyk_to_rgb(c, m, y, k, outptr + 2, outptr + 1, outptr);
       outptr += 3;
@@ -153,7 +152,6 @@
     register int ps = rgb_pixelsize[cinfo->out_color_space];
 
     for (col = cinfo->output_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       outptr[0] = inptr[bindex];
       outptr[1] = inptr[gindex];
       outptr[2] = inptr[rindex];
@@ -372,18 +370,18 @@
     if (cinfo->out_color_components == 3) {
       /* Normal case with RGB colormap */
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(colormap[2][i]), outfile);
-        putc(GETJSAMPLE(colormap[1][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(colormap[2][i], outfile);
+        putc(colormap[1][i], outfile);
+        putc(colormap[0][i], outfile);
         if (map_entry_size == 4)
           putc(0, outfile);
       }
     } else {
       /* Grayscale colormap (only happens with grayscale quantization) */
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(colormap[0][i], outfile);
+        putc(colormap[0][i], outfile);
+        putc(colormap[0][i], outfile);
         if (map_entry_size == 4)
           putc(0, outfile);
       }
@@ -438,7 +436,6 @@
   JSAMPARRAY image_ptr;
   register JSAMPROW data_ptr;
   JDIMENSION row;
-  register JDIMENSION col;
   cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 
   if (dest->use_inversion_array) {
@@ -459,10 +456,7 @@
         ((j_common_ptr)cinfo, dest->whole_image, row - 1, (JDIMENSION)1,
          FALSE);
       data_ptr = image_ptr[0];
-      for (col = dest->row_width; col > 0; col--) {
-        putc(GETJSAMPLE(*data_ptr), outfile);
-        data_ptr++;
-      }
+      (void)JFWRITE(outfile, data_ptr, dest->row_width);
     }
     if (progress != NULL)
       progress->completed_extra_passes++;
diff --git a/wrgif.c b/wrgif.c
index 1804e0b..82a2429 100644
--- a/wrgif.c
+++ b/wrgif.c
@@ -3,6 +3,7 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2015-2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -10,12 +11,6 @@
  *
  * This file contains routines to write output images in GIF format.
  *
- **************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,   *
- * this code has been modified to output "uncompressed GIF" files.        *
- * There is no trace of the LZW algorithm in this file.                   *
- **************************************************************************
- *
  * These routines may need modification for non-Unix environments or
  * specialized applications.  As they stand, they assume output to
  * an ordinary stdio stream.
@@ -33,11 +28,6 @@
  *    copyright notice and this permission notice appear in supporting
  *    documentation.  This software is provided "as is" without express or
  *    implied warranty.
- *
- * We are also required to state that
- *    "The Graphics Interchange Format(c) is the Copyright property of
- *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
- *    CompuServe Incorporated."
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
@@ -45,6 +35,37 @@
 #ifdef GIF_SUPPORTED
 
 
+#define MAX_LZW_BITS     12     /* maximum LZW code size (4096 symbols) */
+
+typedef INT16 code_int;         /* must hold -1 .. 2**MAX_LZW_BITS */
+
+#define LZW_TABLE_SIZE   ((code_int)1 << MAX_LZW_BITS)
+
+#define HSIZE            5003   /* hash table size for 80% occupancy */
+
+typedef int hash_int;           /* must hold -2*HSIZE..2*HSIZE */
+
+#define MAXCODE(n_bits)  (((code_int)1 << (n_bits)) - 1)
+
+
+/*
+ * The LZW hash table consists of two parallel arrays:
+ *   hash_code[i]       code of symbol in slot i, or 0 if empty slot
+ *   hash_value[i]      symbol's value; undefined if empty slot
+ * where slot values (i) range from 0 to HSIZE-1.  The symbol value is
+ * its prefix symbol's code concatenated with its suffix character.
+ *
+ * Algorithm:  use open addressing double hashing (no chaining) on the
+ * prefix code / suffix character combination.  We do a variant of Knuth's
+ * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
+ * secondary probe.
+ */
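+
+/* Worked example (illustrative values): with waiting_code == 258 and
+ * suffix byte 0x41, the primary slot is i = (0x41 << 4) + 258 = 1298;
+ * on a collision the secondary probe steps by disp = HSIZE - i, and
+ * since HSIZE (5003) is prime, the probe sequence visits every slot.
+ */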
+
+typedef int hash_entry;         /* must hold (code_int << 8) | byte */
+
+#define HASH_ENTRY(prefix, suffix)  ((((hash_entry)(prefix)) << 8) | (suffix))
+
+
 /* Private version of data destination object */
 
 typedef struct {
@@ -54,14 +75,24 @@
 
   /* State for packing variable-width codes into a bitstream */
   int n_bits;                   /* current number of bits/code */
-  int maxcode;                  /* maximum code, given n_bits */
-  long cur_accum;               /* holds bits not yet output */
+  code_int maxcode;             /* maximum code, given n_bits */
+  int init_bits;                /* initial n_bits ... restored after clear */
+  int cur_accum;                /* holds bits not yet output */
   int cur_bits;                 /* # of bits in cur_accum */
 
+  /* LZW string construction */
+  code_int waiting_code;        /* symbol not yet output; may be extendable */
+  boolean first_byte;           /* if TRUE, waiting_code is not valid */
+
   /* State for GIF code assignment */
-  int ClearCode;                /* clear code (doesn't change) */
-  int EOFCode;                  /* EOF code (ditto) */
-  int code_counter;             /* counts output symbols */
+  code_int ClearCode;           /* clear code (doesn't change) */
+  code_int EOFCode;             /* EOF code (ditto) */
+  code_int free_code;           /* LZW: first not-yet-used symbol code */
+  code_int code_counter;        /* not LZW: counts output symbols */
+
+  /* LZW hash table */
+  code_int *hash_code;          /* => hash table of symbol codes */
+  hash_entry *hash_value;       /* => hash table of symbol values */
 
   /* GIF data packet construction buffer */
   int bytesinpkt;               /* # of bytes in current packet */
@@ -71,9 +102,6 @@
 
 typedef gif_dest_struct *gif_dest_ptr;
 
-/* Largest value that will fit in N bits */
-#define MAXCODE(n_bits)  ((1 << (n_bits)) - 1)
-
 
 /*
  * Routines to package finished data bytes into GIF data blocks.
@@ -105,7 +133,7 @@
 /* Routine to convert variable-width codes into a byte stream */
 
 LOCAL(void)
-output(gif_dest_ptr dinfo, int code)
+output(gif_dest_ptr dinfo, code_int code)
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
@@ -117,74 +145,76 @@
     dinfo->cur_accum >>= 8;
     dinfo->cur_bits -= 8;
   }
+
+  /*
+   * If the next entry is going to be too big for the code size,
+   * then increase it, if possible.  We do this here to ensure
+   * that it's done in sync with the decoder's codesize increases.
+   */
+  if (dinfo->free_code > dinfo->maxcode) {
+    dinfo->n_bits++;
+    if (dinfo->n_bits == MAX_LZW_BITS)
+      dinfo->maxcode = LZW_TABLE_SIZE; /* free_code will never exceed this */
+    else
+      dinfo->maxcode = MAXCODE(dinfo->n_bits);
+  }
 }
 
 
-/* The pseudo-compression algorithm.
- *
- * In this module we simply output each pixel value as a separate symbol;
- * thus, no compression occurs.  In fact, there is expansion of one bit per
- * pixel, because we use a symbol width one bit wider than the pixel width.
- *
- * GIF ordinarily uses variable-width symbols, and the decoder will expect
- * to ratchet up the symbol width after a fixed number of symbols.
- * To simplify the logic and keep the expansion penalty down, we emit a
- * GIF Clear code to reset the decoder just before the width would ratchet up.
- * Thus, all the symbols in the output file will have the same bit width.
- * Note that emitting the Clear codes at the right times is a mere matter of
- * counting output symbols and is in no way dependent on the LZW patent.
- *
- * With a small basic pixel width (low color count), Clear codes will be
- * needed very frequently, causing the file to expand even more.  So this
- * simplistic approach wouldn't work too well on bilevel images, for example.
- * But for output of JPEG conversions the pixel width will usually be 8 bits
- * (129 to 256 colors), so the overhead added by Clear symbols is only about
- * one symbol in every 256.
- */
+/* Compression initialization & termination */
+
+
+LOCAL(void)
+clear_hash(gif_dest_ptr dinfo)
+/* Fill the hash table with empty entries */
+{
+  /* It's sufficient to zero hash_code[] */
+  MEMZERO(dinfo->hash_code, HSIZE * sizeof(code_int));
+}
+
+
+LOCAL(void)
+clear_block(gif_dest_ptr dinfo)
+/* Reset compressor and issue a Clear code */
+{
+  clear_hash(dinfo);                    /* delete all the symbols */
+  dinfo->free_code = dinfo->ClearCode + 2;
+  output(dinfo, dinfo->ClearCode);      /* inform decoder */
+  dinfo->n_bits = dinfo->init_bits;     /* reset code size */
+  dinfo->maxcode = MAXCODE(dinfo->n_bits);
+}
+
 
 LOCAL(void)
 compress_init(gif_dest_ptr dinfo, int i_bits)
-/* Initialize pseudo-compressor */
+/* Initialize compressor */
 {
   /* init all the state variables */
-  dinfo->n_bits = i_bits;
+  dinfo->n_bits = dinfo->init_bits = i_bits;
   dinfo->maxcode = MAXCODE(dinfo->n_bits);
-  dinfo->ClearCode = (1 << (i_bits - 1));
+  dinfo->ClearCode = ((code_int)1 << (i_bits - 1));
   dinfo->EOFCode = dinfo->ClearCode + 1;
-  dinfo->code_counter = dinfo->ClearCode + 2;
+  dinfo->code_counter = dinfo->free_code = dinfo->ClearCode + 2;
+  dinfo->first_byte = TRUE;     /* no waiting symbol yet */
   /* init output buffering vars */
   dinfo->bytesinpkt = 0;
   dinfo->cur_accum = 0;
   dinfo->cur_bits = 0;
+  /* clear hash table */
+  if (dinfo->hash_code != NULL)
+    clear_hash(dinfo);
   /* GIF specifies an initial Clear code */
   output(dinfo, dinfo->ClearCode);
 }
 
 
 LOCAL(void)
-compress_pixel(gif_dest_ptr dinfo, int c)
-/* Accept and "compress" one pixel value.
- * The given value must be less than n_bits wide.
- */
-{
-  /* Output the given pixel value as a symbol. */
-  output(dinfo, c);
-  /* Issue Clear codes often enough to keep the reader from ratcheting up
-   * its symbol size.
-   */
-  if (dinfo->code_counter < dinfo->maxcode) {
-    dinfo->code_counter++;
-  } else {
-    output(dinfo, dinfo->ClearCode);
-    dinfo->code_counter = dinfo->ClearCode + 2; /* reset the counter */
-  }
-}
-
-
-LOCAL(void)
 compress_term(gif_dest_ptr dinfo)
 /* Clean up at end */
 {
+  /* Flush out the buffered LZW code */
+  if (!dinfo->first_byte)
+    output(dinfo, dinfo->waiting_code);
   /* Send an EOF code */
   output(dinfo, dinfo->EOFCode);
   /* Flush the bit-packing buffer */
@@ -221,7 +251,7 @@
 LOCAL(void)
 emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
 /* Output the GIF file header, including color map */
-/* If colormap==NULL, synthesize a grayscale colormap */
+/* If colormap == NULL, synthesize a grayscale colormap */
 {
   int BitsPerPixel, ColorMapSize, InitCodeSize, FlagByte;
   int cshift = dinfo->cinfo->data_precision - 8;
@@ -265,12 +295,12 @@
       if (colormap != NULL) {
         if (dinfo->cinfo->out_color_space == JCS_RGB) {
           /* Normal case: RGB color map */
-          putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file);
-          putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file);
-          putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file);
+          putc(colormap[0][i] >> cshift, dinfo->pub.output_file);
+          putc(colormap[1][i] >> cshift, dinfo->pub.output_file);
+          putc(colormap[2][i] >> cshift, dinfo->pub.output_file);
         } else {
           /* Grayscale "color map": possible if quantizing grayscale image */
-          put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift);
+          put_3bytes(dinfo, colormap[0][i] >> cshift);
         }
       } else {
         /* Create a grayscale map of num_colors values, range 0..255 */
@@ -278,7 +308,7 @@
       }
     } else {
       /* fill out the map to a power of 2 */
-      put_3bytes(dinfo, 0);
+      put_3bytes(dinfo, CENTERJSAMPLE >> cshift);
     }
   }
   /* Write image separator and Image Descriptor */
@@ -292,7 +322,7 @@
   /* Write Initial Code Size byte */
   putc(InitCodeSize, dinfo->pub.output_file);
 
-  /* Initialize for "compression" of image data */
+  /* Initialize for compression of image data */
   compress_init(dinfo, InitCodeSize + 1);
 }
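
The cshift logic above scales wider-precision samples down to GIF's 8-bit color map entries, and the CENTERJSAMPLE change pads unused palette slots with mid-gray rather than black. A hedged arithmetic check, assuming CENTERJSAMPLE is the mid-range sample value per libjpeg convention:

    #include <stdio.h>

    int main(void)
    {
      int data_precision = 12;                        /* e.g. a 12-bit build */
      int cshift = data_precision - 8;                /* 4 */
      int CENTERJSAMPLE = 1 << (data_precision - 1);  /* assumed: 2048 */

      printf("max sample 4095 -> %d\n", 4095 >> cshift);          /* 255 */
      printf("unused slot     -> %d\n", CENTERJSAMPLE >> cshift); /* 128 */
      return 0;
    }
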
 
@@ -318,17 +348,139 @@
  * In this module rows_supplied will always be 1.
  */
 
+
+/*
+ * The LZW algorithm proper
+ */
+
 METHODDEF(void)
-put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-               JDIMENSION rows_supplied)
+put_LZW_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                   JDIMENSION rows_supplied)
 {
   gif_dest_ptr dest = (gif_dest_ptr)dinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
+  code_int c;
+  register hash_int i;
+  register hash_int disp;
+  register hash_entry probe_value;
 
   ptr = dest->pub.buffer[0];
   for (col = cinfo->output_width; col > 0; col--) {
-    compress_pixel(dest, GETJSAMPLE(*ptr++));
+    /* Accept and compress one 8-bit byte */
+    c = (code_int)(*ptr++);
+
+    if (dest->first_byte) {     /* need to initialize waiting_code */
+      dest->waiting_code = c;
+      dest->first_byte = FALSE;
+      continue;
+    }
+
+    /* Probe hash table to see if a symbol exists for
+     * waiting_code followed by c.
+     * If so, replace waiting_code by that symbol and continue.
+     */
+    i = ((hash_int)c << (MAX_LZW_BITS - 8)) + dest->waiting_code;
+    /* i is less than twice 2**MAX_LZW_BITS, therefore less than twice HSIZE */
+    if (i >= HSIZE)
+      i -= HSIZE;
+
+    probe_value = HASH_ENTRY(dest->waiting_code, c);
+
+    if (dest->hash_code[i] == 0) {
+      /* hit empty slot; desired symbol not in table */
+      output(dest, dest->waiting_code);
+      if (dest->free_code < LZW_TABLE_SIZE) {
+        dest->hash_code[i] = dest->free_code++; /* add symbol to hash table */
+        dest->hash_value[i] = probe_value;
+      } else
+        clear_block(dest);
+      dest->waiting_code = c;
+      continue;
+    }
+    if (dest->hash_value[i] == probe_value) {
+      dest->waiting_code = dest->hash_code[i];
+      continue;
+    }
+
+    if (i == 0)                 /* secondary hash (after G. Knott) */
+      disp = 1;
+    else
+      disp = HSIZE - i;
+    for (;;) {
+      i -= disp;
+      if (i < 0)
+        i += HSIZE;
+      if (dest->hash_code[i] == 0) {
+        /* hit empty slot; desired symbol not in table */
+        output(dest, dest->waiting_code);
+        if (dest->free_code < LZW_TABLE_SIZE) {
+          dest->hash_code[i] = dest->free_code++; /* add symbol to hash table */
+          dest->hash_value[i] = probe_value;
+        } else
+          clear_block(dest);
+        dest->waiting_code = c;
+        break;
+      }
+      if (dest->hash_value[i] == probe_value) {
+        dest->waiting_code = dest->hash_code[i];
+        break;
+      }
+    }
+  }
+}
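
The probe above is classic open addressing with a secondary displacement (Knott's double hashing). The stand-alone sketch below re-creates it under stated assumptions: HSIZE, hash_int, hash_entry, and HASH_ENTRY are defined earlier in wrgif.c and are stubbed here with a prime table size and a simple key packing, and the LZW_TABLE_SIZE/clear_block handling is omitted for brevity:

    #include <stdio.h>

    #define MAX_LZW_BITS  12
    #define HSIZE         5003L     /* assumed: a prime > 2^MAX_LZW_BITS */

    typedef long hash_int;          /* must be signed for the wrap test */
    typedef long hash_entry;
    #define HASH_ENTRY(prefix, suffix) \
      (((hash_entry)(prefix) << 8) | (suffix))

    static long hash_code[HSIZE];   /* 0 marks an empty slot */
    static hash_entry hash_value[HSIZE];
    static long free_code = 258;    /* first code after Clear and EOF */

    /* Return the existing symbol for (prefix, suffix), or add one. */
    static long
    lookup_or_add(long prefix, int suffix)
    {
      hash_entry v = HASH_ENTRY(prefix, suffix);
      hash_int i = ((hash_int)suffix << (MAX_LZW_BITS - 8)) + prefix;
      hash_int disp;

      if (i >= HSIZE)                   /* i < 2 * HSIZE, so one subtraction */
        i -= HSIZE;
      disp = (i == 0) ? 1 : HSIZE - i;  /* secondary hash (after Knott) */
      while (hash_code[i] != 0) {
        if (hash_value[i] == v)
          return hash_code[i];          /* symbol already in the table */
        i -= disp;                      /* step by the fixed displacement... */
        if (i < 0)
          i += HSIZE;                   /* ...wrapping around the table */
      }
      hash_code[i] = free_code;         /* claim the empty slot */
      hash_value[i] = v;
      return free_code++;
    }

    int main(void)
    {
      long a = lookup_or_add(65, 66);   /* new entry -> 258 */
      long b = lookup_or_add(65, 66);   /* found -> 258 again */
      printf("%ld %ld\n", a, b);
      return 0;
    }
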
+
+
+/*
+ * The pseudo-compression algorithm.
+ *
+ * In this version we simply output each pixel value as a separate symbol;
+ * thus, no compression occurs.  In fact, there is expansion of one bit per
+ * pixel, because we use a symbol width one bit wider than the pixel width.
+ *
+ * GIF ordinarily uses variable-width symbols, and the decoder will expect
+ * to ratchet up the symbol width after a fixed number of symbols.
+ * To simplify the logic and keep the expansion penalty down, we emit a
+ * GIF Clear code to reset the decoder just before the width would ratchet up.
+ * Thus, all the symbols in the output file will have the same bit width.
+ * Note that emitting the Clear codes at the right times is a mere matter of
+ * counting output symbols and is in no way dependent on the LZW algorithm.
+ *
+ * With a small basic pixel width (low color count), Clear codes will be
+ * needed very frequently, causing the file to expand even more.  So this
+ * simplistic approach wouldn't work too well on bilevel images, for example.
+ * But for output of JPEG conversions the pixel width will usually be 8 bits
+ * (129 to 256 colors), so the overhead added by Clear symbols is only about
+ * one symbol in every 256.
+ */
+
+METHODDEF(void)
+put_raw_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                   JDIMENSION rows_supplied)
+{
+  gif_dest_ptr dest = (gif_dest_ptr)dinfo;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  code_int c;
+
+  ptr = dest->pub.buffer[0];
+  for (col = cinfo->output_width; col > 0; col--) {
+    c = (code_int)(*ptr++);
+    /* Accept and output one pixel value.
+     * The given value must fit in fewer than n_bits bits.
+     */
+
+    /* Output the given pixel value as a symbol. */
+    output(dest, c);
+    /* Issue Clear codes often enough to keep the reader from ratcheting up
+     * its symbol size.
+     */
+    if (dest->code_counter < dest->maxcode) {
+      dest->code_counter++;
+    } else {
+      output(dest, dest->ClearCode);
+      dest->code_counter = dest->ClearCode + 2; /* reset the counter */
+    }
   }
 }
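
The comment block's "about one symbol in every 256" claim can be checked directly: with 8-bit pixels, n_bits is 9, code_counter starts at 258, and maxcode is 511, so a Clear code goes out once per 254 pixels. A minimal sketch of that counting, again assuming the usual MAXCODE form:

    #include <stdio.h>

    int main(void)
    {
      int n_bits = 9;                     /* 8-bit pixels -> 9-bit symbols */
      int maxcode = (1 << n_bits) - 1;    /* 511, assuming the usual MAXCODE */
      int ClearCode = 1 << (n_bits - 1);  /* 256 */
      int counter = ClearCode + 2;        /* 258, as in compress_init */
      int pixels = 0;

      for (;;) {
        pixels++;                         /* one symbol per pixel */
        if (counter < maxcode) {
          counter++;
        } else {
          counter = ClearCode + 2;        /* Clear code emitted, reset */
          break;
        }
      }
      printf("one Clear code per %d pixels\n", pixels);  /* 254 */
      return 0;
    }
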
 
@@ -342,7 +494,7 @@
 {
   gif_dest_ptr dest = (gif_dest_ptr)dinfo;
 
-  /* Flush "compression" mechanism */
+  /* Flush compression mechanism */
   compress_term(dest);
   /* Write a zero-length data block to end the series */
   putc(0, dest->pub.output_file);
@@ -370,7 +522,7 @@
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_gif(j_decompress_ptr cinfo)
+jinit_write_gif(j_decompress_ptr cinfo, boolean is_lzw)
 {
   gif_dest_ptr dest;
 
@@ -380,7 +532,6 @@
                                 sizeof(gif_dest_struct));
   dest->cinfo = cinfo;          /* make back link for subroutines */
   dest->pub.start_output = start_output_gif;
-  dest->pub.put_pixel_rows = put_pixel_rows;
   dest->pub.finish_output = finish_output_gif;
   dest->pub.calc_buffer_dimensions = calc_buffer_dimensions_gif;
 
@@ -407,6 +558,22 @@
     ((j_common_ptr)cinfo, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION)1);
   dest->pub.buffer_height = 1;
 
+  if (is_lzw) {
+    dest->pub.put_pixel_rows = put_LZW_pixel_rows;
+    /* Allocate space for hash table */
+    dest->hash_code = (code_int *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  HSIZE * sizeof(code_int));
+    dest->hash_value = (hash_entry *)
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  HSIZE * sizeof(hash_entry));
+  } else {
+    dest->pub.put_pixel_rows = put_raw_pixel_rows;
+    /* Mark tables unused */
+    dest->hash_code = NULL;
+    dest->hash_value = NULL;
+  }
+
   return (djpeg_dest_ptr)dest;
 }
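
Callers now choose the row-output method through the new is_lzw argument. A hypothetical caller (the corresponding djpeg.c change is not part of this excerpt):

    #include "cdjpeg.h"  /* declares jinit_write_gif() for the djpeg tool */

    /* Hypothetical wiring in djpeg's destination dispatch: TRUE selects
     * put_LZW_pixel_rows, FALSE the Clear-code pseudo-compressor. */
    static djpeg_dest_ptr
    make_gif_dest(j_decompress_ptr cinfo, boolean use_lzw)
    {
      return jinit_write_gif(cinfo, use_lzw);
    }
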
 
diff --git a/wrppm.c b/wrppm.c
index 8cabaf0..3081ec3 100644
--- a/wrppm.c
+++ b/wrppm.c
@@ -108,17 +108,17 @@
   ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
   register char *bufferptr;
   register JSAMPROW ptr;
-#if BITS_IN_JSAMPLE != 8 || (!defined(HAVE_UNSIGNED_CHAR) && !defined(__CHAR_UNSIGNED__))
+#if BITS_IN_JSAMPLE != 8
   register JDIMENSION col;
 #endif
 
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
-#if BITS_IN_JSAMPLE == 8 && (defined(HAVE_UNSIGNED_CHAR) || defined(__CHAR_UNSIGNED__))
+#if BITS_IN_JSAMPLE == 8
   MEMCOPY(bufferptr, ptr, dest->samples_per_row);
 #else
   for (col = dest->samples_per_row; col > 0; col--) {
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(*ptr++));
+    PUTPPMSAMPLE(bufferptr, *ptr++);
   }
 #endif
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
@@ -200,10 +200,10 @@
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    pixval = GETJSAMPLE(*ptr++);
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map0[pixval]));
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map1[pixval]));
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map2[pixval]));
+    pixval = *ptr++;
+    PUTPPMSAMPLE(bufferptr, color_map0[pixval]);
+    PUTPPMSAMPLE(bufferptr, color_map1[pixval]);
+    PUTPPMSAMPLE(bufferptr, color_map2[pixval]);
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
@@ -222,7 +222,7 @@
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map[GETJSAMPLE(*ptr++)]));
+    PUTPPMSAMPLE(bufferptr, color_map[*ptr++]);
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
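
Dropping GETJSAMPLE here rests on JSAMPLE being a plain unsigned char at 8-bit precision, which is also what lets the row loop collapse into MEMCOPY. A minimal restatement of the 8-bit fast path under that assumption:

    #include <string.h>

    typedef unsigned char JSAMPLE;  /* assumed: the premise behind the change */

    /* With 8-bit unsigned-char samples a PPM row is already raw bytes,
     * so the per-sample PUTPPMSAMPLE loop collapses to one block copy: */
    static void
    copy_ppm_row(char *out, const JSAMPLE *in, size_t samples_per_row)
    {
      memcpy(out, in, samples_per_row);
    }
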
diff --git a/wrrle.c b/wrrle.c
deleted file mode 100644
index 5c98ec0..0000000
--- a/wrrle.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * wrrle.c
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains routines to write output images in RLE format.
- * The Utah Raster Toolkit library is required (version 3.1 or later).
- *
- * These routines may need modification for non-Unix environments or
- * specialized applications.  As they stand, they assume output to
- * an ordinary stdio stream.
- *
- * Based on code contributed by Mike Lijewski,
- * with updates from Robert Hutchinson.
- */
-
-#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
-
-#ifdef RLE_SUPPORTED
-
-/* rle.h is provided by the Utah Raster Toolkit. */
-
-#include <rle.h>
-
-/*
- * We assume that JSAMPLE has the same representation as rle_pixel,
- * to wit, "unsigned char".  Hence we can't cope with 12- or 16-bit samples.
- */
-
-#if BITS_IN_JSAMPLE != 8
-  Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
-#endif
-
-
-/*
- * Since RLE stores scanlines bottom-to-top, we have to invert the image
- * from JPEG's top-to-bottom order.  To do this, we save the outgoing data
- * in a virtual array during put_pixel_row calls, then actually emit the
- * RLE file during finish_output.
- */
-
-
-/*
- * For now, if we emit an RLE color map then it is always 256 entries long,
- * though not all of the entries need be used.
- */
-
-#define CMAPBITS        8
-#define CMAPLENGTH      (1 << (CMAPBITS))
-
-typedef struct {
-  struct djpeg_dest_struct pub; /* public fields */
-
-  jvirt_sarray_ptr image;       /* virtual array to store the output image */
-  rle_map *colormap;            /* RLE-style color map, or NULL if none */
-  rle_pixel **rle_row;          /* To pass rows to rle_putrow() */
-
-} rle_dest_struct;
-
-typedef rle_dest_struct *rle_dest_ptr;
-
-/* Forward declarations */
-METHODDEF(void) rle_put_pixel_rows(j_decompress_ptr cinfo,
-                                   djpeg_dest_ptr dinfo,
-                                   JDIMENSION rows_supplied);
-
-
-/*
- * Write the file header.
- *
- * In this module it's easier to wait till finish_output to write anything.
- */
-
-METHODDEF(void)
-start_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-  size_t cmapsize;
-  int i, ci;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /*
-   * Make sure the image can be stored in RLE format.
-   *
-   * - RLE stores image dimensions as *signed* 16 bit integers.  JPEG
-   *   uses unsigned, so we have to check the width.
-   *
-   * - Colorspace is expected to be grayscale or RGB.
-   *
-   * - The number of channels (components) is expected to be 1 (grayscale/
-   *   pseudocolor) or 3 (truecolor/directcolor).
-   *   (could be 2 or 4 if using an alpha channel, but we aren't)
-   */
-
-  if (cinfo->output_width > 32767 || cinfo->output_height > 32767)
-    ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width,
-             cinfo->output_height);
-
-  if (cinfo->out_color_space != JCS_GRAYSCALE &&
-      cinfo->out_color_space != JCS_RGB)
-    ERREXIT(cinfo, JERR_RLE_COLORSPACE);
-
-  if (cinfo->output_components != 1 && cinfo->output_components != 3)
-    ERREXIT1(cinfo, JERR_RLE_TOOMANYCHANNELS, cinfo->num_components);
-
-  /* Convert colormap, if any, to RLE format. */
-
-  dest->colormap = NULL;
-
-  if (cinfo->quantize_colors) {
-    /* Allocate storage for RLE-style cmap, zero any extra entries */
-    cmapsize = cinfo->out_color_components * CMAPLENGTH * sizeof(rle_map);
-    dest->colormap = (rle_map *)(*cinfo->mem->alloc_small)
-      ((j_common_ptr)cinfo, JPOOL_IMAGE, cmapsize);
-    MEMZERO(dest->colormap, cmapsize);
-
-    /* Save away data in RLE format --- note 8-bit left shift! */
-    /* Shifting would need adjustment for JSAMPLEs wider than 8 bits. */
-    for (ci = 0; ci < cinfo->out_color_components; ci++) {
-      for (i = 0; i < cinfo->actual_number_of_colors; i++) {
-        dest->colormap[ci * CMAPLENGTH + i] =
-          GETJSAMPLE(cinfo->colormap[ci][i]) << 8;
-      }
-    }
-  }
-
-  /* Set the output buffer to the first row */
-  dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, dest->image, (JDIMENSION)0, (JDIMENSION)1, TRUE);
-  dest->pub.buffer_height = 1;
-
-  dest->pub.put_pixel_rows = rle_put_pixel_rows;
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->total_extra_passes++;  /* count file writing as separate pass */
-  }
-#endif
-}
-
-
-/*
- * Write some pixel data.
- *
- * This routine just saves the data away in a virtual array.
- */
-
-METHODDEF(void)
-rle_put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                   JDIMENSION rows_supplied)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-
-  if (cinfo->output_scanline < cinfo->output_height) {
-    dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr)cinfo, dest->image,
-       cinfo->output_scanline, (JDIMENSION)1, TRUE);
-  }
-}
-
-/*
- * Finish up at the end of the file.
- *
- * Here is where we really output the RLE file.
- */
-
-METHODDEF(void)
-finish_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-  rle_hdr header;               /* Output file information */
-  rle_pixel **rle_row, *red, *green, *blue;
-  JSAMPROW output_row;
-  char cmapcomment[80];
-  int row, col;
-  int ci;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /* Initialize the header info */
-  header = *rle_hdr_init(NULL);
-  header.rle_file = dest->pub.output_file;
-  header.xmin     = 0;
-  header.xmax     = cinfo->output_width  - 1;
-  header.ymin     = 0;
-  header.ymax     = cinfo->output_height - 1;
-  header.alpha    = 0;
-  header.ncolors  = cinfo->output_components;
-  for (ci = 0; ci < cinfo->output_components; ci++) {
-    RLE_SET_BIT(header, ci);
-  }
-  if (cinfo->quantize_colors) {
-    header.ncmap   = cinfo->out_color_components;
-    header.cmaplen = CMAPBITS;
-    header.cmap    = dest->colormap;
-    /* Add a comment to the output image with the true colormap length. */
-    sprintf(cmapcomment, "color_map_length=%d",
-            cinfo->actual_number_of_colors);
-    rle_putcom(cmapcomment, &header);
-  }
-
-  /* Emit the RLE header and color map (if any) */
-  rle_put_setup(&header);
-
-  /* Now output the RLE data from our virtual array.
-   * We assume here that rle_pixel is represented the same as JSAMPLE.
-   */
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->pub.pass_limit = cinfo->output_height;
-    progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-  }
-#endif
-
-  if (cinfo->output_components == 1) {
-    for (row = cinfo->output_height - 1; row >= 0; row--) {
-      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, dest->image,
-         (JDIMENSION)row, (JDIMENSION)1, FALSE);
-      rle_putrow(rle_row, (int)cinfo->output_width, &header);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  } else {
-    for (row = cinfo->output_height - 1; row >= 0; row--) {
-      rle_row = (rle_pixel **)dest->rle_row;
-      output_row = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, dest->image,
-         (JDIMENSION)row, (JDIMENSION)1, FALSE);
-      red = rle_row[0];
-      green = rle_row[1];
-      blue = rle_row[2];
-      for (col = cinfo->output_width; col > 0; col--) {
-        *red++ = GETJSAMPLE(*output_row++);
-        *green++ = GETJSAMPLE(*output_row++);
-        *blue++ = GETJSAMPLE(*output_row++);
-      }
-      rle_putrow(rle_row, (int)cinfo->output_width, &header);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  }
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL)
-    progress->completed_extra_passes++;
-#endif
-
-  /* Emit file trailer */
-  rle_puteof(&header);
-  fflush(dest->pub.output_file);
-  if (ferror(dest->pub.output_file))
-    ERREXIT(cinfo, JERR_FILE_WRITE);
-}
-
-
-/*
- * The module selection routine for RLE format output.
- */
-
-GLOBAL(djpeg_dest_ptr)
-jinit_write_rle(j_decompress_ptr cinfo)
-{
-  rle_dest_ptr dest;
-
-  /* Create module interface object, fill in method pointers */
-  dest = (rle_dest_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(rle_dest_struct));
-  dest->pub.start_output = start_output_rle;
-  dest->pub.finish_output = finish_output_rle;
-  dest->pub.calc_buffer_dimensions = NULL;
-
-  /* Calculate output image dimensions so we can allocate space */
-  jpeg_calc_output_dimensions(cinfo);
-
-  /* Allocate a work array for output to the RLE library. */
-  dest->rle_row = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE,
-     cinfo->output_width, (JDIMENSION)cinfo->output_components);
-
-  /* Allocate a virtual array to hold the image. */
-  dest->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION)(cinfo->output_width * cinfo->output_components),
-     cinfo->output_height, (JDIMENSION)1);
-
-  return (djpeg_dest_ptr)dest;
-}
-
-#endif /* RLE_SUPPORTED */
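
For reference, the deleted writer's inversion strategy, which buffered rows top-to-bottom in a virtual array and emitted them bottom-up in finish_output, reduces to the following generic sketch (a flat malloc stands in for libjpeg's virtual-array manager):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
      const int width = 4, height = 3;
      unsigned char *image = malloc((size_t)width * height);
      int row;

      if (image == NULL)
        return 1;
      for (row = 0; row < height; row++)       /* rows arrive top-down */
        memset(image + (size_t)row * width, row, (size_t)width);
      for (row = height - 1; row >= 0; row--)  /* ...and go out bottom-up */
        printf("emit row %d (value %d)\n", row, image[(size_t)row * width]);
      free(image);
      return 0;
    }
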
diff --git a/wrtarga.c b/wrtarga.c
index 9dfa920..7a654ff 100644
--- a/wrtarga.c
+++ b/wrtarga.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -102,9 +102,9 @@
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    outptr[0] = (char)GETJSAMPLE(inptr[2]); /* RGB to BGR order */
-    outptr[1] = (char)GETJSAMPLE(inptr[1]);
-    outptr[2] = (char)GETJSAMPLE(inptr[0]);
+    outptr[0] = inptr[2]; /* RGB to BGR order */
+    outptr[1] = inptr[1];
+    outptr[2] = inptr[0];
     inptr += 3, outptr += 3;
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
@@ -118,13 +118,10 @@
   tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   register JSAMPROW inptr;
   register char *outptr;
-  register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
-  for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char)GETJSAMPLE(*inptr++);
-  }
+  MEMCOPY(outptr, inptr, cinfo->output_width);
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
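
The Targa changes follow the same 8-bit-JSAMPLE premise as wrppm.c: the grayscale row becomes a straight MEMCOPY, and the RGB path is just a byte swizzle. A generic restatement, assuming unsigned 8-bit samples:

    #include <stddef.h>

    /* RGB -> BGR swizzle, restated under the same 8-bit sample
     * assumption that lets the grayscale path use MEMCOPY: */
    static void
    rgb_to_bgr(char *out, const unsigned char *in, size_t pixels)
    {
      size_t i;
      for (i = 0; i < pixels; i++) {
        out[0] = (char)in[2];  /* Targa stores blue first */
        out[1] = (char)in[1];
        out[2] = (char)in[0];
        in += 3, out += 3;
      }
    }
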
 
@@ -147,7 +144,7 @@
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char)GETJSAMPLE(color_map0[GETJSAMPLE(*inptr++)]);
+    *outptr++ = color_map0[*inptr++];
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
@@ -182,9 +179,9 @@
       /* Write the colormap.  Note Targa uses BGR byte order */
       outfile = dest->pub.output_file;
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile);
-        putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile);
-        putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile);
+        putc(cinfo->colormap[2][i], outfile);
+        putc(cinfo->colormap[1][i], outfile);
+        putc(cinfo->colormap[0][i], outfile);
       }
       dest->pub.put_pixel_rows = put_gray_rows;
     } else {