Merge branch 'master' into dev
diff --git a/.travis.yml b/.travis.yml
index f909f96..548dbaf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,9 +33,11 @@
         CMAKE_FLAGS="-DWITH_12BIT=1"
         CTEST_OUTPUT_ON_FAILURE=1
     - os: linux
+      dist: bionic
       compiler: gcc
       env:
-        CMAKE_FLAGS="-DWITH_JPEG7=1"
+        BUILD_X32=1
+        CMAKE_FLAGS="-DWITH_JPEG7=1 -DCMAKE_C_FLAGS=-mx32"
         CTEST_OUTPUT_ON_FAILURE=1
       addons:
         apt:
@@ -59,18 +61,17 @@
         CTEST_OUTPUT_ON_FAILURE=1
 
 before_install:
+  - if [ "$BUILD_X32" = "1" ]; then
+      sudo apt -y --install-recommends install libc6-dev-x32;
+    fi
   - if [ "$TRAVIS_OS_NAME" = "osx" ]; then
       pushd $HOME/Downloads &&
-      curl -LO https://updates.cdn-apple.com/2019/cert/041-88384-20191011-3d8da658-dca4-4a5b-b67c-26e686876403/JavaForOSX.dmg &&
-      hdid JavaForOSX.dmg &&
-      sudo installer -pkg /Volumes/Java\ for\ macOS\ 2017-001/JavaForOSX.pkg -target / &&
-      hdiutil detach /Volumes/Java\ for\ macOS\ 2017-001 &&
       curl -LO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci &&
       . ./macports-ci install &&
-      sudo /opt/local/bin/port -N install gcc5 yasm md5sha1sum &&
+      sudo /opt/local/bin/port -N install yasm md5sha1sum &&
       popd &&
       git clone --depth=1 https://github.com/libjpeg-turbo/gas-preprocessor.git ~/src/gas-preprocessor &&
-      ln -fs /Applications/Xcode.app /Applications/Xcode72.app;
+      ln -fs /Applications/Xcode.app /Applications/Xcode83.app;
     fi
   - if [ "${BUILD_OFFICIAL:-}" != "" ]; then
       if [ "$TRAVIS_OS_NAME" = "linux" ]; then
@@ -93,7 +94,7 @@
       if [ "$TRAVIS_OS_NAME" = "linux" ]; then
         mkdir $HOME/rpmkeys &&
         wget --no-check-certificate "http://www.libjpeg-turbo.org/key/LJTPR-GPG-KEY" -O $HOME/rpmkeys/LJTPR-GPG-KEY &&
-        docker run -v $HOME/src/ljt.nightly:/root/src/ljt.nightly -v $HOME/src/buildscripts:/root/src/buildscripts -v $TRAVIS_BUILD_DIR:/root/src/libjpeg-turbo -v $HOME/.gnupg:/root/.gnupg -v $HOME/rpmkeys:/rpmkeys -t dcommander/buildljt:latest bash -c "rpm --import /rpmkeys/LJTPR-GPG-KEY && ~/src/buildscripts/buildljt -d /root/src/libjpeg-turbo -v" &&
+        docker run -v $HOME/src/ljt.nightly:/root/src/ljt.nightly -v $HOME/src/buildscripts:/root/src/buildscripts -v $TRAVIS_BUILD_DIR:/root/src/libjpeg-turbo -v $HOME/.gnupg:/root/.gnupg -v $HOME/rpmkeys:/rpmkeys -t dcommander/buildljt:$TRAVIS_BRANCH bash -c "rpm --import /rpmkeys/LJTPR-GPG-KEY && ~/src/buildscripts/buildljt -d /root/src/libjpeg-turbo -v" &&
         sudo chown -R travis:travis ~/src/ljt.nightly &&
         mv ~/src/ljt.nightly/latest/log-$TRAVIS_OS_NAME.txt ~/src/ljt.nightly/latest/files/;
       else
diff --git a/BUILDING.md b/BUILDING.md
index 6828809..ec579e4 100644
--- a/BUILDING.md
+++ b/BUILDING.md
@@ -12,10 +12,7 @@
 
 - [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
   (if building x86 or x86-64 SIMD extensions)
-  * If using NASM, 2.10 or later is required.
-  * If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
-    build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
-    when building macho64 objects.)
+  * If using NASM, 2.13 or later is required.
   * If using YASM, 1.2.0 or later is required.
   * If building on macOS, NASM or YASM can be obtained from
     [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
@@ -49,10 +46,8 @@
 
 - If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
   required.  Most modern Linux distributions, as well as Solaris 10 and later,
-  include JDK or OpenJDK.  On OS X 10.5 and 10.6, it will be necessary to
-  install the Java Developer Package, which can be downloaded from
-  <http://developer.apple.com/downloads> (Apple ID required.)  For other
-  systems, you can obtain the Oracle Java Development Kit from
+  include JDK or OpenJDK.  For other systems, you can obtain the Oracle Java
+  Development Kit from
   <http://www.oracle.com/technetwork/java/javase/downloads>.
 
   * If using JDK 11 or later, CMake 3.10.x or later must also be used.
@@ -62,22 +57,22 @@
 - Microsoft Visual C++ 2005 or later
 
   If you don't already have Visual C++, then the easiest way to get it is by
-  installing the
-  [Windows SDK](http://msdn.microsoft.com/en-us/windows/bb980924.aspx).
-  The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
-  everything necessary to build libjpeg-turbo.
+  installing
+  [Visual Studio Community Edition](https://visualstudio.microsoft.com),
+  which includes everything necessary to build libjpeg-turbo.
 
-  * You can also use Microsoft Visual Studio Express/Community Edition, which
-    is a free download.  (NOTE: versions prior to 2012 can only be used to
-    build 32-bit code.)
+  * You can also download and install the standalone Windows SDK (for Windows 7
+    or later), which includes command-line versions of the 32-bit and 64-bit
+    Visual C++ compilers.
   * If you intend to build libjpeg-turbo from the command line, then add the
     appropriate compiler and SDK directories to the `INCLUDE`, `LIB`, and
     `PATH` environment variables.  This is generally accomplished by
-    executing `vcvars32.bat` or `vcvars64.bat` and `SetEnv.cmd`.
-    `vcvars32.bat` and `vcvars64.bat` are part of Visual C++ and are located in
-    the same directory as the compiler.  `SetEnv.cmd` is part of the Windows
-    SDK.  You can pass optional arguments to `SetEnv.cmd` to specify a 32-bit
-    or 64-bit build environment.
+    executing `vcvars32.bat` or `vcvars64.bat`, which are located in the same
+    directory as the compiler.
+  * If built with Visual C++ 2015 or later, the libjpeg-turbo static libraries
+    cannot be used with earlier versions of Visual C++, and vice versa.
+  * The libjpeg API DLL (**jpeg{version}.dll**) will depend on the C run-time
+    DLLs corresponding to the version of Visual C++ that was used to build it.
 
    ... OR ...
 
@@ -333,7 +328,7 @@
 -------------
 
 
-### 32-bit Build on 64-bit Linux/Unix/Mac
+### 32-bit Build on 64-bit Linux/Unix
 
 Use export/setenv to set the following environment variables before running
 CMake:
@@ -412,93 +407,6 @@
   it should be installed in your `PATH`.
 
 
-### Armv7 (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 3GS-4S/iPad 1st-3rd Generation and newer:
-
-#### Xcode 4.2 and earlier (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -miphoneos-version-min=3.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 4.3-4.6 (LLVM-GCC)
-
-Same as above, but replace the first line with:
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-
-#### Xcode 5 and later (Clang)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -arch armv7 -miphoneos-version-min=3.0"
-    export ASMFLAGS="-no-integrated-as"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-
-### Armv7s (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 5/iPad 4th Generation and newer:
-
-#### Xcode 4.5-4.6 (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-Wall -mfloat-abi=softfp -march=armv7s -mcpu=swift -mtune=swift -mfpu=neon -miphoneos-version-min=6.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 5 and later (Clang)
-
-Same as the Armv7 build procedure for Xcode 5 and later, except replace the
-compiler flags as follows:
-
-    export CFLAGS="-Wall -mfloat-abi=softfp -arch armv7s -miphoneos-version-min=6.0"
-
-
 ### Armv8 (64-bit)
 
 **gas-preprocessor.pl required if using Xcode < 6**
@@ -523,9 +431,6 @@
       [additional CMake flags] {source_directory}
     make
 
-Once built, lipo can be used to combine the Armv7, v7s, and/or v8 variants into
-a universal library.
-
 
 Building libjpeg-turbo for Android
 ----------------------------------
@@ -735,44 +640,22 @@
     make dmg
 
 Create Mac package/disk image.  This requires pkgbuild and productbuild, which
-are installed by default on OS X 10.7 and later and which can be obtained by
-installing Xcode 3.2.6 (with the "Unix Development" option) on OS X 10.6.
-Packages built in this manner can be installed on OS X 10.5 and later, but they
-must be built on OS X 10.6 or later.
+are installed by default on OS X 10.7 and later.
 
-    make udmg
+In order to create a Mac package/disk image that contains universal
+x86-64/Arm binaries, set the following CMake variable:
 
-This creates a Mac package/disk image that contains universal x86-64/i386/Arm
-binaries.  The following CMake variables control which architectures are
-included in the universal binaries.  Setting any of these variables to an empty
-string excludes that architecture from the package.
-
-* `OSX_32BIT_BUILD`: Directory containing an i386 (32-bit) Mac build of
-  libjpeg-turbo (default: *{source_directory}*/osxx86)
-* `IOS_ARMV7_BUILD`: Directory containing an Armv7 (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7)
-* `IOS_ARMV7S_BUILD`: Directory containing an Armv7s (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7s)
 * `IOS_ARMV8_BUILD`: Directory containing an Armv8 (64-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv8)
+  libjpeg-turbo to include in the universal binaries
 
-You should first use CMake to configure i386, Armv7, Armv7s, and/or Armv8
-sub-builds of libjpeg-turbo (see "Build Recipes" and "Building libjpeg-turbo
-for iOS" above) in build directories that match those specified in the
-aforementioned CMake variables.  Next, configure the primary build of
-libjpeg-turbo as an out-of-tree build, and build it.  Once the primary build
-has been built, run `make udmg` from the build directory.  The packaging system
-will build the sub-builds, use lipo to combine them into a single set of
-universal binaries, then package the universal binaries in the same manner as
-`make dmg`.
-
-
-Cygwin
-------
-
-    make cygwinpkg
-
-Build a Cygwin binary package.
+You should first use CMake to configure an Armv8 sub-build of libjpeg-turbo
+(see "Building libjpeg-turbo for iOS" above) in a build directory that matches
+the one specified in the aforementioned CMake variable.  Next, configure the
+primary (x86-64) build of libjpeg-turbo as an out-of-tree build, specifying the
+aforementioned CMake variable, and build it.  Once the primary build has been
+built, run `make dmg` from the build directory.  The packaging system will
+build the sub-build, use lipo to combine it with the primary build into a
+single set of universal binaries, then package the universal binaries.
 
 
 Windows
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94efdcc..75da6da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 endif()
 
 project(libjpeg-turbo C)
-set(VERSION 2.0.6)
+set(VERSION 2.0.80)
 string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
 list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
 list(GET VERSION_TRIPLET 1 VERSION_MINOR)
@@ -46,7 +46,7 @@
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "i[0-9]86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ia32")
-  if(BITS EQUAL 64)
+  if(BITS EQUAL 64 OR CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
     set(CPU_TYPE x86_64)
   else()
     set(CPU_TYPE i386)
@@ -84,7 +84,9 @@
     set(CMAKE_INSTALL_DEFAULT_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}64")
   endif()
 else()
-  set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  if(NOT CMAKE_INSTALL_DEFAULT_PREFIX)
+    set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  endif()
 endif()
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}" CACHE PATH
@@ -103,6 +105,8 @@
   if(UNIX AND NOT APPLE)
     if(BITS EQUAL 64)
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+    elseif(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+      set(CMAKE_INSTALL_DEFAULT_LIBDIR "libx32")
     else()
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32")
     endif()
@@ -153,8 +157,12 @@
 boolean_number(WITH_ARITH_DEC)
 option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
 boolean_number(WITH_ARITH_ENC)
-option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
-boolean_number(WITH_JAVA)
+if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+  set(WITH_JAVA 0)
+else()
+  option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
+  boolean_number(WITH_JAVA)
+endif()
 option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
 boolean_number(WITH_JPEG7)
 option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
@@ -416,13 +424,6 @@
         exit(is_shifting_signed(-0x7F7E80B1L));
       }" RIGHT_SHIFT_IS_UNSIGNED)
   endif()
-
-  if(CMAKE_CROSSCOMPILING)
-    set(__CHAR_UNSIGNED__ 0)
-  else()
-    check_c_source_runs("int main(void) { return ((char) -1 < 0); }"
-      __CHAR_UNSIGNED__)
-  endif()
 endif()
 
 if(MSVC)
@@ -1373,10 +1374,13 @@
 
 if(WITH_TURBOJPEG)
   if(ENABLE_SHARED)
-    install(TARGETS turbojpeg tjbench
+    install(TARGETS turbojpeg EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
       RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    install(TARGETS tjbench
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
     if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
       CMAKE_C_LINKER_SUPPORTS_PDB)
       install(FILES "$<TARGET_PDB_FILE:turbojpeg>"
@@ -1384,8 +1388,9 @@
     endif()
   endif()
   if(ENABLE_STATIC)
-    install(TARGETS turbojpeg-static ARCHIVE
-      DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    install(TARGETS turbojpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
     if(NOT ENABLE_SHARED)
       if(MSVC_IDE OR XCODE)
         set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
@@ -1401,7 +1406,9 @@
 endif()
 
 if(ENABLE_STATIC)
-  install(TARGETS jpeg-static ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(TARGETS jpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
   if(NOT ENABLE_SHARED)
     if(MSVC_IDE OR XCODE)
       set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
@@ -1441,6 +1448,13 @@
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
   ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}Config.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}ConfigVersion.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
+install(EXPORT ${CMAKE_PROJECT_NAME}Targets
+  NAMESPACE ${CMAKE_PROJECT_NAME}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
 
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
   ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
diff --git a/ChangeLog.md b/ChangeLog.md
index b04ba36..176264d 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,3 +1,107 @@
+2.1 pre-beta
+============
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers.  The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+     Caveats:
+     - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+     - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms.  Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+     - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+     - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+     - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+     - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+     - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+    The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds.  32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds.  32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. When decompressing progressive Huffman-encoded JPEG images, the block
+smoothing algorithm that the libjpeg API library optionally applies is now more
+fault-tolerant.  Previously, if a particular scan was incomplete, then the
+smoothing parameters for the incomplete scan would be applied to the entire
+output image, including the parts of the image that were generated by the prior
+(complete) scan.  Visually, this had the effect of removing block smoothing
+from lower-frequency scans if they were followed by an incomplete
+higher-frequency scan.  libjpeg-turbo now applies block smoothing parameters to
+each iMCU row based on which scan generated the pixels in that row, rather than
+always using the block smoothing parameters for the most recent scan.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm 64-bit
+(Armv8) platforms.  This speeds up the compression of full-color progressive
+JPEGs by about 30-40% on average (relative to libjpeg-turbo 2.0.x) when using
+modern Armv8 CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+     - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+     - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries.  This facilitates using libjpeg-turbo with CMake's
+`find_package()` function.  For example:
+
+        find_package(libjpeg-turbo CONFIG REQUIRED)
+
+        add_executable(libjpeg_program libjpeg_program.c)
+        target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+        add_executable(libjpeg_program_static libjpeg_program.c)
+        target_link_libraries(libjpeg_program_static PUBLIC
+          libjpeg-turbo::jpeg-static)
+
+        add_executable(turbojpeg_program turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program PUBLIC
+          libjpeg-turbo::turbojpeg)
+
+        add_executable(turbojpeg_program_static turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program_static PUBLIC
+          libjpeg-turbo::turbojpeg-static)
+
+
 2.0.6
 =====
 
diff --git a/README.md b/README.md
index 1ff632e..da57040 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/appveyor.yml b/appveyor.yml
index 36af19b..1e5f557 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -20,16 +20,18 @@
 
       7z x c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z -o"c:\Program Files\mingw-w64\x86_64-6.4.0-posix-seh-rt_v5-rev0" > c:\installers\mingw64.install.log
 
-      set INCLUDE=c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\include
+      set PATH=c:\nasm-2.10.01;c:\Program Files (x86)\NSIS;c:\msys64\usr\bin;%PATH%
 
-      set LIB=c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\lib\x64
+      "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvars64.bat"
 
-      set PATH=c:\nasm-2.10.01;c:\Program Files (x86)\NSIS;c:\msys64\usr\bin;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\amd64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\Common7\IDE;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\bin\x64;c:\Program Files (x86)\Microsoft SDKs\Windows\v7.1A\bin;%PATH%
+      set INCLUDE
+
+      set LIB
+
+      set PATH
 
       set MSYSTEM=MINGW32
 
-      bash -c "pacman --noconfirm -S zip"
-
       mklink /d "%ProgramData%\Oracle\Java32" "c:\Program Files (x86)\Java\jdk1.6.0"
 
       git clone --depth=1 https://github.com/libjpeg-turbo/buildscripts.git -b %APPVEYOR_REPO_BRANCH% c:/buildscripts
@@ -49,8 +51,6 @@
 
       move c:\ljt.nightly\files\*.exe .
 
-      move c:\ljt.nightly\files\*.zip .
-
       move c:\ljt.nightly\log-windows.txt .
 
 artifacts:
@@ -63,9 +63,6 @@
   - path: '*-vc*.exe'
     name: SDK for Visual C++
 
-  - path: '*.zip'
-    name: Windows JNI JARs
-
   - path: 'log-windows.txt'
     name: Build log
 
diff --git a/cderror.h b/cderror.h
index 4f2c7a3..25f95d3 100644
--- a/cderror.h
+++ b/cderror.h
@@ -84,23 +84,6 @@
 JMESSAGE(JTRC_PPM_TEXT, "%ux%u text PPM image")
 #endif /* PPM_SUPPORTED */
 
-#ifdef RLE_SUPPORTED
-JMESSAGE(JERR_RLE_BADERROR, "Bogus error code from RLE library")
-JMESSAGE(JERR_RLE_COLORSPACE, "RLE output must be grayscale or RGB")
-JMESSAGE(JERR_RLE_DIMENSIONS, "Image dimensions (%ux%u) too large for RLE")
-JMESSAGE(JERR_RLE_EMPTY, "Empty RLE file")
-JMESSAGE(JERR_RLE_EOF, "Premature EOF in RLE header")
-JMESSAGE(JERR_RLE_MEM, "Insufficient memory for RLE header")
-JMESSAGE(JERR_RLE_NOT, "Not an RLE file")
-JMESSAGE(JERR_RLE_TOOMANYCHANNELS, "Cannot handle %d output channels for RLE")
-JMESSAGE(JERR_RLE_UNSUPPORTED, "Cannot handle this RLE setup")
-JMESSAGE(JTRC_RLE, "%ux%u full-color RLE file")
-JMESSAGE(JTRC_RLE_FULLMAP, "%ux%u full-color RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_GRAY, "%ux%u grayscale RLE file")
-JMESSAGE(JTRC_RLE_MAPGRAY, "%ux%u grayscale RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_MAPPED, "%ux%u colormapped RLE file with map of length %d")
-#endif /* RLE_SUPPORTED */
-
 #ifdef TARGA_SUPPORTED
 JMESSAGE(JERR_TGA_BADCMAP, "Unsupported Targa colormap format")
 JMESSAGE(JERR_TGA_BADPARMS, "Invalid or unsupported Targa file")
diff --git a/cdjpeg.c b/cdjpeg.c
index e0e382d..5278c1d 100644
--- a/cdjpeg.c
+++ b/cdjpeg.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,26 +25,37 @@
  * Optional progress monitor: display a percent-done figure on stderr.
  */
 
-#ifdef PROGRESS_REPORT
-
 METHODDEF(void)
 progress_monitor(j_common_ptr cinfo)
 {
   cd_progress_ptr prog = (cd_progress_ptr)cinfo->progress;
-  int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done =
-    (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
 
-  if (percent_done != prog->percent_done) {
-    prog->percent_done = percent_done;
-    if (total_passes > 1) {
-      fprintf(stderr, "\rPass %d/%d: %3d%% ",
-              prog->pub.completed_passes + prog->completed_extra_passes + 1,
-              total_passes, percent_done);
-    } else {
-      fprintf(stderr, "\r %3d%% ", percent_done);
+  if (prog->max_scans != 0 && cinfo->is_decompressor) {
+    int scan_no = ((j_decompress_ptr)cinfo)->input_scan_number;
+
+    if (scan_no > (int)prog->max_scans) {
+      fprintf(stderr, "Scan number %d exceeds maximum scans (%d)\n", scan_no,
+              prog->max_scans);
+      exit(EXIT_FAILURE);
     }
-    fflush(stderr);
+  }
+
+  if (prog->report) {
+    int total_passes = prog->pub.total_passes + prog->total_extra_passes;
+    int percent_done =
+      (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
+
+    if (percent_done != prog->percent_done) {
+      prog->percent_done = percent_done;
+      if (total_passes > 1) {
+        fprintf(stderr, "\rPass %d/%d: %3d%% ",
+                prog->pub.completed_passes + prog->completed_extra_passes + 1,
+                total_passes, percent_done);
+      } else {
+        fprintf(stderr, "\r %3d%% ", percent_done);
+      }
+      fflush(stderr);
+    }
   }
 }
 
@@ -57,6 +68,8 @@
     progress->pub.progress_monitor = progress_monitor;
     progress->completed_extra_passes = 0;
     progress->total_extra_passes = 0;
+    progress->max_scans = 0;
+    progress->report = FALSE;
     progress->percent_done = -1;
     cinfo->progress = &progress->pub;
   }
@@ -73,8 +86,6 @@
   }
 }
 
-#endif
-
 
 /*
  * Case-insensitive matching of possibly-abbreviated keyword switches.
diff --git a/cdjpeg.h b/cdjpeg.h
index 9868a0b..d283d42 100644
--- a/cdjpeg.h
+++ b/cdjpeg.h
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -56,9 +56,9 @@
   void (*finish_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
   /* Re-calculate buffer dimensions based on output dimensions (for use with
      partial image decompression.)  If this is NULL, then the output format
-     does not support partial image decompression (BMP and RLE, in particular,
-     cannot support partial decompression because they use an inversion buffer
-     to write the image in bottom-up order.) */
+     does not support partial image decompression (BMP, in particular, cannot
+     support partial decompression because it uses an inversion buffer to write
+     the image in bottom-up order.) */
   void (*calc_buffer_dimensions) (j_decompress_ptr cinfo,
                                   djpeg_dest_ptr dinfo);
 
@@ -87,6 +87,9 @@
   struct jpeg_progress_mgr pub; /* fields known to JPEG library */
   int completed_extra_passes;   /* extra passes completed */
   int total_extra_passes;       /* total extra */
+  JDIMENSION max_scans;         /* abort if the number of scans exceeds this
+                                   value and the value is non-zero */
+  boolean report;               /* whether or not to report progress */
   /* last printed percentage stored here to avoid multiple printouts */
   int percent_done;
 };
@@ -104,8 +107,6 @@
 EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo);
 EXTERN(cjpeg_source_ptr) jinit_read_ppm(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_ppm(j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_rle(j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_rle(j_decompress_ptr cinfo);
 EXTERN(cjpeg_source_ptr) jinit_read_targa(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_targa(j_decompress_ptr cinfo);
 
diff --git a/cjpeg.1 b/cjpeg.1
index a3e47ba..0bf6a5e 100644
--- a/cjpeg.1
+++ b/cjpeg.1
@@ -1,4 +1,4 @@
-.TH CJPEG 1 "18 March 2017"
+.TH CJPEG 1 "18 December 2019"
 .SH NAME
 cjpeg \- compress an image file to a JPEG file
 .SH SYNOPSIS
@@ -16,8 +16,7 @@
 compresses the named image file, or the standard input if no file is
 named, and produces a JPEG/JFIF file on the standard output.
 The currently supported input file formats are: PPM (PBMPLUS color
-format), PGM (PBMPLUS grayscale format), BMP, Targa, and RLE (Utah Raster
-Toolkit format).  (RLE is supported only if the URT library is available.)
+format), PGM (PBMPLUS grayscale format), BMP, and Targa.
 .SH OPTIONS
 All switch names may be abbreviated; for example,
 .B \-grayscale
@@ -215,6 +214,9 @@
 way of testing the in-memory destination manager (jpeg_mem_dest()), but it is
 also useful for benchmarking, since it reduces the I/O overhead.
 .TP
+.BI \-report
+Report compression progress.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
diff --git a/cjpeg.c b/cjpeg.c
index 52a6005..a74ecf9 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013-2014, 2017, D. R. Commander.
+ * Copyright (C) 2010, 2013-2014, 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -69,9 +69,9 @@
  *     2) assume we can push back more than one character (works in
  *        some C implementations, but unportable);
  *     3) provide our own buffering (breaks input readers that want to use
- *        stdio directly, such as the RLE library);
+ *        stdio directly);
  * or  4) don't put back the data, and modify the input_init methods to assume
- *        they start reading after the start of file (also breaks RLE library).
+ *        they start reading after the start of file.
  * #1 is attractive for MS-DOS but is untenable on Unix.
  *
  * The most portable solution for file types that can't be identified by their
@@ -117,10 +117,6 @@
   case 'P':
     return jinit_read_ppm(cinfo);
 #endif
-#ifdef RLE_SUPPORTED
-  case 'R':
-    return jinit_read_rle(cinfo);
-#endif
 #ifdef TARGA_SUPPORTED
   case 0x00:
     return jinit_read_targa(cinfo);
@@ -147,6 +143,7 @@
 static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memdst;                 /* for -memdst switch */
+boolean report;                 /* for -report switch */
 
 
 LOCAL(void)
@@ -200,6 +197,7 @@
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
+  fprintf(stderr, "  -report        Report compression progress\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -244,6 +242,7 @@
   icc_filename = NULL;
   outfilename = NULL;
   memdst = FALSE;
+  report = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -395,6 +394,9 @@
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -505,9 +507,7 @@
 {
   struct jpeg_compress_struct cinfo;
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
-#endif
   int file_index;
   cjpeg_source_ptr src_mgr;
   FILE *input_file;
@@ -628,9 +628,10 @@
     fclose(icc_file);
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
-#endif
+  if (report) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+  }
 
   /* Figure out the input file format, and set up to read it. */
   src_mgr = select_file_type(&cinfo, input_file);
@@ -676,9 +677,8 @@
   if (output_file != stdout && output_file != NULL)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memdst) {
     fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
diff --git a/cmakescripts/BuildPackages.cmake b/cmakescripts/BuildPackages.cmake
index 277c72f..367b78a 100644
--- a/cmakescripts/BuildPackages.cmake
+++ b/cmakescripts/BuildPackages.cmake
@@ -78,12 +78,14 @@
 
 if(MSVC)
   set(INST_PLATFORM "Visual C++")
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
+  set(INST_ID vc)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-${INST_ID})
   set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
 elseif(MINGW)
   set(INST_PLATFORM GCC)
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
+  set(INST_ID gcc)
+  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-${INST_ID})
+  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-${INST_ID})
   set(INST_DEFS -DGCC)
 endif()
 
@@ -107,6 +109,12 @@
 string(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
 
 configure_file(release/installer.nsi.in installer.nsi @ONLY)
+# TODO: It would be nice to eventually switch to CPack and eliminate this mess,
+# but not today.
+configure_file(win/projectTargets.cmake.in
+  win/${CMAKE_PROJECT_NAME}Targets.cmake @ONLY)
+configure_file(win/${INST_ID}/projectTargets-release.cmake.in
+  win/${CMAKE_PROJECT_NAME}Targets-release.cmake @ONLY)
 
 if(WITH_JAVA)
   set(JAVA_DEPEND turbojpeg-java)
@@ -121,36 +129,13 @@
 
 
 ###############################################################################
-# Cygwin Package
-###############################################################################
-
-if(CYGWIN)
-
-configure_file(release/makecygwinpkg.in pkgscripts/makecygwinpkg)
-
-add_custom_target(cygwinpkg pkgscripts/makecygwinpkg)
-
-endif() # CYGWIN
-
-
-###############################################################################
 # Mac DMG
 ###############################################################################
 
 if(APPLE)
 
-set(DEFAULT_OSX_32BIT_BUILD ${CMAKE_SOURCE_DIR}/osxx86)
-set(OSX_32BIT_BUILD ${DEFAULT_OSX_32BIT_BUILD} CACHE PATH
-  "Directory containing 32-bit (i386) Mac build to include in universal binaries (default: ${DEFAULT_OSX_32BIT_BUILD})")
-set(DEFAULT_IOS_ARMV7_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7)
-set(IOS_ARMV7_BUILD ${DEFAULT_IOS_ARMV7_BUILD} CACHE PATH
-  "Directory containing Armv7 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7_BUILD})")
-set(DEFAULT_IOS_ARMV7S_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7s)
-set(IOS_ARMV7S_BUILD ${DEFAULT_IOS_ARMV7S_BUILD} CACHE PATH
-  "Directory containing Armv7s iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7S_BUILD})")
-set(DEFAULT_IOS_ARMV8_BUILD ${CMAKE_SOURCE_DIR}/iosarmv8)
-set(IOS_ARMV8_BUILD ${DEFAULT_IOS_ARMV8_BUILD} CACHE PATH
-  "Directory containing Armv8 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV8_BUILD})")
+set(IOS_ARMV8_BUILD "" CACHE PATH
+  "Directory containing Armv8 iOS build to include in universal binaries")
 
 set(OSX_APP_CERT_NAME "" CACHE STRING
   "Name of the Developer ID Application certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo DMG.  Leave this blank to generate an unsigned DMG.")
@@ -159,14 +144,12 @@
 
 configure_file(release/makemacpkg.in pkgscripts/makemacpkg)
 configure_file(release/Distribution.xml.in pkgscripts/Distribution.xml)
+configure_file(release/Welcome.rtf.in pkgscripts/Welcome.rtf)
 configure_file(release/uninstall.in pkgscripts/uninstall)
 
 add_custom_target(dmg pkgscripts/makemacpkg
   SOURCES pkgscripts/makemacpkg)
 
-add_custom_target(udmg pkgscripts/makemacpkg universal
-  SOURCES pkgscripts/makemacpkg)
-
 endif() # APPLE
 
 
@@ -187,3 +170,12 @@
 configure_file(release/libjpeg.pc.in pkgscripts/libjpeg.pc @ONLY)
 
 configure_file(release/libturbojpeg.pc.in pkgscripts/libturbojpeg.pc @ONLY)
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+  pkgscripts/${CMAKE_PROJECT_NAME}ConfigVersion.cmake
+  VERSION ${VERSION} COMPATIBILITY AnyNewerVersion)
+
+configure_package_config_file(release/Config.cmake.in
+  pkgscripts/${CMAKE_PROJECT_NAME}Config.cmake
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
diff --git a/cmakescripts/GNUInstallDirs.cmake b/cmakescripts/GNUInstallDirs.cmake
index 7c41196..6408fca 100644
--- a/cmakescripts/GNUInstallDirs.cmake
+++ b/cmakescripts/GNUInstallDirs.cmake
@@ -118,6 +118,7 @@
 #   absolute paths where necessary, using the same logic.
 
 #=============================================================================
+# Copyright 2018 Matthias Räncker
 # Copyright 2016, 2019 D. R. Commander
 # Copyright 2016 Dmitry Marakasov
 # Copyright 2016 Roger Leigh
@@ -259,6 +260,8 @@
       else()
         if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
           set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+        elseif(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+          set(CMAKE_INSTALL_DEFAULT_LIBDIR "libx32")
         endif()
       endif()
     endif()
diff --git a/djpeg.1 b/djpeg.1
index e4204b2..f442471 100644
--- a/djpeg.1
+++ b/djpeg.1
@@ -1,4 +1,4 @@
-.TH DJPEG 1 "13 November 2017"
+.TH DJPEG 1 "18 December 2019"
 .SH NAME
 djpeg \- decompress a JPEG file to an image file
 .SH SYNOPSIS
@@ -15,8 +15,7 @@
 .B djpeg
 decompresses the named JPEG file, or the standard input if no file is named,
 and produces an image file on the standard output.  PBMPLUS (PPM/PGM), BMP,
-GIF, Targa, or RLE (Utah Raster Toolkit) output format can be selected.
-(RLE is supported only if the URT library is available.)
+GIF, or Targa output format can be selected.
 .SH OPTIONS
 All switch names may be abbreviated; for example,
 .B \-grayscale
@@ -100,9 +99,6 @@
 .B \-grayscale
 is specified; otherwise PPM is emitted.
 .TP
-.B \-rle
-Select RLE output format.  (Requires URT library.)
-.TP
 .B \-targa
 Select Targa output format.  Grayscale format is emitted if the JPEG file is
 grayscale or if
@@ -190,6 +186,19 @@
 .B \-max 4m
 selects 4000000 bytes.  If more space is needed, an error will occur.
 .TP
+.BI \-maxscans " N"
+Abort if the JPEG image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes.)  Attempting to decompress such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
 .BI \-outfile " name"
 Send output image to the named file, not to standard output.
 .TP
@@ -197,6 +206,9 @@
 Load input file into memory before decompressing.  This feature was implemented
 mainly as a way of testing the in-memory source manager (jpeg_mem_src().)
 .TP
+.BI \-report
+Report decompression progress.
+.TP
 .BI \-skip " Y0,Y1"
 Decompress all rows of the JPEG image except those between Y0 and Y1
 (inclusive.)  Note that if decompression scaling is being used, then Y0 and Y1
@@ -210,6 +222,12 @@
 scaled image dimensions.  Currently this option only works with the
 PBMPLUS (PPM/PGM), GIF, and Targa output formats.
 .TP
+.BI \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the JPEG image contains incomplete or corrupt image data.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
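
The `-maxscans` and `-strict` switches documented above demonstrate standard libjpeg API hooks: a progress monitor that checks `input_scan_number`, and an `emit_message` override, as the cdjpeg.c changes above and the djpeg.c changes below show.  The following is a minimal sketch of how an application might wire up the same guards directly; the scan limit and function names are illustrative, not part of the library:

    #include <stdio.h>
    #include <stdlib.h>
    #include <jpeglib.h>

    #define MAX_ALLOWED_SCANS  100  /* application-chosen limit (assumption) */

    /* Treat libjpeg warnings (msg_level < 0) as fatal, like djpeg -strict. */
    static void strict_emit_message(j_common_ptr cinfo, int msg_level)
    {
      if (msg_level < 0)
        cinfo->err->error_exit(cinfo);
      else if (cinfo->err->trace_level >= msg_level)
        cinfo->err->output_message(cinfo);
    }

    /* Progress monitor that aborts once the scan count exceeds the limit,
     * like djpeg -maxscans. */
    static void limit_scans(j_common_ptr cinfo)
    {
      if (cinfo->is_decompressor &&
          ((j_decompress_ptr)cinfo)->input_scan_number > MAX_ALLOWED_SCANS) {
        fprintf(stderr, "Too many scans in JPEG input\n");
        exit(EXIT_FAILURE);
      }
    }

    int main(int argc, char **argv)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;
      struct jpeg_progress_mgr progress;
      FILE *infile;

      if (argc < 2 || (infile = fopen(argv[1], "rb")) == NULL)
        return EXIT_FAILURE;
      cinfo.err = jpeg_std_error(&jerr);
      jerr.emit_message = strict_emit_message;      /* -strict equivalent */
      jpeg_create_decompress(&cinfo);
      progress.progress_monitor = limit_scans;      /* -maxscans equivalent */
      cinfo.progress = &progress;
      jpeg_stdio_src(&cinfo, infile);
      jpeg_read_header(&cinfo, TRUE);
      /* ... jpeg_start_decompress() / jpeg_read_scanlines() as usual; the
       * progress monitor is invoked by the library during decompression ... */
      jpeg_destroy_decompress(&cinfo);
      fclose(infile);
      return EXIT_SUCCESS;
    }
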
diff --git a/djpeg.c b/djpeg.c
index fb4e7a6..fb01573 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2013 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2017, 2020, D. R. Commander.
+ * Copyright (C) 2010-2011, 2013-2017, 2019-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -71,7 +71,6 @@
   FMT_GIF,                      /* GIF format */
   FMT_OS2,                      /* BMP format (OS/2 flavor) */
   FMT_PPM,                      /* PPM/PGM (PBMPLUS formats) */
-  FMT_RLE,                      /* RLE format */
   FMT_TARGA,                    /* Targa format */
   FMT_TIFF                      /* TIFF format */
 } IMAGE_FORMATS;
@@ -94,11 +93,14 @@
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memsrc;                 /* for -memsrc switch */
+boolean report;                 /* for -report switch */
 boolean skip, crop;
 JDIMENSION skip_start, skip_end;
 JDIMENSION crop_x, crop_y, crop_width, crop_height;
+boolean strict;                 /* for -strict switch */
 #define INPUT_BUF_SIZE  4096
 
 
@@ -138,10 +140,6 @@
   fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
           (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
 #endif
-#ifdef RLE_SUPPORTED
-  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-          (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
-#endif
 #ifdef TARGA_SUPPORTED
   fprintf(stderr, "  -targa         Select Targa output format%s\n",
           (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
@@ -171,14 +169,16 @@
   fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
 #endif
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
 #endif
-
+  fprintf(stderr, "  -report        Report decompression progress\n");
   fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
   fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
   fprintf(stderr, "                 [requires PBMPLUS (PPM/PGM), GIF, or Targa output format]\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
@@ -203,10 +203,13 @@
   /* Set up default JPEG parameters. */
   requested_fmt = DEFAULT_FMT;  /* set default output file format */
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
   memsrc = FALSE;
+  report = FALSE;
   skip = FALSE;
   crop = FALSE;
+  strict = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -351,6 +354,12 @@
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "nosmooth", 3)) {
       /* Suppress fancy upsampling */
       cinfo->do_fancy_upsampling = FALSE;
@@ -383,9 +392,8 @@
       /* PPM/PGM output format. */
       requested_fmt = FMT_PPM;
 
-    } else if (keymatch(arg, "rle", 1)) {
-      /* RLE output format. */
-      requested_fmt = FMT_RLE;
+    } else if (keymatch(arg, "report", 2)) {
+      report = TRUE;
 
     } else if (keymatch(arg, "scale", 2)) {
       /* Scale the output image by a fraction M/N. */
@@ -413,6 +421,9 @@
         usage();
       crop = TRUE;
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "targa", 1)) {
       /* Targa output format. */
       requested_fmt = FMT_TARGA;
@@ -444,7 +455,7 @@
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
   datasrc->bytes_in_buffer--;
-  return GETJOCTET(*datasrc->next_input_byte++);
+  return *datasrc->next_input_byte++;
 }
 
 
@@ -499,6 +510,19 @@
 }
 
 
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -508,9 +532,7 @@
 {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
-#endif
   int file_index;
   djpeg_dest_ptr dest_mgr = NULL;
   FILE *input_file;
@@ -557,6 +579,9 @@
 
   file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
 
+  if (strict)
+    jerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -603,9 +628,11 @@
     output_file = write_stdout();
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
-#endif
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+    progress.max_scans = max_scans;
+  }
 
   /* Specify data source for decompression */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
@@ -661,11 +688,6 @@
     dest_mgr = jinit_write_ppm(&cinfo);
     break;
 #endif
-#ifdef RLE_SUPPORTED
-  case FMT_RLE:
-    dest_mgr = jinit_write_rle(&cinfo);
-    break;
-#endif
 #ifdef TARGA_SUPPORTED
   case FMT_TARGA:
     dest_mgr = jinit_write_targa(&cinfo);
@@ -781,12 +803,11 @@
     }
   }
 
-#ifdef PROGRESS_REPORT
   /* Hack: count final pass as done in case finish_output does an extra pass.
    * The library won't have updated completed_passes.
    */
-  progress.pub.completed_passes = progress.pub.total_passes;
-#endif
+  if (report || max_scans != 0)
+    progress.pub.completed_passes = progress.pub.total_passes;
 
   if (icc_filename != NULL) {
     FILE *icc_file;
@@ -825,9 +846,8 @@
   if (output_file != stdout)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memsrc)
     free(inbuffer);
diff --git a/java/README b/java/README
index 88ddc3b..5af1e31 100644
--- a/java/README
+++ b/java/README
@@ -38,7 +38,7 @@
 ----------------------
 
 The TurboJPEG Java Wrapper will look for the TurboJPEG JNI library
-(libturbojpeg.so, libturbojpeg.jnilib, or turbojpeg.dll) in the system library
+(libturbojpeg.so, libturbojpeg.dylib, or turbojpeg.dll) in the system library
 paths or in any paths specified in LD_LIBRARY_PATH (Un*x), DYLD_LIBRARY_PATH
 (Mac), or PATH (Windows.)  Failing this, on Un*x and Mac systems, the wrapper
 will look for the JNI library under the library directory configured when
diff --git a/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
index 65884e8..d8cc495 100644
--- a/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
+++ b/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2011-2013, 2016 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2011-2013, 2016, 2020 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,9 +36,9 @@
       String os = System.getProperty("os.name").toLowerCase();
       if (os.indexOf("mac") >= 0) {
         try {
-          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.jnilib");
+          System.load("@CMAKE_INSTALL_FULL_LIBDIR@/libturbojpeg.dylib");
         } catch (java.lang.UnsatisfiedLinkError e2) {
-          System.load("/usr/lib/libturbojpeg.jnilib");
+          System.load("/usr/lib/libturbojpeg.dylib");
         }
       } else {
         try {
diff --git a/jccolext.c b/jccolext.c
index 19c955c..303b322 100644
--- a/jccolext.c
+++ b/jccolext.c
@@ -48,9 +48,9 @@
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -100,9 +100,9 @@
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
@@ -135,9 +135,9 @@
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
-      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
-      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
     }
   }
diff --git a/jccolor.c b/jccolor.c
index 036f601..bdc563c 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -392,11 +392,11 @@
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
-      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
-      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+      r = MAXJSAMPLE - inptr[0];
+      g = MAXJSAMPLE - inptr[1];
+      b = MAXJSAMPLE - inptr[2];
       /* K passes through as-is */
-      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -438,7 +438,7 @@
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];
       inptr += instride;
     }
   }
@@ -497,7 +497,7 @@
         inptr = *input_buf;
         outptr = output_buf[ci][output_row];
         for (col = 0; col < num_cols; col++) {
-          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          outptr[col] = inptr[ci];
           inptr += nc;
         }
       }
diff --git a/jcdctmgr.c b/jcdctmgr.c
index c04058e..7dae17a 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -381,19 +381,19 @@
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
     }
 #endif
   }
@@ -533,20 +533,19 @@
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)
-                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
     }
 #endif
   }
diff --git a/jchuff.c b/jchuff.c
index db85ce1..6b21445 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -6,6 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -65,32 +66,42 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
+
+/* NOTE: The optimized Huffman encoding algorithm has not yet been
+ * implemented in the Arm Neon SIMD extensions, which is why we retain the old
+ * Huffman encoder behavior for that platform.
+ */
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__))
+typedef unsigned long long simd_bit_buf_type;
+#else
+typedef bit_buf_type simd_bit_buf_type;
+#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+    (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE  64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE  32
+#else
+#error Cannot determine word size
+#endif
+#define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)
+
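Illustrative note: the typedefs and BIT_BUF_SIZE logic above deliberately keep
a 64-bit accumulator on the Linux x32 ABI (__x86_64__ together with __ILP32__),
where size_t is only 32 bits even though the CPU has 64-bit registers.  A small
standalone sketch of that size difference (not part of the patch):

/* Sketch only: on x32, size_t is 4 bytes but unsigned long long is 8,
 * which is why the patch forces the wider type for the bit buffer. */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
#if defined(__x86_64__) && defined(__ILP32__)
  printf("x32: sizeof(size_t)=%zu sizeof(unsigned long long)=%zu\n",
         sizeof(size_t), sizeof(unsigned long long));   /* 4 and 8 */
#else
  printf("sizeof(size_t)=%zu\n", sizeof(size_t));
#endif
  return 0;
}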
 typedef struct {
-  size_t put_buffer;                    /* current bit-accumulation buffer */
-  int put_bits;                         /* # of bits now in it */
+  union {
+    bit_buf_type c;
+    simd_bit_buf_type simd;
+  } put_buffer;                         /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+                                        /* (Arm SIMD: # of bits now in it) */
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).put_buffer = (src).put_buffer, \
-   (dest).put_bits = (src).put_bits, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
@@ -123,6 +134,7 @@
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
 } working_state;
 
 
@@ -201,8 +213,17 @@
   }
 
   /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
+  if (entropy->simd) {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__arm__) || defined(__aarch64__)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  }
 
   /* Initialize restart stuff */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -334,94 +355,94 @@
 
 /* Outputting bits to the file */
 
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code.  In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it.  This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
  */
-
-#define EMIT_BYTE() { \
-  JOCTET c; \
-  put_bits -= 8; \
-  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
-  *buffer++ = c; \
-  if (c == 0xFF)  /* need to stuff a zero byte? */ \
-    *buffer++ = 0; \
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
 }
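Illustrative note: both bytes are always written by EMIT_BYTE(); the pointer
then advances by 2 for a 0xFF byte (keeping the stuffed 0x00) or by 1 otherwise
(so the next byte overwrites the speculative 0x00).  A standalone sketch of the
same logic (not part of the patch; emit_byte_sketch() is a hypothetical helper):

/* Sketch only: function equivalent of the EMIT_BYTE() macro above. */
#include <stdio.h>

static unsigned char *emit_byte_sketch(unsigned char *buffer, unsigned char b)
{
  buffer[0] = b;
  buffer[1] = 0;                        /* speculative stuff byte */
  return buffer + (b == 0xFF ? 2 : 1);
}

int main(void)
{
  unsigned char out[4], *p = out;
  p = emit_byte_sketch(p, 0x12);        /* advances by 1 */
  p = emit_byte_sketch(p, 0xFF);        /* advances by 2, keeping the 0x00 */
  printf("%02X %02X %02X\n", out[0], out[1], out[2]);   /* 12 FF 00 */
  return 0;
}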
 
-#define PUT_BITS(code, size) { \
-  put_bits += size; \
-  put_buffer = (put_buffer << size) | code; \
-}
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
 
-#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
-
-#define CHECKBUF15() { \
-  if (put_bits > 15) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 56); \
+    buffer[1] = (JOCTET)(put_buffer >> 48); \
+    buffer[2] = (JOCTET)(put_buffer >> 40); \
+    buffer[3] = (JOCTET)(put_buffer >> 32); \
+    buffer[4] = (JOCTET)(put_buffer >> 24); \
+    buffer[5] = (JOCTET)(put_buffer >> 16); \
+    buffer[6] = (JOCTET)(put_buffer >> 8); \
+    buffer[7] = (JOCTET)(put_buffer); \
+    buffer += 8; \
   } \
 }
 
-#endif
-
-#define CHECKBUF31() { \
-  if (put_bits > 31) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#define CHECKBUF47() { \
-  if (put_bits > 47) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
-#endif
-
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
-  CHECKBUF47() \
-  PUT_BITS(code, size) \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  CHECKBUF31() \
-  PUT_BITS(code, size) \
-  PUT_BITS(temp2, nbits) \
-}
-
 #else
 
-#define EMIT_BITS(code, size) { \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-  PUT_BITS(temp2, nbits) \
-  CHECKBUF15() \
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 24); \
+    buffer[1] = (JOCTET)(put_buffer >> 16); \
+    buffer[2] = (JOCTET)(put_buffer >> 8); \
+    buffer[3] = (JOCTET)(put_buffer); \
+    buffer += 4; \
+  } \
 }
 
 #endif
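Illustrative note: the fast-path test in FLUSH() is a conservative "does any
byte equal 0xFF?" check.  A byte position contributes a set high bit only if
its high bit is set and adding 1 clears it, i.e. the byte is 0xFF; carries from
lower bytes can only create false positives (taking the slow path needlessly),
never false negatives.  The per-byte idea as a standalone sketch (not part of
the patch):

/* Sketch only: the byte-wise condition behind the FLUSH() fast-path test. */
#include <stdio.h>

static int byte_needs_stuffing(unsigned char b)
{
  /* High bit set in b, and adding 1 clears it => b == 0xFF. */
  return (b & 0x80 & (unsigned char)~(b + 1)) != 0;
}

int main(void)
{
  printf("%d %d %d\n",
         byte_needs_stuffing(0xFF),     /* 1 */
         byte_needs_stuffing(0x80),     /* 0 */
         byte_needs_stuffing(0x7F));    /* 0 */
  return 0;
}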
 
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) { \
+  temp &= (((JLONG)1) << nbits) - 1; \
+  temp |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(temp, nbits) \
+}
+
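Illustrative note: free_bits counts down from BIT_BUF_SIZE as codes are
appended; only when it would go negative does PUT_AND_FLUSH() top up the
accumulator with the leading bits of the code, emit it, and carry the code's
remaining bits over into the now-emptied buffer.  A simplified, function-based
model of that accounting, assuming a 64-bit buffer and ignoring 0xFF stuffing
(sketch only, not part of the patch):

/* Sketch only: simplified model of PUT_BITS()/PUT_AND_FLUSH(). */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_BUF_SIZE  64

static uint64_t put_buffer = 0;
static int free_bits = SKETCH_BUF_SIZE;

static void put_bits_sketch(uint64_t code, int size)
{
  free_bits -= size;
  if (free_bits < 0) {
    /* Fill the accumulator with the leading bits of code, then flush it. */
    uint64_t full = (put_buffer << (size + free_bits)) | (code >> -free_bits);
    printf("flush 0x%016llx\n", (unsigned long long)full);
    free_bits += SKETCH_BUF_SIZE;
    put_buffer = code;                  /* only the bits that did not fit stay live */
  } else {
    put_buffer = (put_buffer << size) | code;
  }
}

int main(void)
{
  int i;
  for (i = 0; i < 10; i++)
    put_bits_sketch(0x2A, 7);           /* ten 7-bit codes force one flush */
  printf("free_bits = %d\n", free_bits);  /* 64 - 70 + 64 = 58 */
  return 0;
}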
 
 /* Although it is exceedingly rare, it is possible for a Huffman-encoded
  * coefficient block to be larger than the 128-byte unencoded block.  For each
@@ -444,6 +465,7 @@
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
+    size_t bytes, bytestocopy; \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
@@ -466,20 +488,46 @@
 LOCAL(boolean)
 flush_bits(working_state *state)
 {
-  JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  JOCTET _buffer[BUFSIZE], *buffer, temp;
+  simd_bit_buf_type put_buffer;  int put_bits;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  if (state->simd) {
+#if defined(__arm__) || defined(__aarch64__)
+    put_bits = state->cur.free_bits;
+#else
+    put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+    put_buffer = state->cur.put_buffer.simd;
+  } else {
+    put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+    put_buffer = state->cur.put_buffer.c;
+  }
+
   LOAD_BUFFER()
 
-  /* fill any partial byte with ones */
-  PUT_BITS(0x7F, 7)
-  while (put_bits >= 8) EMIT_BYTE()
+  while (put_bits >= 8) {
+    put_bits -= 8;
+    temp = (JOCTET)(put_buffer >> put_bits);
+    EMIT_BYTE(temp)
+  }
+  if (put_bits) {
+    /* fill partial byte with ones */
+    temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+    EMIT_BYTE(temp)
+  }
 
-  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
-  state->cur.put_bits = 0;
+  if (state->simd) {                    /* and reset bit buffer to empty */
+    state->cur.put_buffer.simd = 0;
+#if defined(__arm__) || defined(__aarch64__)
+    state->cur.free_bits = 0;
+#else
+    state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    state->cur.put_buffer.c = 0;
+    state->cur.free_bits = BIT_BUF_SIZE;
+  }
   STORE_BUFFER()
 
   return TRUE;
@@ -493,7 +541,7 @@
                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
   LOAD_BUFFER()
 
@@ -509,53 +557,41 @@
 encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2, temp3;
-  int nbits;
-  int r, code, size;
+  int temp, nbits, free_bits;
+  bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  free_bits = state->cur.free_bits;
+  put_buffer = state->cur.put_buffer.c;
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
+  temp = block[0] - last_dc_val;
 
   /* This is a well-known technique for obtaining the absolute value without a
    * branch.  It is derived from an assembly language technique presented in
    * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-   * Agner Fog.
+   * Agner Fog.  This code assumes we are on a two's complement machine.
    */
-  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-  temp ^= temp3;
-  temp -= temp3;
-
-  /* For a negative input, want temp2 = bitwise complement of abs(input) */
-  /* This code assumes we are on a two's complement machine */
-  temp2 += temp3;
+  nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+  temp += nbits;
+  nbits ^= temp;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = JPEG_NBITS(temp);
+  nbits = JPEG_NBITS(nbits);
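Illustrative note: after the three branchless lines above, nbits holds the
magnitude of the DC difference (before being converted to a bit count by
JPEG_NBITS()) and temp holds the value to emit: the difference itself if
non-negative, or its bitwise complement if negative, as section F.1.2.1
requires.  A numeric sketch (not part of the patch; nbits_of() is a simple
stand-in for JPEG_NBITS(), and an arithmetic right shift of negative ints is
assumed, as the surrounding code already assumes two's complement):

/* Sketch only: the branchless absolute value / complement used above. */
#include <limits.h>
#include <stdio.h>

static int nbits_of(int magnitude)       /* stand-in for JPEG_NBITS() */
{
  int n = 0;
  while (magnitude) { n++; magnitude >>= 1; }
  return n;
}

static void dc_prepare(int temp)
{
  int nbits = temp >> (CHAR_BIT * sizeof(int) - 1);  /* -1 if negative, else 0 */
  temp += nbits;                          /* negative: temp becomes ~|original| */
  nbits ^= temp;                          /* nbits becomes |original| */
  printf("emit %d low bits of 0x%02X\n", nbits_of(nbits), temp & 0xFF);
}

int main(void)
{
  dc_prepare(5);    /* emit 3 low bits of 0x05 (101) */
  dc_prepare(-5);   /* emit 3 low bits of 0xFA (low 3 bits are 010) */
  return 0;
}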
 
-  /* Emit the Huffman-coded symbol for the number of bits */
-  code = dctbl->ehufco[nbits];
-  size = dctbl->ehufsi[nbits];
-  EMIT_BITS(code, size)
-
-  /* Mask off any extra bits in code */
-  temp2 &= (((JLONG)1) << nbits) - 1;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  EMIT_BITS(temp2, nbits)
+  /* Emit the Huffman-coded symbol for the number of bits.
+   * Emit that number of bits of the value, if positive,
+   * or the complement of its magnitude, if negative.
+   */
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
-  r = 0;                        /* r = run length of zeros */
+  {
+    int r = 0;                  /* r = 16 * (run length of zeros) */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
@@ -563,51 +599,46 @@
  */
 #define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    r++; \
+    r += 16; \
   } else { \
-    temp2 = temp; \
     /* Branch-less absolute value, bitwise complement, etc., same as above */ \
-    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
-    temp ^= temp3; \
-    temp -= temp3; \
-    temp2 += temp3; \
-    nbits = JPEG_NBITS_NONZERO(temp); \
+    nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp += nbits; \
+    nbits ^= temp; \
+    nbits = JPEG_NBITS_NONZERO(nbits); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    while (r > 15) { \
-      EMIT_BITS(code_0xf0, size_0xf0) \
-      r -= 16; \
+    while (r >= 16 * 16) { \
+      r -= 16 * 16; \
+      PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits; \
-    code = actbl->ehufco[temp3]; \
-    size = actbl->ehufsi[temp3]; \
-    EMIT_CODE(code, size) \
+    r += nbits; \
+    PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
     r = 0; \
   } \
 }
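Illustrative note: because the zero-run counter r now accumulates 16 per zero,
the final r += nbits directly forms the usual RRRRSSSS symbol index
((run << 4) | nbits) with no extra shift, and the run-of-16 test becomes
r >= 256.  A trivial sketch of the equivalence (not part of the patch):

/* Sketch only: the AC symbol index computed both ways. */
#include <stdio.h>

int main(void)
{
  int run = 3, nbits = 5;
  int symbol_old = (run << 4) + nbits;    /* what the removed code computed */
  int symbol_new = run * 16 + nbits;      /* r accumulates 16 per zero */
  printf("%d %d\n", symbol_old, symbol_new);   /* both print 53 */
  return 0;
}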
 
-  /* One iteration for each value in jpeg_natural_order[] */
-  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  kloop(55);  kloop(62);  kloop(63);
+    /* One iteration for each value in jpeg_natural_order[] */
+    kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
+    kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
+    kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
+    kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
+    kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
+    kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
+    kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
+    kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
+    kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
+    kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
+    kloop(55);  kloop(62);  kloop(63);
 
-  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) {
-    code = actbl->ehufco[0];
-    size = actbl->ehufsi[0];
-    EMIT_BITS(code, size)
+    /* If the last coef(s) were zero, emit an end-of-block code */
+    if (r > 0) {
+      PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+    }
   }
 
-  state->cur.put_buffer = put_buffer;
-  state->cur.put_bits = put_bits;
+  state->cur.put_buffer.c = put_buffer;
+  state->cur.free_bits = free_bits;
   STORE_BUFFER()
 
   return TRUE;
@@ -654,8 +685,9 @@
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
@@ -694,7 +726,7 @@
   /* Completed MCU, so update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 
   /* Update restart-interval state too */
   if (cinfo->restart_interval) {
@@ -723,8 +755,9 @@
   /* Load up working state ... flush_bits needs it */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Flush out the last data */
   if (!flush_bits(&state))
@@ -733,7 +766,7 @@
   /* Update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 }
 
 
diff --git a/jconfig.h.in b/jconfig.h.in
index 18a69a4..d4284d9 100644
--- a/jconfig.h.in
+++ b/jconfig.h.in
@@ -61,11 +61,6 @@
    unsigned. */
 #cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-  #cmakedefine __CHAR_UNSIGNED__ 1
-#endif
-
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
 
diff --git a/jconfig.txt b/jconfig.txt
index 90cd724..21f35c1 100644
--- a/jconfig.txt
+++ b/jconfig.txt
@@ -42,12 +42,6 @@
  */
 /* #define const */
 
-/* Define this if an ordinary "char" type is unsigned.
- * If you're not sure, leaving it undefined will work at some cost in speed.
- * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
- */
-#undef __CHAR_UNSIGNED__
-
 /* Define this if your system has an ANSI-conforming <stddef.h> file.
  */
 #define HAVE_STDDEF_H
@@ -118,7 +112,6 @@
 #define BMP_SUPPORTED           /* BMP image file format */
 #define GIF_SUPPORTED           /* GIF image file format */
 #define PPM_SUPPORTED           /* PBMPLUS PPM/PGM image file format */
-#undef RLE_SUPPORTED            /* Utah RLE image file format */
 #define TARGA_SUPPORTED         /* Targa image file format */
 
 /* Define this if you want to name both input and output files on the command
diff --git a/jcsample.c b/jcsample.c
index bd27b84..e8515eb 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -103,7 +103,7 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
@@ -174,7 +174,7 @@
       for (v = 0; v < v_expand; v++) {
         inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG)GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)(*inptr++);
         }
       }
       *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
@@ -237,8 +237,7 @@
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + bias) >> 1);
+      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -277,8 +276,7 @@
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                   GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + bias) >> 2);
+        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2;  inptr1 += 2;
     }
@@ -337,33 +335,25 @@
     below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+    neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
-      membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+      membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
       /* sum of edge-neighbor pixels */
-      neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+      neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+                 inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
-      neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+      neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
@@ -372,15 +362,11 @@
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+    neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
@@ -429,21 +415,18 @@
     below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
-    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-             GETJSAMPLE(*inptr);
-    membersum = GETJSAMPLE(*inptr++);
-    nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                 GETJSAMPLE(*inptr);
+    colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+    membersum = *inptr++;
+    nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
-      membersum = GETJSAMPLE(*inptr++);
+      membersum = *inptr++;
       above_ptr++;  below_ptr++;
-      nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                   GETJSAMPLE(*inptr);
+      nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
       *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
@@ -451,7 +434,7 @@
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr);
+    membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
diff --git a/jdapistd.c b/jdapistd.c
index 38bd111..bd68bdc 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2018, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -532,6 +532,8 @@
          * decoded coefficients.  This is ~5% faster for large subsets, but
          * it's tough to tell a difference for smaller images.
          */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
         (*cinfo->entropy->decode_mcu) (cinfo, NULL);
       }
     }
diff --git a/jdarith.c b/jdarith.c
index 6002481..cbbde24 100644
--- a/jdarith.c
+++ b/jdarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2018, D. R. Commander.
+ * Copyright (C) 2015-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -80,7 +80,7 @@
     if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
-  return GETJOCTET(*src->next_input_byte++);
+  return *src->next_input_byte++;
 }
 
 
@@ -665,12 +665,16 @@
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
       int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+      int *prev_coef_bit_ptr =
+        &cinfo->coef_bits[cindex + cinfo->num_components][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
         int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
         if (cinfo->Ah != expected)
           WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+        if (cinfo->input_scan_number > 1)
+          prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
         coef_bit_ptr[coefi] = cinfo->Al;
       }
     }
@@ -727,6 +731,7 @@
   entropy->c = 0;
   entropy->a = 0;
   entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
+  entropy->pub.insufficient_data = FALSE;
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -763,7 +768,7 @@
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components * DCTSIZE2 *
+                                  cinfo->num_components * 2 * DCTSIZE2 *
                                   sizeof(int));
     coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/jdcoefct.c b/jdcoefct.c
index 2ba6aa1..ea2febd 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -102,6 +102,8 @@
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       jzero_far((void *)coef->MCU_buffer[0],
                 (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
@@ -227,6 +229,8 @@
           }
         }
       }
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       /* Try to fetch the MCU. */
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
@@ -356,8 +360,8 @@
   int ci, coefi;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtable;
-  int *coef_bits;
-  int *coef_bits_latch;
+  int *coef_bits, *prev_coef_bits;
+  int *coef_bits_latch, *prev_coef_bits_latch;
 
   if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -366,9 +370,11 @@
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components *
+                                  cinfo->num_components * 2 *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
+  prev_coef_bits_latch =
+    &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -385,15 +391,19 @@
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
+    prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
     if (coef_bits[0] < 0)
       return FALSE;
     /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
     for (coefi = 1; coefi <= 5; coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
         smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
+    prev_coef_bits_latch += SAVED_COEFS;
   }
 
   return smoothing_useful;
@@ -478,8 +488,15 @@
          (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
       first_row = TRUE;
     }
-    /* Fetch component-dependent info */
-    coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+    /* Fetch component-dependent info.
+     * If the current scan is incomplete, then we use the component-dependent
+     * info from the previous scan.
+     */
+    if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+      coef_bits =
+        coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+    else
+      coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
     quanttbl = compptr->quant_table;
     Q00 = quanttbl->quantval[0];
     Q01 = quanttbl->quantval[Q01_POS];
diff --git a/jdcol565.c b/jdcol565.c
index 40068ef..53c7bd9 100644
--- a/jdcol565.c
+++ b/jdcol565.c
@@ -45,9 +45,9 @@
     outptr = *output_buf++;
 
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -58,18 +58,18 @@
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -80,9 +80,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -125,9 +125,9 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -139,9 +139,9 @@
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,9 +165,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -202,32 +202,32 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
       WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = GETJSAMPLE(*inptr0);
-      g = GETJSAMPLE(*inptr1);
-      b = GETJSAMPLE(*inptr2);
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
@@ -259,24 +259,24 @@
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
@@ -284,9 +284,9 @@
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
diff --git a/jdcolext.c b/jdcolext.c
index 72a5301..863c7a2 100644
--- a/jdcolext.c
+++ b/jdcolext.c
@@ -53,9 +53,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
@@ -93,7 +93,6 @@
     inptr = input_buf[0][input_row++];
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -128,7 +127,6 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
diff --git a/jdcolor.c b/jdcolor.c
index dc0e3b6..5bcc1fc 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -341,9 +341,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
+      r = inptr0[col];
+      g = inptr1[col];
+      b = inptr2[col];
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
                                ctab[b + B_Y_OFF]) >> SCALEBITS);
@@ -550,9 +550,9 @@
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
@@ -560,7 +560,7 @@
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
diff --git a/jdhuff.c b/jdhuff.c
index a112817..b5665d5 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -39,24 +40,6 @@
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -325,7 +308,7 @@
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
-      c = GETJOCTET(*next_input_byte++);
+      c = *next_input_byte++;
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
@@ -342,7 +325,7 @@
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
           }
           bytes_in_buffer--;
-          c = GETJOCTET(*next_input_byte++);
+          c = *next_input_byte++;
         } while (c == 0xFF);
 
         if (c == 0) {
@@ -405,8 +388,8 @@
 
 #define GET_BYTE { \
   register int c0, c1; \
-  c0 = GETJOCTET(*buffer++); \
-  c1 = GETJOCTET(*buffer); \
+  c0 = *buffer++; \
+  c1 = *buffer; \
   /* Pre-execute most common case */ \
   get_buffer = (get_buffer << 8) | c0; \
   bits_left += 8; \
@@ -423,7 +406,7 @@
   } \
 }
 
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -568,7 +551,7 @@
 
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -653,7 +636,7 @@
 
   /* Completed MCU, so update state */
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -671,7 +654,7 @@
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
   buffer = (JOCTET *)br_state.next_input_byte;
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -740,7 +723,7 @@
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
diff --git a/jdhuff.h b/jdhuff.h
index 6a8d90f..ac6e0e5 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -78,6 +79,11 @@
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                 /* size of buffer in bits */
+
 #else
 
 typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
diff --git a/jdicc.c b/jdicc.c
index 7224695..a1a5b86 100644
--- a/jdicc.c
+++ b/jdicc.c
@@ -38,18 +38,18 @@
     marker->marker == ICC_MARKER &&
     marker->data_length >= ICC_OVERHEAD_LEN &&
     /* verify the identifying string */
-    GETJOCTET(marker->data[0]) == 0x49 &&
-    GETJOCTET(marker->data[1]) == 0x43 &&
-    GETJOCTET(marker->data[2]) == 0x43 &&
-    GETJOCTET(marker->data[3]) == 0x5F &&
-    GETJOCTET(marker->data[4]) == 0x50 &&
-    GETJOCTET(marker->data[5]) == 0x52 &&
-    GETJOCTET(marker->data[6]) == 0x4F &&
-    GETJOCTET(marker->data[7]) == 0x46 &&
-    GETJOCTET(marker->data[8]) == 0x49 &&
-    GETJOCTET(marker->data[9]) == 0x4C &&
-    GETJOCTET(marker->data[10]) == 0x45 &&
-    GETJOCTET(marker->data[11]) == 0x0;
+    marker->data[0] == 0x49 &&
+    marker->data[1] == 0x43 &&
+    marker->data[2] == 0x43 &&
+    marker->data[3] == 0x5F &&
+    marker->data[4] == 0x50 &&
+    marker->data[5] == 0x52 &&
+    marker->data[6] == 0x4F &&
+    marker->data[7] == 0x46 &&
+    marker->data[8] == 0x49 &&
+    marker->data[9] == 0x4C &&
+    marker->data[10] == 0x45 &&
+    marker->data[11] == 0x0;
 }
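Illustrative note: the twelve byte comparisons above match the ASCII
identifier "ICC_PROFILE" followed by a NUL terminator (0x49 = 'I',
0x43 = 'C', ..., 0x45 = 'E', 0x00).  An equivalent memcmp()-based check, as a
standalone sketch (not part of the patch):

/* Sketch only: the same identifier test expressed with memcmp(). */
#include <stdio.h>
#include <string.h>

int main(void)
{
  const unsigned char id[12] = { 0x49, 0x43, 0x43, 0x5F, 0x50, 0x52,
                                 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00 };
  /* "ICC_PROFILE" is 11 characters; comparing 12 bytes includes the NUL. */
  printf("%d\n", memcmp(id, "ICC_PROFILE", 12) == 0);   /* prints 1 */
  return 0;
}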
 
 
@@ -102,12 +102,12 @@
   for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
     if (marker_is_icc(marker)) {
       if (num_markers == 0)
-        num_markers = GETJOCTET(marker->data[13]);
-      else if (num_markers != GETJOCTET(marker->data[13])) {
+        num_markers = marker->data[13];
+      else if (num_markers != marker->data[13]) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
         return FALSE;
       }
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       if (seq_no <= 0 || seq_no > num_markers) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
         return FALSE;
@@ -154,7 +154,7 @@
       JOCTET FAR *src_ptr;
       JOCTET *dst_ptr;
       unsigned int length;
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       dst_ptr = icc_data + data_offset[seq_no];
       src_ptr = marker->data + ICC_OVERHEAD_LEN;
       length = data_length[seq_no];
diff --git a/jdmarker.c b/jdmarker.c
index c9c7ef6..b964c3a 100644
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -151,7 +151,7 @@
 #define INPUT_BYTE(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = GETJOCTET(*next_input_byte++); )
+            V = *next_input_byte++; )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
@@ -159,10 +159,10 @@
 #define INPUT_2BYTES(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = ((unsigned int)GETJOCTET(*next_input_byte++)) << 8; \
+            V = ((unsigned int)(*next_input_byte++)) << 8; \
             MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V += GETJOCTET(*next_input_byte++); )
+            V += *next_input_byte++; )
 
 
 /*
@@ -608,18 +608,18 @@
   JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x49 &&
-      GETJOCTET(data[3]) == 0x46 &&
-      GETJOCTET(data[4]) == 0) {
+      data[0] == 0x4A &&
+      data[1] == 0x46 &&
+      data[2] == 0x49 &&
+      data[3] == 0x46 &&
+      data[4] == 0) {
     /* Found JFIF APP0 marker: save info */
     cinfo->saw_JFIF_marker = TRUE;
-    cinfo->JFIF_major_version = GETJOCTET(data[5]);
-    cinfo->JFIF_minor_version = GETJOCTET(data[6]);
-    cinfo->density_unit = GETJOCTET(data[7]);
-    cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
-    cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+    cinfo->JFIF_major_version = data[5];
+    cinfo->JFIF_minor_version = data[6];
+    cinfo->density_unit = data[7];
+    cinfo->X_density = (data[8] << 8) + data[9];
+    cinfo->Y_density = (data[10] << 8) + data[11];
     /* Check version.
      * Major version must be 1, anything else signals an incompatible change.
      * (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,24 +634,22 @@
              cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
              cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
-    if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
-      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-               GETJOCTET(data[12]), GETJOCTET(data[13]));
+    if (data[12] | data[13])
+      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
     totallen -= APP0_DATA_LEN;
-    if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG)3))
+    if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-             GETJOCTET(data[0]) == 0x4A &&
-             GETJOCTET(data[1]) == 0x46 &&
-             GETJOCTET(data[2]) == 0x58 &&
-             GETJOCTET(data[3]) == 0x58 &&
-             GETJOCTET(data[4]) == 0) {
+             data[0] == 0x4A &&
+             data[1] == 0x46 &&
+             data[2] == 0x58 &&
+             data[3] == 0x58 &&
+             data[4] == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
-    switch (GETJOCTET(data[5])) {
+    switch (data[5]) {
     case 0x10:
       TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
@@ -662,8 +660,7 @@
       TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
-      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int)totallen);
+      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
       break;
     }
   } else {
@@ -684,16 +681,16 @@
   unsigned int version, flags0, flags1, transform;
 
   if (datalen >= APP14_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x41 &&
-      GETJOCTET(data[1]) == 0x64 &&
-      GETJOCTET(data[2]) == 0x6F &&
-      GETJOCTET(data[3]) == 0x62 &&
-      GETJOCTET(data[4]) == 0x65) {
+      data[0] == 0x41 &&
+      data[1] == 0x64 &&
+      data[2] == 0x6F &&
+      data[3] == 0x62 &&
+      data[4] == 0x65) {
     /* Found Adobe APP14 marker */
-    version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
-    flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
-    flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
-    transform = GETJOCTET(data[11]);
+    version = (data[5] << 8) + data[6];
+    flags0 = (data[7] << 8) + data[8];
+    flags1 = (data[9] << 8) + data[10];
+    transform = data[11];
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
     cinfo->Adobe_transform = (UINT8)transform;
diff --git a/jdmaster.c b/jdmaster.c
index b209064..bc39d1c 100644
--- a/jdmaster.c
+++ b/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -580,6 +580,7 @@
    */
   cinfo->master->first_iMCU_col = 0;
   cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+  cinfo->master->last_good_iMCU_row = 0;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
diff --git a/jdmrg565.c b/jdmrg565.c
index 53f1e16..980a4e2 100644
--- a/jdmrg565.c
+++ b/jdmrg565.c
@@ -43,20 +43,20 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -68,12 +68,12 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -115,21 +115,21 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -142,12 +142,12 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -189,20 +189,20 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -211,13 +211,13 @@
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -229,20 +229,20 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -287,21 +287,21 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -311,14 +311,14 @@
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     d1 = DITHER_ROTATE(d1);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -331,20 +331,20 @@
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
diff --git a/jdmrgext.c b/jdmrgext.c
index c9a44d8..9bf4f1a 100644
--- a/jdmrgext.c
+++ b/jdmrgext.c
@@ -46,13 +46,13 @@
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -60,7 +60,7 @@
     outptr[RGB_ALPHA] = 0xFF;
 #endif
     outptr += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -71,12 +71,12 @@
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -120,13 +120,13 @@
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -134,7 +134,7 @@
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -142,7 +142,7 @@
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -150,7 +150,7 @@
     outptr1[RGB_ALPHA] = 0xFF;
 #endif
     outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -161,19 +161,19 @@
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
diff --git a/jdphuff.c b/jdphuff.c
index 9e82636..5ab99d6 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,25 +41,6 @@
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).EOBRUN = (src).EOBRUN, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -102,7 +83,7 @@
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
-  int *coef_bit_ptr;
+  int *coef_bit_ptr, *prev_coef_bit_ptr;
   jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
@@ -143,12 +124,15 @@
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
     coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+    prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
       coef_bit_ptr[coefi] = cinfo->Al;
     }
   }
@@ -323,7 +307,7 @@
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
+    state = entropy->saved;
 
     /* Outer loop handles each block in the MCU */
 
@@ -356,7 +340,7 @@
 
     /* Completed MCU, so update state */
     BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
+    entropy->saved = state;
   }
 
   /* Account for restart interval (no-op if not using restarts) */
@@ -676,7 +660,7 @@
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                cinfo->num_components * DCTSIZE2 *
+                                cinfo->num_components * 2 * DCTSIZE2 *
                                 sizeof(int));
   coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/jdsample.c b/jdsample.c
index 50a68b3..da8f151 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -177,7 +177,7 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       for (h = h_expand; h > 0; h--) {
         *outptr++ = invalue;
       }
@@ -213,7 +213,7 @@
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -242,7 +242,7 @@
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -283,20 +283,20 @@
     inptr = input_data[inrow];
     outptr = output_data[inrow];
     /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
+    invalue = *inptr++;
     *outptr++ = (JSAMPLE)invalue;
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      invalue = (*inptr++) * 3;
+      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
+    invalue = *inptr;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
     *outptr++ = (JSAMPLE)invalue;
   }
 }
@@ -338,7 +338,7 @@
       outptr = output_data[outrow++];
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
-        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        thiscolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
@@ -381,8 +381,8 @@
       outptr = output_data[outrow++];
 
       /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+      thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+      nextcolsum = (*inptr0++) * 3 + (*inptr1++);
       *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
       *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
       lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -390,7 +390,7 @@
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        nextcolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
         lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
diff --git a/jmorecfg.h b/jmorecfg.h
index d0b9300..78f4bf1 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -43,25 +43,11 @@
 
 #if BITS_IN_JSAMPLE == 8
 /* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int)(value))
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int)(value))
-#else
-#define GETJSAMPLE(value)  ((int)(value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 #define MAXJSAMPLE      255
 #define CENTERJSAMPLE   128
 
@@ -97,22 +83,9 @@
  * managers, this is also the data type passed to fread/fwrite.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value)  (value)
-#else
-#define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
@@ -123,15 +96,7 @@
 
 /* UINT8 must hold at least the values 0..255. */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char UINT8;
-#else /* not __CHAR_UNSIGNED__ */
-typedef short UINT8;
-#endif /* __CHAR_UNSIGNED__ */
-#endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
 
diff --git a/jpegint.h b/jpegint.h
index ad36ca8..195fbcb 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -158,6 +158,9 @@
   JDIMENSION first_MCU_col[MAX_COMPONENTS];
   JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
+
+  /* Last iMCU row that was successfully decoded */
+  JDIMENSION last_good_iMCU_row;
 };
 
 /* Input control module */
diff --git a/jpegtran.1 b/jpegtran.1
index 2efb264..3eb472d 100644
--- a/jpegtran.1
+++ b/jpegtran.1
@@ -1,4 +1,4 @@
-.TH JPEGTRAN 1 "18 March 2017"
+.TH JPEGTRAN 1 "18 December 2019"
 .SH NAME
 jpegtran \- lossless transformation of JPEG files
 .SH SYNOPSIS
@@ -229,9 +229,31 @@
 .B \-max 4m
 selects 4000000 bytes.  If more space is needed, an error will occur.
 .TP
+.BI \-maxscans " N"
+Abort if the input image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes.)  Attempting to transform such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
 .BI \-outfile " name"
 Send output image to the named file, not to standard output.
 .TP
+.BI \-report
+Report transformation progress.
+.TP
+.BI \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the input image contains incomplete or corrupt image data.
+.TP
 .B \-verbose
 Enable debug printout.  More
 .BR \-v 's
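Note on the new \-maxscans and \-strict entries above: any libjpeg-based application can reproduce the same safeguards through the library's standard hooks. The sketch below is illustrative only — the MAX_SCANS constant and the function names (scan_limit_monitor, strict_emit_message, decompress_guarded) are assumptions of this example, not part of the library. A progress-monitor callback aborts once the decoder has reported more scans than allowed, and an emit_message override promotes warnings to fatal errors, which is essentially what jpegtran's new my_emit_message() handler in the next file does.

#include <stdio.h>
#include <jpeglib.h>

#define MAX_SCANS  100   /* hypothetical limit, analogous to "-maxscans 100" */

/* Abort decompression once the input contains too many scans. */
static void scan_limit_monitor(j_common_ptr cinfo)
{
  j_decompress_ptr dinfo = (j_decompress_ptr)cinfo;

  if (dinfo->input_scan_number > MAX_SCANS)
    cinfo->err->error_exit(cinfo);
}

/* Treat all warnings (msg_level < 0) as fatal, analogous to "-strict". */
static void strict_emit_message(j_common_ptr cinfo, int msg_level)
{
  if (msg_level < 0)
    cinfo->err->error_exit(cinfo);
  else if (cinfo->err->trace_level >= msg_level)
    cinfo->err->output_message(cinfo);
}

static void decompress_guarded(FILE *infile)
{
  struct jpeg_decompress_struct cinfo;
  struct jpeg_error_mgr jerr;
  struct jpeg_progress_mgr progress;

  cinfo.err = jpeg_std_error(&jerr);
  jerr.emit_message = strict_emit_message;
  jpeg_create_decompress(&cinfo);

  progress.progress_monitor = scan_limit_monitor;
  cinfo.progress = &progress;

  jpeg_stdio_src(&cinfo, infile);
  jpeg_read_header(&cinfo, TRUE);
  /* ... jpeg_start_decompress(), jpeg_read_scanlines(), etc. ... */
  jpeg_destroy_decompress(&cinfo);
}

The command-line programs route the same checks through the shared cdjpeg progress manager (see the src_progress.max_scans assignment in the jpegtran.c changes below) rather than hand-rolling a callback as this sketch does.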
diff --git a/jpegtran.c b/jpegtran.c
index 28cde2f..a02922b 100644
--- a/jpegtran.c
+++ b/jpegtran.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2014, 2017, D. R. Commander.
+ * Copyright (C) 2010, 2014, 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,7 +41,10 @@
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
+boolean report;                 /* for -report switch */
+boolean strict;                 /* for -strict switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
@@ -87,7 +90,10 @@
   fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -report        Report transformation progress\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -141,7 +147,10 @@
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
+  report = FALSE;
+  strict = FALSE;
   copyoption = JCOPYOPT_DEFAULT;
   transformoption.transform = JXFORM_NONE;
   transformoption.perfect = FALSE;
@@ -261,6 +270,12 @@
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
       /* Enable entropy parm optimization. */
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -293,6 +308,9 @@
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -338,6 +356,9 @@
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "transpose", 1)) {
       /* Transpose (across UL-to-LR axis). */
       select_transform(JXFORM_TRANSPOSE);
@@ -375,6 +396,19 @@
 }
 
 
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Treat warning as fatal */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -385,9 +419,7 @@
   struct jpeg_decompress_struct srcinfo;
   struct jpeg_compress_struct dstinfo;
   struct jpeg_error_mgr jsrcerr, jdsterr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
+  struct cdjpeg_progress_mgr src_progress, dst_progress;
   jvirt_barray_ptr *src_coef_arrays;
   jvirt_barray_ptr *dst_coef_arrays;
   int file_index;
@@ -427,6 +459,9 @@
   jsrcerr.trace_level = jdsterr.trace_level;
   srcinfo.mem->max_memory_to_use = dstinfo.mem->max_memory_to_use;
 
+  if (strict)
+    jsrcerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -492,9 +527,15 @@
       copyoption = JCOPYOPT_ALL_EXCEPT_ICC;
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&dstinfo, &progress);
-#endif
+  if (report) {
+    start_progress_monitor((j_common_ptr)&dstinfo, &dst_progress);
+    dst_progress.report = report;
+  }
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&srcinfo, &src_progress);
+    src_progress.report = report;
+    src_progress.max_scans = max_scans;
+  }
 
   /* Specify data source for decompression */
   jpeg_stdio_src(&srcinfo, fp);
@@ -587,9 +628,10 @@
   if (fp != stdout)
     fclose(fp);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&dstinfo);
-#endif
+  if (report)
+    end_progress_monitor((j_common_ptr)&dstinfo);
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&srcinfo);
 
   free(icc_profile);
 
diff --git a/jquant1.c b/jquant1.c
index 40bbb28..73b83e1 100644
--- a/jquant1.c
+++ b/jquant1.c
@@ -479,7 +479,7 @@
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += colorindex[ci][*ptrin++];
       }
       *ptrout++ = (JSAMPLE)pixcode;
     }
@@ -506,9 +506,9 @@
     ptrin = input_buf[row];
     ptrout = output_buf[row];
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
+      pixcode  = colorindex0[*ptrin++];
+      pixcode += colorindex1[*ptrin++];
+      pixcode += colorindex2[*ptrin++];
       *ptrout++ = (JSAMPLE)pixcode;
     }
   }
@@ -552,7 +552,7 @@
          * required amount of padding.
          */
         *output_ptr +=
-          colorindex_ci[GETJSAMPLE(*input_ptr) + dither[col_index]];
+          colorindex_ci[*input_ptr + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -595,12 +595,9 @@
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  =
-        GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + dither0[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + dither1[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + dither2[col_index]]);
+      pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
+      pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+      pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
       *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
@@ -677,15 +674,15 @@
          * The maximum error is +- MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
-        cur += GETJSAMPLE(*input_ptr);
-        cur = GETJSAMPLE(range_limit[cur]);
+        cur += *input_ptr;
+        cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
-        pixcode = GETJSAMPLE(colorindex_ci[cur]);
+        pixcode = colorindex_ci[cur];
         *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
-        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        cur -= colormap_ci[pixcode];
         /* Compute error fractions to be propagated to adjacent pixels.
          * Add these into the running sums, and simultaneously shift the
          * next-line error sums left by 1 column.
diff --git a/jquant2.c b/jquant2.c
index 6570613..44efb18 100644
--- a/jquant2.c
+++ b/jquant2.c
@@ -215,9 +215,9 @@
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = &histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                        [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                        [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[ptr[0] >> C0_SHIFT]
+                        [ptr[1] >> C1_SHIFT]
+                        [ptr[2] >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -665,7 +665,7 @@
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = GETJSAMPLE(cinfo->colormap[0][i]);
+    x = cinfo->colormap[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
       min_dist = tdist * tdist;
@@ -688,7 +688,7 @@
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[1][i]);
+    x = cinfo->colormap[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
       min_dist += tdist * tdist;
@@ -710,7 +710,7 @@
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[2][i]);
+    x = cinfo->colormap[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
       min_dist += tdist * tdist;
@@ -788,13 +788,13 @@
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
 
   for (i = 0; i < numcolors; i++) {
-    icolor = GETJSAMPLE(colorlist[i]);
+    icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
+    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
     dist0 = inc0 * inc0;
-    inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
+    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
     dist0 += inc1 * inc1;
-    inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
+    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
     dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
@@ -879,7 +879,7 @@
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
       cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell)(GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)((*cptr++) + 1);
       }
     }
   }
@@ -909,9 +909,9 @@
     outptr = output_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the cache */
-      c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
-      c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
-      c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
+      c0 = (*inptr++) >> C0_SHIFT;
+      c1 = (*inptr++) >> C1_SHIFT;
+      c2 = (*inptr++) >> C2_SHIFT;
       cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
@@ -996,12 +996,12 @@
        * The maximum error is +- MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
-      cur0 += GETJSAMPLE(inptr[0]);
-      cur1 += GETJSAMPLE(inptr[1]);
-      cur2 += GETJSAMPLE(inptr[2]);
-      cur0 = GETJSAMPLE(range_limit[cur0]);
-      cur1 = GETJSAMPLE(range_limit[cur1]);
-      cur2 = GETJSAMPLE(range_limit[cur2]);
+      cur0 += inptr[0];
+      cur1 += inptr[1];
+      cur2 += inptr[2];
+      cur0 = range_limit[cur0];
+      cur1 = range_limit[cur1];
+      cur2 = range_limit[cur2];
       /* Index into the cache with adjusted pixel value */
       cachep =
         &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
@@ -1015,9 +1015,9 @@
         register int pixcode = *cachep - 1;
         *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
-        cur0 -= GETJSAMPLE(colormap0[pixcode]);
-        cur1 -= GETJSAMPLE(colormap1[pixcode]);
-        cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        cur0 -= colormap0[pixcode];
+        cur1 -= colormap1[pixcode];
+        cur2 -= colormap2[pixcode];
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
diff --git a/rdbmp.c b/rdbmp.c
index 51af237..f386285 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -34,18 +34,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
@@ -179,14 +169,14 @@
   outptr = source->pub.buffer[0];
   if (cinfo->in_color_space == JCS_GRAYSCALE) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       *outptr++ = colormap[0][t];
     }
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       rgb_to_cmyk(colormap[0][t], colormap[1][t], colormap[2][t], outptr,
@@ -202,7 +192,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -213,7 +203,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -258,7 +248,6 @@
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       outptr += 4;
@@ -272,7 +261,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = 0xFF;
@@ -280,7 +269,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr += ps;
@@ -323,7 +312,6 @@
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       inptr++;                          /* skip the 4th byte (Alpha channel) */
@@ -338,7 +326,7 @@
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = *inptr++;
@@ -346,7 +334,7 @@
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         inptr++;                        /* skip the 4th byte (Alpha channel) */
diff --git a/rdcolmap.c b/rdcolmap.c
index cbbef59..d2ed95c 100644
--- a/rdcolmap.c
+++ b/rdcolmap.c
@@ -54,9 +54,8 @@
 
   /* Check for duplicate color. */
   for (index = 0; index < ncolors; index++) {
-    if (GETJSAMPLE(colormap0[index]) == R &&
-        GETJSAMPLE(colormap1[index]) == G &&
-        GETJSAMPLE(colormap2[index]) == B)
+    if (colormap0[index] == R && colormap1[index] == G &&
+        colormap2[index] == B)
       return;                   /* color is already in map */
   }
 
diff --git a/rdppm.c b/rdppm.c
index a8507b9..508c8c4 100644
--- a/rdppm.c
+++ b/rdppm.c
@@ -43,18 +43,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
diff --git a/rdrle.c b/rdrle.c
deleted file mode 100644
index b694514..0000000
--- a/rdrle.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * rdrle.c
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains routines to read input images in Utah RLE format.
- * The Utah Raster Toolkit library is required (version 3.1 or later).
- *
- * These routines may need modification for non-Unix environments or
- * specialized applications.  As they stand, they assume input from
- * an ordinary stdio stream.  They further assume that reading begins
- * at the start of the file; start_input may need work if the
- * user interface has already read some data (e.g., to determine that
- * the file is indeed RLE format).
- *
- * Based on code contributed by Mike Lijewski,
- * with updates from Robert Hutchinson.
- */
-
-#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
-
-#ifdef RLE_SUPPORTED
-
-/* rle.h is provided by the Utah Raster Toolkit. */
-
-#include <rle.h>
-
-/*
- * We assume that JSAMPLE has the same representation as rle_pixel,
- * to wit, "unsigned char".  Hence we can't cope with 12- or 16-bit samples.
- */
-
-#if BITS_IN_JSAMPLE != 8
-  Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
-#endif
-
-/*
- * We support the following types of RLE files:
- *
- *   GRAYSCALE   - 8 bits, no colormap
- *   MAPPEDGRAY  - 8 bits, 1 channel colormap
- *   PSEUDOCOLOR - 8 bits, 3 channel colormap
- *   TRUECOLOR   - 24 bits, 3 channel colormap
- *   DIRECTCOLOR - 24 bits, no colormap
- *
- * For now, we ignore any alpha channel in the image.
- */
-
-typedef enum
-  { GRAYSCALE, MAPPEDGRAY, PSEUDOCOLOR, TRUECOLOR, DIRECTCOLOR } rle_kind;
-
-
-/*
- * Since RLE stores scanlines bottom-to-top, we have to invert the image
- * to conform to JPEG's top-to-bottom order.  To do this, we read the
- * incoming image into a virtual array on the first get_pixel_rows call,
- * then fetch the required row from the virtual array on subsequent calls.
- */
-
-typedef struct _rle_source_struct *rle_source_ptr;
-
-typedef struct _rle_source_struct {
-  struct cjpeg_source_struct pub; /* public fields */
-
-  rle_kind visual;              /* actual type of input file */
-  jvirt_sarray_ptr image;       /* virtual array to hold the image */
-  JDIMENSION row;               /* current row # in the virtual array */
-  rle_hdr header;               /* Input file information */
-  rle_pixel **rle_row;          /* holds a row returned by rle_getrow() */
-
-} rle_source_struct;
-
-
-/*
- * Read the file header; return image size and component count.
- */
-
-METHODDEF(void)
-start_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION width, height;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /* Use RLE library routine to get the header info */
-  source->header = *rle_hdr_init(NULL);
-  source->header.rle_file = source->pub.input_file;
-  switch (rle_get_setup(&(source->header))) {
-  case RLE_SUCCESS:
-    /* A-OK */
-    break;
-  case RLE_NOT_RLE:
-    ERREXIT(cinfo, JERR_RLE_NOT);
-    break;
-  case RLE_NO_SPACE:
-    ERREXIT(cinfo, JERR_RLE_MEM);
-    break;
-  case RLE_EMPTY:
-    ERREXIT(cinfo, JERR_RLE_EMPTY);
-    break;
-  case RLE_EOF:
-    ERREXIT(cinfo, JERR_RLE_EOF);
-    break;
-  default:
-    ERREXIT(cinfo, JERR_RLE_BADERROR);
-    break;
-  }
-
-  /* Figure out what we have, set private vars and return values accordingly */
-
-  width  = source->header.xmax - source->header.xmin + 1;
-  height = source->header.ymax - source->header.ymin + 1;
-  source->header.xmin = 0;              /* realign horizontally */
-  source->header.xmax = width - 1;
-
-  cinfo->image_width      = width;
-  cinfo->image_height     = height;
-  cinfo->data_precision   = 8;  /* we can only handle 8 bit data */
-
-  if (source->header.ncolors == 1 && source->header.ncmap == 0) {
-    source->visual     = GRAYSCALE;
-    TRACEMS2(cinfo, 1, JTRC_RLE_GRAY, width, height);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 1) {
-    source->visual     = MAPPEDGRAY;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPGRAY, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 3) {
-    source->visual     = PSEUDOCOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPPED, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 3) {
-    source->visual     = TRUECOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_FULLMAP, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 0) {
-    source->visual     = DIRECTCOLOR;
-    TRACEMS2(cinfo, 1, JTRC_RLE, width, height);
-  } else
-    ERREXIT(cinfo, JERR_RLE_UNSUPPORTED);
-
-  if (source->visual == GRAYSCALE || source->visual == MAPPEDGRAY) {
-    cinfo->in_color_space   = JCS_GRAYSCALE;
-    cinfo->input_components = 1;
-  } else {
-    cinfo->in_color_space   = JCS_RGB;
-    cinfo->input_components = 3;
-  }
-
-  /*
-   * A place to hold each scanline while it's converted.
-   * (GRAYSCALE scanlines don't need converting)
-   */
-  if (source->visual != GRAYSCALE) {
-    source->rle_row = (rle_pixel **)(*cinfo->mem->alloc_sarray)
-      ((j_common_ptr)cinfo, JPOOL_IMAGE,
-       (JDIMENSION)width, (JDIMENSION)cinfo->input_components);
-  }
-
-  /* request a virtual array to hold the image */
-  source->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION)(width * source->header.ncolors),
-     (JDIMENSION)height, (JDIMENSION)1);
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    /* count file input as separate pass */
-    progress->total_extra_passes++;
-  }
-#endif
-
-  source->pub.buffer_height = 1;
-}
-
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for GRAYSCALE, MAPPEDGRAY, TRUECOLOR, and DIRECTCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_rle_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-
-  source->row--;
-  source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  return 1;
-}
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for PSEUDOCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_pseudocolor_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JSAMPROW src_row, dest_row;
-  JDIMENSION col;
-  rle_map *colormap;
-  int val;
-
-  colormap = source->header.cmap;
-  dest_row = source->pub.buffer[0];
-  source->row--;
-  src_row = *(*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  for (col = cinfo->image_width; col > 0; col--) {
-    val = GETJSAMPLE(*src_row++);
-    *dest_row++ = (JSAMPLE)(colormap[val      ] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 256] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 512] >> 8);
-  }
-
-  return 1;
-}
-
-
-/*
- * Load the image into a virtual array.  We have to do this because RLE
- * files start at the lower left while the JPEG standard has them starting
- * in the upper left.  This is called the first time we want to get a row
- * of input.  What we do is load the RLE data into the array and then call
- * the appropriate routine to read one row from the array.  Before returning,
- * we set source->pub.get_pixel_rows so that subsequent calls go straight to
- * the appropriate row-reading routine.
- */
-
-METHODDEF(JDIMENSION)
-load_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION row, col;
-  JSAMPROW scanline, red_ptr, green_ptr, blue_ptr;
-  rle_pixel **rle_row;
-  rle_map *colormap;
-  char channel;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  colormap = source->header.cmap;
-  rle_row = source->rle_row;
-
-  /* Read the RLE data into our virtual array.
-   * We assume here that rle_pixel is represented the same as JSAMPLE.
-   */
-  RLE_CLR_BIT(source->header, RLE_ALPHA); /* don't read the alpha channel */
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->pub.pass_limit = cinfo->image_height;
-    progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-  }
-#endif
-
-  switch (source->visual) {
-
-  case GRAYSCALE:
-  case PSEUDOCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case MAPPEDGRAY:
-  case TRUECOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_row = source->rle_row;
-      rle_getrow(&source->header, rle_row);
-
-      for (col = 0; col < cinfo->image_width; col++) {
-        for (channel = 0; channel < source->header.ncolors; channel++) {
-          *scanline++ = (JSAMPLE)
-            (colormap[GETJSAMPLE(rle_row[channel][col]) + 256 * channel] >> 8);
-        }
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case DIRECTCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-
-      red_ptr   = rle_row[0];
-      green_ptr = rle_row[1];
-      blue_ptr  = rle_row[2];
-
-      for (col = cinfo->image_width; col > 0; col--) {
-        *scanline++ = *red_ptr++;
-        *scanline++ = *green_ptr++;
-        *scanline++ = *blue_ptr++;
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  }
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL)
-    progress->completed_extra_passes++;
-#endif
-
-  /* Set up to call proper row-extraction routine in future */
-  if (source->visual == PSEUDOCOLOR) {
-    source->pub.buffer = source->rle_row;
-    source->pub.get_pixel_rows = get_pseudocolor_row;
-  } else {
-    source->pub.get_pixel_rows = get_rle_row;
-  }
-  source->row = cinfo->image_height;
-
-  /* And fetch the topmost (bottommost) row */
-  return (*source->pub.get_pixel_rows) (cinfo, sinfo);
-}
-
-
-/*
- * Finish up at the end of the file.
- */
-
-METHODDEF(void)
-finish_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  /* no work */
-}
-
-
-/*
- * The module selection routine for RLE format input.
- */
-
-GLOBAL(cjpeg_source_ptr)
-jinit_read_rle(j_compress_ptr cinfo)
-{
-  rle_source_ptr source;
-
-  /* Create module interface object */
-  source = (rle_source_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(rle_source_struct));
-  /* Fill in method ptrs */
-  source->pub.start_input = start_input_rle;
-  source->pub.finish_input = finish_input_rle;
-  source->pub.get_pixel_rows = load_image;
-
-  return (cjpeg_source_ptr)source;
-}
-
-#endif /* RLE_SUPPORTED */
diff --git a/rdtarga.c b/rdtarga.c
index 37bd286..cd6e93d 100644
--- a/rdtarga.c
+++ b/rdtarga.c
@@ -28,18 +28,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
diff --git a/release/Config.cmake.in b/release/Config.cmake.in
new file mode 100644
index 0000000..0c1ba8a
--- /dev/null
+++ b/release/Config.cmake.in
@@ -0,0 +1,4 @@
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/@CMAKE_PROJECT_NAME@Targets.cmake")
+check_required_components("@CMAKE_PROJECT_NAME@")
diff --git a/release/ReadMe.txt b/release/ReadMe.txt
index 0d1888d..64fc294 100644
--- a/release/ReadMe.txt
+++ b/release/ReadMe.txt
@@ -1,4 +1,4 @@
-libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86 and x86-64 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
+libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can still outperform libjpeg by a significant amount, by virtue of its highly-optimized Huffman coding routines.  In many cases, the performance of libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
 
 libjpeg-turbo implements both the traditional libjpeg API as well as the less powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features colorspace extensions that allow it to compress from/decompress to 32-bit and big-endian pixel buffers (RGBX, XBGR, etc.), as well as a full-featured Java interface.
 
diff --git a/release/Welcome.rtf b/release/Welcome.rtf.in
similarity index 91%
rename from release/Welcome.rtf
rename to release/Welcome.rtf.in
index a570c5b..6bec24d 100644
--- a/release/Welcome.rtf
+++ b/release/Welcome.rtf.in
@@ -9,9 +9,9 @@
 \
 \pard\pardeftab720\ql\qnatural
 
-\f1 \cf0   /opt/libjpeg-turbo/bin/uninstall\
+\f1 \cf0   @CMAKE_INSTALL_FULL_BINDIR@/uninstall\
 \pard\pardeftab720\ql\qnatural
 
 \f0 \cf0 \
 from the command line.\
-}
\ No newline at end of file
+}
diff --git a/release/deb-control.in b/release/deb-control.in
index b82bdac..b59f8f9 100644
--- a/release/deb-control.in
+++ b/release/deb-control.in
@@ -10,12 +10,12 @@
 Description: A SIMD-accelerated JPEG codec that provides both the libjpeg and TurboJPEG APIs
  libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
  baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
- MIPS systems, as well as progressive JPEG compression on x86 and x86-64
- systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
- all else being equal.  On other types of systems, libjpeg-turbo can still
- outperform libjpeg by a significant amount, by virtue of its highly-optimized
- Huffman coding routines.  In many cases, the performance of libjpeg-turbo
- rivals that of proprietary high-speed JPEG codecs.
+ MIPS systems, as well as progressive JPEG compression on x86, x86-64, and
+ Armv8 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as
+ libjpeg, all else being equal.  On other types of systems, libjpeg-turbo can
+ still outperform libjpeg by a significant amount, by virtue of its
+ highly-optimized Huffman coding routines.  In many cases, the performance of
+ libjpeg-turbo rivals that of proprietary high-speed JPEG codecs.
  .
  libjpeg-turbo implements both the traditional libjpeg API as well as the less
  powerful but more straightforward TurboJPEG API.  libjpeg-turbo also features
diff --git a/release/installer.nsi.in b/release/installer.nsi.in
index 44419fa..65db63d 100644
--- a/release/installer.nsi.in
+++ b/release/installer.nsi.in
@@ -71,6 +71,11 @@
 	SetOutPath $INSTDIR\lib\pkgconfig
 	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libjpeg.pc"
 	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\libturbojpeg.pc"
+	SetOutPath $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\@CMAKE_PROJECT_NAME@Config.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\pkgscripts\@CMAKE_PROJECT_NAME@ConfigVersion.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\win\@CMAKE_PROJECT_NAME@Targets.cmake"
+	File "@CMAKE_CURRENT_BINARY_DIR@\win\@CMAKE_PROJECT_NAME@Targets-release.cmake"
 !ifdef JAVA
 	SetOutPath $INSTDIR\classes
 	File "@CMAKE_CURRENT_BINARY_DIR@\java\turbojpeg.jar"
@@ -141,6 +146,10 @@
 !endif
 	Delete $INSTDIR\lib\pkgconfig\libjpeg.pc
 	Delete $INSTDIR\lib\pkgconfig\libturbojpeg.pc
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Config.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@ConfigVersion.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Targets.cmake
+	Delete $INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@\@CMAKE_PROJECT_NAME@Targets-release.cmake
 !ifdef JAVA
 	Delete $INSTDIR\classes\turbojpeg.jar
 !endif
@@ -176,6 +185,8 @@
 
 	RMDir "$INSTDIR\include"
 	RMDir "$INSTDIR\lib\pkgconfig"
+	RMDir "$INSTDIR\lib\cmake\@CMAKE_PROJECT_NAME@"
+	RMDir "$INSTDIR\lib\cmake"
 	RMDir "$INSTDIR\lib"
 	RMDir "$INSTDIR\doc"
 !ifdef GCC
diff --git a/release/makecygwinpkg.in b/release/makecygwinpkg.in
deleted file mode 100755
index b7f353e..0000000
--- a/release/makecygwinpkg.in
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/sh
-
-set -u
-set -e
-trap onexit INT
-trap onexit TERM
-trap onexit EXIT
-
-TMPDIR=
-
-onexit()
-{
-	if [ ! "$TMPDIR" = "" ]; then
-		rm -rf $TMPDIR
-	fi
-}
-
-safedirmove ()
-{
-	if [ "$1" = "$2" ]; then
-		return 0
-	fi
-	if [ "$1" = "" -o ! -d "$1" ]; then
-		echo safedirmove: source dir $1 is not valid
-		return 1
-	fi
-	if [ "$2" = "" -o -e "$2" ]; then
-		echo safedirmove: dest dir $2 is not valid
-		return 1
-	fi
-	if [ "$3" = "" -o -e "$3" ]; then
-		echo safedirmove: tmp dir $3 is not valid
-		return 1
-	fi
-	mkdir -p $3
-	mv $1/* $3/
-	rmdir $1
-	mkdir -p $2
-	mv $3/* $2/
-	rmdir $3
-	return 0
-}
-
-PKGNAME=@PKGNAME@
-VERSION=@VERSION@
-BUILD=@BUILD@
-
-PREFIX=@CMAKE_INSTALL_PREFIX@
-DOCDIR=@CMAKE_INSTALL_FULL_DOCDIR@
-LIBDIR=@CMAKE_INSTALL_FULL_LIBDIR@
-
-umask 022
-rm -f $PKGNAME-$VERSION-$BUILD.tar.bz2
-TMPDIR=`mktemp -d /tmp/ljtbuild.XXXXXX`
-__PWD=`pwd`
-make install DESTDIR=$TMPDIR/pkg
-if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$DOCDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/doc" ]; then
-	safedirmove $TMPDIR/pkg$DOCDIR $TMPDIR/pkg/usr/share/doc/$PKGNAME-$VERSION $TMPDIR/__tmpdoc
-	ln -fs /usr/share/doc/$PKGNAME-$VERSION $TMPDIR/pkg$DOCDIR
-fi
-cd $TMPDIR/pkg
-tar cfj ../$PKGNAME-$VERSION-$BUILD.tar.bz2 *
-cd $__PWD
-mv $TMPDIR/*.tar.bz2 .
-
-exit 0
diff --git a/release/makemacpkg.in b/release/makemacpkg.in
index ae80bec..99c6e05 100755
--- a/release/makemacpkg.in
+++ b/release/makemacpkg.in
@@ -43,19 +43,14 @@
 
 usage()
 {
-	echo "$0 [universal] [-lipo [path to lipo]]"
+	echo "$0 [-lipo [path to lipo]]"
 	exit 1
 }
 
-UNIVERSAL=0
-
 PKGNAME=@PKGNAME@
 VERSION=@VERSION@
 BUILD=@BUILD@
 SRCDIR=@CMAKE_CURRENT_SOURCE_DIR@
-BUILDDIR32=@OSX_32BIT_BUILD@
-BUILDDIRARMV7=@IOS_ARMV7_BUILD@
-BUILDDIRARMV7S=@IOS_ARMV7S_BUILD@
 BUILDDIRARMV8=@IOS_ARMV8_BUILD@
 WITH_JAVA=@WITH_JAVA@
 OSX_APP_CERT_NAME="@OSX_APP_CERT_NAME@"
@@ -82,9 +77,6 @@
 			fi
 		fi
 		;;
-	universal)
-		UNIVERSAL=1
-		;;
 	esac
 	shift
 done
@@ -106,61 +98,6 @@
 	ln -fs /Library/Documentation/$PKGNAME $PKGROOT$DOCDIR
 fi
 
-if [ $UNIVERSAL = 1 -a "$BUILDDIR32" != "" ]; then
-	if [ ! -d $BUILDDIR32 ]; then
-		echo ERROR: 32-bit build directory $BUILDDIR32 does not exist
-		exit 1
-	fi
-	if [ ! -f $BUILDDIR32/Makefile ]; then
-		echo ERROR: 32-bit build directory $BUILDDIR32 is not configured
-		exit 1
-	fi
-	mkdir -p $TMPDIR/dist.x86
-	pushd $BUILDDIR32
-	make install DESTDIR=$TMPDIR/dist.x86
-	popd
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$LIBJPEG_DSO_NAME \
-		-arch x86_64 $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME \
-		-output $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libjpeg.a \
-		-arch x86_64 $PKGROOT/$LIBDIR/libjpeg.a \
-		-output $PKGROOT/$LIBDIR/libjpeg.a
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/$TURBOJPEG_DSO_NAME \
-		-arch x86_64 $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME \
-		-output $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$LIBDIR/libturbojpeg.a \
-		-arch x86_64 $PKGROOT/$LIBDIR/libturbojpeg.a \
-		-output $PKGROOT/$LIBDIR/libturbojpeg.a
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/cjpeg \
-		-arch x86_64 $PKGROOT/$BINDIR/cjpeg \
-		-output $PKGROOT/$BINDIR/cjpeg
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/djpeg \
-		-arch x86_64 $PKGROOT/$BINDIR/djpeg \
-		-output $PKGROOT/$BINDIR/djpeg
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/jpegtran \
-		-arch x86_64 $PKGROOT/$BINDIR/jpegtran \
-		-output $PKGROOT/$BINDIR/jpegtran
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/tjbench \
-		-arch x86_64 $PKGROOT/$BINDIR/tjbench \
-		-output $PKGROOT/$BINDIR/tjbench
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/rdjpgcom \
-		-arch x86_64 $PKGROOT/$BINDIR/rdjpgcom \
-		-output $PKGROOT/$BINDIR/rdjpgcom
-	$LIPO -create \
-		-arch i386 $TMPDIR/dist.x86/$BINDIR/wrjpgcom \
-		-arch x86_64 $PKGROOT/$BINDIR/wrjpgcom \
-		-output $PKGROOT/$BINDIR/wrjpgcom
-fi
-
 install_ios()
 {
 	BUILDDIR=$1
@@ -222,28 +159,14 @@
 		-output $PKGROOT/$BINDIR/wrjpgcom
 }
 
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7" != "" ]; then
-	install_ios $BUILDDIRARMV7 Armv7 armv7 arm
-fi
-
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV7S" != "" ]; then
-	install_ios $BUILDDIRARMV7S Armv7s armv7s arm
-fi
-
-if [ $UNIVERSAL = 1 -a "$BUILDDIRARMV8" != "" ]; then
+if [ "$BUILDDIRARMV8" != "" ]; then
 	install_ios $BUILDDIRARMV8 Armv8 armv8 arm64
 fi
 
 install_name_tool -id $LIBDIR/$LIBJPEG_DSO_NAME $PKGROOT/$LIBDIR/$LIBJPEG_DSO_NAME
 install_name_tool -id $LIBDIR/$TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/$TURBOJPEG_DSO_NAME
 
-if [ $WITH_JAVA = 1 ]; then
-	ln -fs $TURBOJPEG_DSO_NAME $PKGROOT/$LIBDIR/libturbojpeg.jnilib
-fi
 if [ "$PREFIX" = "@CMAKE_INSTALL_DEFAULT_PREFIX@" -a "$LIBDIR" = "@CMAKE_INSTALL_DEFAULT_PREFIX@/lib" ]; then
-	if [ ! -h $PKGROOT/$PREFIX/lib32 ]; then
-		ln -fs lib $PKGROOT/$PREFIX/lib32
-	fi
 	if [ ! -h $PKGROOT/$PREFIX/lib64 ]; then
 		ln -fs lib $PKGROOT/$PREFIX/lib64
 	fi
@@ -255,7 +178,7 @@
 
 find $PKGROOT -type f | while read file; do xattr -c $file; done
 
-cp $SRCDIR/release/License.rtf $SRCDIR/release/Welcome.rtf $SRCDIR/release/ReadMe.txt $TMPDIR/pkg/
+cp $SRCDIR/release/License.rtf pkgscripts/Welcome.rtf $SRCDIR/release/ReadMe.txt $TMPDIR/pkg/
 
 mkdir $TMPDIR/dmg
 pkgbuild --root $PKGROOT --version $VERSION.$BUILD --identifier @PKGID@ \
diff --git a/release/rpm.spec.in b/release/rpm.spec.in
index f8db764..5a23944 100644
--- a/release/rpm.spec.in
+++ b/release/rpm.spec.in
@@ -53,7 +53,7 @@
 %description
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Armv8
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
@@ -185,6 +185,9 @@
 %endif
 %dir %{_libdir}/pkgconfig
 %{_libdir}/pkgconfig/libjpeg.pc
+%dir %{_libdir}/cmake
+%dir %{_libdir}/cmake/@CMAKE_PROJECT_NAME@
+%{_libdir}/cmake/@CMAKE_PROJECT_NAME@
 %if "%{_with_turbojpeg}" == "1"
  %if "%{_enable_shared}" == "1" || "%{_with_java}" == "1"
   %{_libdir}/libturbojpeg.so.@TURBOJPEG_SO_VERSION@
diff --git a/release/uninstall.in b/release/uninstall.in
index cf1ba77..34757b0 100644
--- a/release/uninstall.in
+++ b/release/uninstall.in
@@ -1,4 +1,5 @@
-# Copyright (C)2009-2011, 2013, 2016 D. R. Commander.  All Rights Reserved.
+# Copyright (C)2009-2011, 2013, 2016, 2020 D. R. Commander.
+#                                          All Rights Reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -70,6 +71,12 @@
 if [ -d $LIBDIR/pkgconfig ]; then
 	rmdir $LIBDIR/pkgconfig 2>&1 || EXITSTATUS=-1
 fi
+if [ -d $LIBDIR/cmake/@CMAKE_PROJECT_NAME@ ]; then
+	rmdir $LIBDIR/cmake/@CMAKE_PROJECT_NAME@ || EXITSTATUS=-1
+fi
+if [ -d $LIBDIR/cmake ]; then
+	rmdir $LIBDIR/cmake || EXITSTATUS=-1
+fi
 if [ -d $LIBDIR ]; then
 	rmdir $LIBDIR 2>&1 || EXITSTATUS=-1
 fi
@@ -90,7 +97,7 @@
 if [ -d $MANDIR ]; then
 	rmdir $MANDIR 2>&1 || EXITSTATUS=-1
 fi
-if [ -d $JAVADIR ]; then
+if [ -d "$JAVADIR" ]; then
 	rmdir $JAVADIR 2>&1 || EXITSTATUS=-1
 fi
 if [ -d $DATAROOTDIR -a "$DATAROOTDIR" != "$PREFIX" ]; then
diff --git a/sharedlib/CMakeLists.txt b/sharedlib/CMakeLists.txt
index 8d65e58..78a2f28 100644
--- a/sharedlib/CMakeLists.txt
+++ b/sharedlib/CMakeLists.txt
@@ -88,10 +88,13 @@
 add_executable(jcstest ../jcstest.c)
 target_link_libraries(jcstest jpeg)
 
-install(TARGETS jpeg cjpeg djpeg jpegtran
+install(TARGETS jpeg EXPORT ${CMAKE_PROJECT_NAME}Targets
+  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(TARGETS cjpeg djpeg jpegtran
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
   CMAKE_C_LINKER_SUPPORTS_PDB)
   install(FILES "$<TARGET_PDB_FILE:jpeg>"
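
The hunk above splits the old combined install() call: the jpeg library
target is now installed with EXPORT ${CMAKE_PROJECT_NAME}Targets, which feeds
the libjpeg-turboConfig.cmake / Targets.cmake files that the installer and
RPM spec now ship, while cjpeg/djpeg/jpegtran keep a plain RUNTIME install.
For context only, here is a minimal, hypothetical consumer program -- the
kind a downstream project would build after locating the exported package --
using nothing but the public libjpeg API:

    /* consumer.c -- hypothetical smoke test for a project that locates the
     * installed package and links the exported jpeg target.  Only standard
     * libjpeg API calls are used. */
    #include <stdio.h>
    #include <jpeglib.h>

    int main(int argc, char **argv)
    {
      struct jpeg_decompress_struct cinfo;
      struct jpeg_error_mgr jerr;
      FILE *infile;

      if (argc < 2 || (infile = fopen(argv[1], "rb")) == NULL)
        return 1;

      cinfo.err = jpeg_std_error(&jerr);
      jpeg_create_decompress(&cinfo);
      jpeg_stdio_src(&cinfo, infile);
      jpeg_read_header(&cinfo, TRUE);     /* parse the headers only */
      printf("%u x %u, %d components\n", cinfo.image_width,
             cinfo.image_height, cinfo.num_components);
      jpeg_destroy_decompress(&cinfo);
      fclose(infile);
      return 0;
    }
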
diff --git a/simd/CMakeLists.txt b/simd/CMakeLists.txt
index ba0bd13..ce304ae 100644
--- a/simd/CMakeLists.txt
+++ b/simd/CMakeLists.txt
@@ -30,6 +30,9 @@
   if(CYGWIN)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
   endif()
+  if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
+  endif()
 elseif(CPU_TYPE STREQUAL "i386")
   if(BORLAND)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
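
The new CMAKE_C_COMPILER_ABI branch above makes NASM emit elfx32 objects when
the C compiler targets the x32 ABI, i.e. the -mx32 configuration that the
updated .travis.yml now exercises (x86-64 instructions with 32-bit pointers).
As a point of reference only, the same condition can be recognized from
compiler-predefined macros in C; this is illustrative and not part of the
build system:

    /* x32-probe.c -- illustrative only: reports which x86 ABI this
     * translation unit targets.  GCC and Clang define __x86_64__ for both
     * the LP64 and x32 ABIs and additionally define __ILP32__ for x32. */
    #include <stdio.h>

    int main(void)
    {
    #if defined(__x86_64__) && defined(__ILP32__)
      puts("x32 ABI: 64-bit instruction set, 32-bit pointers (elfx32)");
    #elif defined(__x86_64__)
      puts("x86-64 LP64 ABI (elf64)");
    #elif defined(__i386__)
      puts("ia32 ABI (elf32)");
    #else
      puts("not an x86 target");
    #endif
      printf("sizeof(void *) = %u\n", (unsigned)sizeof(void *));
      return 0;
    }
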
@@ -311,14 +314,35 @@
 endif()
 
 ###############################################################################
-# Loongson (Intrinsics)
+# MIPS64 (Intrinsics)
 ###############################################################################
 
-elseif(CPU_TYPE STREQUAL "loongson")
+elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "mips64*")
 
-set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
-  loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
-  loongson/jidctint-mmi.c loongson/jquanti-mmi.c)
+set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
+
+check_c_source_compiles("
+  int main(void) {
+    int c = 0, a = 0, b = 0;
+    asm (
+      \"paddb %0, %1, %2\"
+      : \"=f\" (c)
+      : \"f\" (a), \"f\" (b)
+    );
+    return c;
+  }" HAVE_MMI)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_MMI)
+  simd_fail("SIMD extensions not available for this CPU")
+  return()
+endif()
+
+set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
+  mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
+  mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
+  mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
 
 if(CMAKE_COMPILER_IS_GNUCC)
   foreach(file ${SIMD_SOURCES})
@@ -326,8 +350,12 @@
       " -fno-strict-aliasing")
   endforeach()
 endif()
+foreach(file ${SIMD_SOURCES})
+  set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+    " -Wa,-mloongson-mmi,-mloongson-ext")
+endforeach()
 
-add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
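
The rewritten branch above now matches generic mips64* CPU types as well as
loongson, and it gates the MMI sources on a check_c_source_compiles() probe
run with -Wa,-mloongson-mmi,-mloongson-ext.  That probe is just the C program
below; compiling it by hand with the same assembler flags is a quick way to
check whether a given MIPS64 toolchain can assemble the Loongson MMI
instructions at all:

    /* mmi-probe.c -- standalone copy of the HAVE_MMI compile test above.
     * Try:  cc -Wa,-mloongson-mmi,-mloongson-ext -c mmi-probe.c
     * If the assembler rejects paddb, the mips64/ MMI sources cannot be
     * built and simd_fail() above disables the SIMD extensions. */
    int main(void)
    {
      int c = 0, a = 0, b = 0;
      asm (
        "paddb %0, %1, %2"
        : "=f" (c)
        : "f" (a), "f" (b)
      );
      return c;
    }
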
diff --git a/simd/arm64/jsimd.c b/simd/arm64/jsimd.c
index 808c0e3..90c3803 100644
--- a/simd/arm64/jsimd.c
+++ b/simd/arm64/jsimd.c
@@ -22,6 +22,7 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
+#include "jconfigint.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -773,6 +774,18 @@
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -781,11 +794,25 @@
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -794,5 +821,7 @@
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
 }
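
The two "can" functions above gate the NEON progressive-encoding helpers on
the same assumptions the assembly relies on: 8x8 DCT blocks, 16-bit JCOEF,
and a 64-bit size_t for the bitmap outputs.  For readers who do not want to
trace the NEON source below, here is a rough scalar rendition of what
jsimd_encode_mcu_AC_first_prepare_neon() hands to the Huffman stage.  It is
pieced together from the assembly and is not the library's actual C fallback:

    /* first-prepare-sketch.c -- illustrative only; assumes 64-bit size_t. */
    #include <stddef.h>
    #include <stdlib.h>

    #define DCTSIZE2 64
    typedef short JCOEF;

    static void ac_first_prepare_sketch(const JCOEF *block, const int *order,
                                        int Sl, int Al, JCOEF *values,
                                        size_t *zerobits)
    {
      size_t nonzero = 0;
      int k;

      for (k = 0; k < Sl; k++) {
        JCOEF t = block[order[k]];
        JCOEF a = (JCOEF)(abs(t) >> Al);  /* point-transformed magnitude */
        values[k] = a;
        /* The second half of values[] holds the magnitude for positive
         * coefficients and its bitwise complement for negative ones, which
         * the encoder uses when emitting sign/value bits. */
        values[k + DCTSIZE2] = (JCOEF)((t < 0) ? ~a : a);
        if (a)
          nonzero |= (size_t)1 << k;      /* bit k set when values[k] != 0 */
      }
      for (; k < DCTSIZE2; k++)
        values[k] = values[k + DCTSIZE2] = 0;

      *zerobits = nonzero;                /* bitmap consumed by the encoder */
    }
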
diff --git a/simd/arm64/jsimd_neon.S b/simd/arm64/jsimd_neon.S
index 3ed5f58..0a2c359 100644
--- a/simd/arm64/jsimd_neon.S
+++ b/simd/arm64/jsimd_neon.S
@@ -278,6 +278,20 @@
     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
 
+/* Constants for jsimd_encode_mcu_AC_first_prepare_neon() */
+
+.balign 16
+Ljsimd_encode_mcu_AC_first_prepare_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+
+/* Constants for jsimd_encode_mcu_AC_refine_prepare_neon() */
+
+.balign 16
+Ljsimd_encode_mcu_AC_refine_prepare_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+
 .text
 
 
@@ -3433,3 +3447,625 @@
 .purgem put_bits
 .purgem checkbuf31
 .purgem checkbuf47
+
+
+/*****************************************************************************/
+
+/*
+ * Macros to load data for jsimd_encode_mcu_AC_first_prepare_neon() and
+ * jsimd_encode_mcu_AC_refine_prepare_neon()
+ */
+
+.macro LOAD16
+    ldr             T0d, [LUT, #(0*4)]
+    ldr             T1d, [LUT, #(8*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[0], [T0]
+    ld1             {Y1.h}[0], [T1]
+
+    ldr             T0d, [LUT, #(1*4)]
+    ldr             T1d, [LUT, #(9*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[1], [T0]
+    ld1             {Y1.h}[1], [T1]
+
+    ldr             T0d, [LUT, #(2*4)]
+    ldr             T1d, [LUT, #(10*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[2], [T0]
+    ld1             {Y1.h}[2], [T1]
+
+    ldr             T0d, [LUT, #(3*4)]
+    ldr             T1d, [LUT, #(11*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[3], [T0]
+    ld1             {Y1.h}[3], [T1]
+
+    ldr             T0d, [LUT, #(4*4)]
+    ldr             T1d, [LUT, #(12*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[4], [T0]
+    ld1             {Y1.h}[4], [T1]
+
+    ldr             T0d, [LUT, #(5*4)]
+    ldr             T1d, [LUT, #(13*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[5], [T0]
+    ld1             {Y1.h}[5], [T1]
+
+    ldr             T0d, [LUT, #(6*4)]
+    ldr             T1d, [LUT, #(14*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[6], [T0]
+    ld1             {Y1.h}[6], [T1]
+
+    ldr             T0d, [LUT, #(7*4)]
+    ldr             T1d, [LUT, #(15*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[7], [T0]
+    ld1             {Y1.h}[7], [T1]
+
+    add             LUT, LUT, #(16*4)
+.endm
+
+.macro LOAD15
+    eor             Y1.16b, Y1.16b, Y1.16b
+
+    ldr             T0d, [LUT, #(0*4)]
+    ldr             T1d, [LUT, #(8*4)]
+    add             T0, BLOCK, T0, lsl #1
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[0], [T0]
+    ld1             {Y1.h}[0], [T1]
+
+    ldr             T0d, [LUT, #(1*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[1], [T0]
+
+    ldr             T0d, [LUT, #(2*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[2], [T0]
+
+    ldr             T0d, [LUT, #(3*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[3], [T0]
+
+    ldr             T0d, [LUT, #(4*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[4], [T0]
+
+    ldr             T0d, [LUT, #(5*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[5], [T0]
+
+    ldr             T0d, [LUT, #(6*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[6], [T0]
+
+    ldr             T0d, [LUT, #(7*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[7], [T0]
+
+    cmp             LENEND, #2
+    b.lt            1515f
+    ldr             T1d, [LUT, #(9*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[1], [T1]
+
+    cmp             LENEND, #3
+    b.lt            1515f
+    ldr             T1d, [LUT, #(10*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[2], [T1]
+
+    cmp             LENEND, #4
+    b.lt            1515f
+    ldr             T1d, [LUT, #(11*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[3], [T1]
+
+    cmp             LENEND, #5
+    b.lt            1515f
+    ldr             T1d, [LUT, #(12*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[4], [T1]
+
+    cmp             LENEND, #6
+    b.lt            1515f
+    ldr             T1d, [LUT, #(13*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[5], [T1]
+
+    cmp             LENEND, #7
+    b.lt            1515f
+    ldr             T1d, [LUT, #(14*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y1.h}[6], [T1]
+
+1515:
+.endm
+
+.macro LOAD8
+    ldr             T0d, [LUT, #(0*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[0], [T0]
+
+    ldr             T0d, [LUT, #(1*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[1], [T0]
+
+    ldr             T0d, [LUT, #(2*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[2], [T0]
+
+    ldr             T0d, [LUT, #(3*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[3], [T0]
+
+    ldr             T0d, [LUT, #(4*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[4], [T0]
+
+    ldr             T0d, [LUT, #(5*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[5], [T0]
+
+    ldr             T0d, [LUT, #(6*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[6], [T0]
+
+    ldr             T0d, [LUT, #(7*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[7], [T0]
+.endm
+
+.macro LOAD7
+    eor             Y0.16b, Y0.16b, Y0.16b
+
+    ldr             T0d, [LUT, #(0*4)]
+    add             T0, BLOCK, T0, lsl #1
+    ld1             {Y0.h}[0], [T0]
+
+    cmp             LENEND, #2
+    b.lt            77f
+    ldr             T1d, [LUT, #(1*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[1], [T1]
+
+    cmp             LENEND, #3
+    b.lt            77f
+    ldr             T1d, [LUT, #(2*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[2], [T1]
+
+    cmp             LENEND, #4
+    b.lt            77f
+    ldr             T1d, [LUT, #(3*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[3], [T1]
+
+    cmp             LENEND, #5
+    b.lt            77f
+    ldr             T1d, [LUT, #(4*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[4], [T1]
+
+    cmp             LENEND, #6
+    b.lt            77f
+    ldr             T1d, [LUT, #(5*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[5], [T1]
+
+    cmp             LENEND, #7
+    b.lt            77f
+    ldr             T1d, [LUT, #(6*4)]
+    add             T1, BLOCK, T1, lsl #1
+    ld1             {Y0.h}[6], [T1]
+
+77:
+.endm
+
+.macro REDUCE0
+    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [VALUES], #64
+    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [VALUES], #64
+
+    cmeq            v0.8h, v0.8h, #0
+    cmeq            v1.8h, v1.8h, #0
+    cmeq            v2.8h, v2.8h, #0
+    cmeq            v3.8h, v3.8h, #0
+    cmeq            v4.8h, v4.8h, #0
+    cmeq            v5.8h, v5.8h, #0
+    cmeq            v6.8h, v6.8h, #0
+    cmeq            v7.8h, v7.8h, #0
+
+    xtn             v0.8b, v0.8h
+    xtn             v2.8b, v2.8h
+    xtn             v4.8b, v4.8h
+    xtn             v6.8b, v6.8h
+    xtn2            v0.16b, v1.8h
+    xtn2            v2.16b, v3.8h
+    xtn2            v4.16b, v5.8h
+    xtn2            v6.16b, v7.8h
+
+    and             v0.16b, v0.16b, ANDMASK.16b
+    and             v2.16b, v2.16b, ANDMASK.16b
+    and             v4.16b, v4.16b, ANDMASK.16b
+    and             v6.16b, v6.16b, ANDMASK.16b
+    addp            v0.16b, v0.16b, v2.16b
+    addp            v4.16b, v4.16b, v6.16b
+    addp            v0.16b, v0.16b, v4.16b
+    addp            v0.16b, v0.16b, v0.16b
+    umov            T0, v0.D[0]
+    mvn             T0, T0
+    str             T0, [BITS]
+.endm
+
+/*
+ * Prepare data for jsimd_encode_mcu_AC_first().
+ *
+ * GLOBAL(int)
+ * jsimd_encode_mcu_AC_first_prepare_neon(const JCOEF *block,
+ *                                        const int *jpeg_natural_order_start,
+ *                                        int Sl, int Al, JCOEF *values,
+ *                                        size_t *zerobits)
+ *
+ * x0 = const JCOEF *block
+ * x1 = const int *jpeg_natural_order_start
+ * w2 = int Sl
+ * w3 = int Al
+ * x4 = JCOEF *values
+ * x5 = size_t *zerobits
+ *
+ */
+
+    ZERO            .req v0
+    Y0              .req v2
+    Y1              .req v3
+    N0              .req v4
+    N1              .req v5
+    AL              .req v6
+    ANDMASK         .req v20
+    K               .req w12
+    LUT             .req x1
+    T0              .req x10
+    T0d             .req w10
+    T1              .req x11
+    T1d             .req w11
+    BLOCK           .req x0
+    VALUES          .req x4
+    XORVALUES       .req x14
+    LEN             .req w2
+    LENEND          .req w9
+    BITS            .req x5
+
+asm_function jsimd_encode_mcu_AC_first_prepare_neon
+    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_first_prepare_neon_consts
+    neg             w3, w3                        /* Al = -Al */
+    eor             ZERO.16b, ZERO.16b, ZERO.16b
+    ld1             {ANDMASK.16b}, [T0]
+    dup             AL.8h, w3
+    add             XORVALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
+    and             LENEND, LEN, 7
+    lsr             K, LEN, 4
+    cbz             K, 3f
+1:
+    LOAD16
+    cmlt            N0.8h, Y0.8h, #0
+    cmlt            N1.8h, Y1.8h, #0
+    abs             Y0.8h, Y0.8h
+    abs             Y1.8h, Y1.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    ushl            Y1.8h, Y1.8h, AL.8h
+    eor             N0.16b, N0.16b, Y0.16b
+    eor             N1.16b, N1.16b, Y1.16b
+    st1             {Y0.8h, Y1.8h}, [VALUES], #32
+    st1             {N0.8h, N1.8h}, [XORVALUES], #32
+    subs            K, K, #1
+    b.ne            1b
+3:
+    tst             LEN, #8
+    b.eq            3f
+    tst             LEN, #7
+    b.eq            2f
+
+    LOAD15
+    cmlt            N0.8h, Y0.8h, #0
+    cmlt            N1.8h, Y1.8h, #0
+    abs             Y0.8h, Y0.8h
+    abs             Y1.8h, Y1.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    ushl            Y1.8h, Y1.8h, AL.8h
+    eor             N0.16b, N0.16b, Y0.16b
+    eor             N1.16b, N1.16b, Y1.16b
+    st1             {Y0.8h, Y1.8h}, [VALUES], #32
+    st1             {N0.8h, N1.8h}, [XORVALUES], #32
+    b               4f
+2:
+    LOAD8
+    cmlt            N0.8h, Y0.8h, #0
+    abs             Y0.8h, Y0.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    eor             N0.16b, N0.16b, Y0.16b
+    st1             {Y0.8h}, [VALUES], #16
+    st1             {N0.8h}, [XORVALUES], #16
+    b               4f
+3:
+    cbz             LENEND, 4f
+    LOAD7
+    cmlt            N0.8h, Y0.8h, #0
+    abs             Y0.8h, Y0.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    eor             N0.16b, N0.16b, Y0.16b
+    st1             {Y0.8h}, [VALUES], #16
+    st1             {N0.8h}, [XORVALUES], #16
+    /* b               4f */
+    /* fallthrough */
+4:
+    add             K, LEN, #7
+    lsr             K, K, #3
+    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
+    b.eq            5f
+1:
+    st1             {ZERO.8h}, [VALUES], #16
+    st1             {ZERO.8h}, [XORVALUES], #16
+    adds            K, K, #1
+    b.ne            1b
+5:
+    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
+
+    REDUCE0
+
+    br              x30
+
+    .unreq          ZERO
+    .unreq          Y0
+    .unreq          Y1
+    .unreq          N0
+    .unreq          N1
+    .unreq          AL
+    .unreq          ANDMASK
+    .unreq          K
+    .unreq          LUT
+    .unreq          T0
+    .unreq          T0d
+    .unreq          T1
+    .unreq          T1d
+    .unreq          BLOCK
+    .unreq          VALUES
+    .unreq          XORVALUES
+    .unreq          LEN
+    .unreq          LENEND
+    .unreq          BITS
+
+/*
+ * Prepare data for jsimd_encode_mcu_AC_refine.
+ *
+ * GLOBAL(int)
+ * jsimd_encode_mcu_AC_refine_prepare_neon(const JCOEF *block,
+ *                                         const int *jpeg_natural_order_start,
+ *                                         int Sl, int Al, JCOEF *absvalues,
+ *                                         size_t *bits)
+ *
+ * x0 = const JCOEF *block
+ * x1 = const int *jpeg_natural_order_start
+ * w2 = int Sl
+ * w3 = int Al
+ * x4 = JCOEF *absvalues
+ * x5 = size_t *bits
+ *
+ */
+
+    ZERO            .req v0
+    ONE             .req v1
+    Y0              .req v2
+    Y1              .req v3
+    N0              .req v4
+    N1              .req v5
+    AL              .req v6
+    ANDMASK         .req v20
+    K               .req w12
+    KK              .req w13
+    EOB             .req w14
+    SIGN            .req x15
+    LUT             .req x1
+    T0              .req x10
+    T0d             .req w10
+    T1              .req x11
+    T1d             .req w11
+    BLOCK           .req x0
+    VALUES          .req x4
+    LEN             .req w2
+    LENEND          .req w9
+    BITS            .req x5
+
+asm_function jsimd_encode_mcu_AC_refine_prepare_neon
+    get_symbol_loc  T0, Ljsimd_encode_mcu_AC_refine_prepare_neon_consts
+    neg             w3, w3                        /* Al = -Al */
+    movi            ONE.8h, #1
+    eor             SIGN, SIGN, SIGN
+    eor             ZERO.16b, ZERO.16b, ZERO.16b
+    eor             EOB, EOB, EOB
+    ld1             {ANDMASK.16b}, [T0]
+    eor             KK, KK, KK
+    dup             AL.8h, w3
+    and             LENEND, LEN, 7
+    lsr             K, LEN, 4
+    cbz             K, 3f
+1:
+    LOAD16
+    cmlt            N0.8h, Y0.8h, #0
+    cmlt            N1.8h, Y1.8h, #0
+    abs             Y0.8h, Y0.8h
+    abs             Y1.8h, Y1.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    ushl            Y1.8h, Y1.8h, AL.8h
+    st1             {Y0.8h, Y1.8h}, [VALUES], #32
+    xtn             N0.8b, N0.8h
+    xtn             N1.8b, N1.8h
+    cmeq            Y0.8h, Y0.8h, ONE.8h
+    cmeq            Y1.8h, Y1.8h, ONE.8h
+    xtn             Y0.8b, Y0.8h
+    xtn             Y1.8b, Y1.8h
+    and             N0.8b, N0.8b, ANDMASK.8b
+    and             N1.8b, N1.8b, ANDMASK.8b
+    and             Y0.8b, Y0.8b, ANDMASK.8b
+    and             Y1.8b, Y1.8b, ANDMASK.8b
+    addv            B28, N0.8b
+    addv            B29, N1.8b
+    addv            B30, Y0.8b
+    addv            B31, Y1.8b
+    ins             v28.b[1], v29.b[0]
+    ins             v30.b[1], v31.b[0]
+    umov            T0d, v28.h[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
+    umov            T1d, v30.h[0]    /* idx = _mm_movemask_epi8(x1); */
+    lsr             SIGN, SIGN, #16  /* make room for sizebits */
+    orr             SIGN, SIGN, T0, lsl #48
+    cbz             T1d, 2f
+    rbit            T1d, T1d
+    clz             T1d, T1d
+    add             EOB, KK, T1d     /* EOB = k + idx; */
+2:
+    add             KK, KK, #16
+    subs            K, K, #1
+    b.ne            1b
+3:
+    tst             LEN, #8
+    b.eq            3f
+    tst             LEN, #7
+    b.eq            2f
+
+    LOAD15
+    cmlt            N0.8h, Y0.8h, #0
+    cmlt            N1.8h, Y1.8h, #0
+    abs             Y0.8h, Y0.8h
+    abs             Y1.8h, Y1.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    ushl            Y1.8h, Y1.8h, AL.8h
+    st1             {Y0.8h, Y1.8h}, [VALUES], #32
+    xtn             N0.8b, N0.8h
+    xtn             N1.8b, N1.8h
+    cmeq            Y0.8h, Y0.8h, ONE.8h
+    cmeq            Y1.8h, Y1.8h, ONE.8h
+    xtn             Y0.8b, Y0.8h
+    xtn             Y1.8b, Y1.8h
+    and             N0.8b, N0.8b, ANDMASK.8b
+    and             N1.8b, N1.8b, ANDMASK.8b
+    and             Y0.8b, Y0.8b, ANDMASK.8b
+    and             Y1.8b, Y1.8b, ANDMASK.8b
+    addv            B28, N0.8b
+    addv            B29, N1.8b
+    addv            B30, Y0.8b
+    addv            B31, Y1.8b
+    ins             v28.b[1], v29.b[0]
+    ins             v30.b[1], v31.b[0]
+    umov            T0d, v28.h[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
+    umov            T1d, v30.h[0]    /* idx = _mm_movemask_epi8(x1); */
+    lsr             SIGN, SIGN, #16  /* make room for sizebits */
+    orr             SIGN, SIGN, T0, lsl #48
+    cbz             T1d, 4f
+    rbit            T1d, T1d
+    clz             T1d, T1d
+    add             EOB, KK, T1d     /* EOB = k + idx; */
+    b               4f
+2:
+    LOAD8
+    cmlt            N0.8h, Y0.8h, #0
+    abs             Y0.8h, Y0.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    st1             {Y0.8h}, [VALUES], #16
+    xtn             N0.8b, N0.8h
+    cmeq            Y0.8h, Y0.8h, ONE.8h
+    xtn             Y0.8b, Y0.8h
+    and             N0.8b, N0.8b, ANDMASK.8b
+    and             Y0.8b, Y0.8b, ANDMASK.8b
+    addv            B28, N0.8b
+    addv            B30, Y0.8b
+    umov            T0d, v28.b[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
+    umov            T1d, v30.b[0]    /* idx = _mm_movemask_epi8(x1); */
+    lsr             SIGN, SIGN, #8   /* make room for sizebits */
+    orr             SIGN, SIGN, T0, lsl #56
+    cbz             T1d, 4f
+    rbit            T1d, T1d
+    clz             T1d, T1d
+    add             EOB, KK, T1d     /* EOB = k + idx; */
+    b               4f
+3:
+    cbz             LENEND, 4f
+    LOAD7
+    cmlt            N0.8h, Y0.8h, #0
+    abs             Y0.8h, Y0.8h
+    ushl            Y0.8h, Y0.8h, AL.8h
+    st1             {Y0.8h}, [VALUES], #16
+    xtn             N0.8b, N0.8h
+    cmeq            Y0.8h, Y0.8h, ONE.8h
+    xtn             Y0.8b, Y0.8h
+    and             N0.8b, N0.8b, ANDMASK.8b
+    and             Y0.8b, Y0.8b, ANDMASK.8b
+    addv            B28, N0.8b
+    addv            B30, Y0.8b
+    umov            T0d, v28.b[0]    /* lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); */
+    umov            T1d, v30.b[0]    /* idx = _mm_movemask_epi8(x1); */
+    lsr             SIGN, SIGN, #8   /* make room for sizebits */
+    orr             SIGN, SIGN, T0, lsl #56
+    cbz             T1d, 4f
+    rbit            T1d, T1d
+    clz             T1d, T1d
+    add             EOB, KK, T1d     /* EOB = k + idx; */
+    /* b               4f */
+    /* fallthrough */
+4:
+    add             K, LEN, #7
+    lsr             K, K, #3
+    subs            K, K, #(/*DCTSIZE2*/ 64 / 8)
+    b.eq            5f
+1:
+    st1             {ZERO.8h}, [VALUES], #16
+    lsr             SIGN, SIGN, #8
+    adds            K, K, #1
+    b.ne            1b
+5:
+    mvn             SIGN, SIGN
+    sub             VALUES, VALUES, #(/*DCTSIZE2*/ 64 * 2)
+    str             SIGN, [BITS, #8]
+
+    REDUCE0
+
+    mov             w0, EOB
+    br              x30
+
+    .unreq          ZERO
+    .unreq          ONE
+    .unreq          Y0
+    .unreq          Y1
+    .unreq          N0
+    .unreq          N1
+    .unreq          AL
+    .unreq          ANDMASK
+    .unreq          K
+    .unreq          KK
+    .unreq          EOB
+    .unreq          SIGN
+    .unreq          LUT
+    .unreq          T0
+    .unreq          T0d
+    .unreq          T1
+    .unreq          T1d
+    .unreq          BLOCK
+    .unreq          VALUES
+    .unreq          LEN
+    .unreq          LENEND
+    .unreq          BITS
+
+.purgem LOAD16
+.purgem LOAD15
+.purgem LOAD8
+.purgem LOAD7
+.purgem REDUCE0
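
For the refinement pass, jsimd_encode_mcu_AC_refine_prepare_neon() above
writes the point-transformed magnitudes plus two 64-bit bitmaps -- bits[0]
from REDUCE0 and bits[1] from the SIGN accumulator -- and returns an EOB
position derived from the coefficients whose magnitude is exactly 1.  A
heavily simplified scalar reading of those outputs follows; the EOB
bookkeeping is deliberately omitted because its precise definition belongs to
the progressive encoder, not to this sketch:

    /* refine-prepare-sketch.c -- illustrative only; assumes 64-bit size_t. */
    #include <stddef.h>
    #include <stdlib.h>

    #define DCTSIZE2 64
    typedef short JCOEF;

    static void ac_refine_prepare_outputs(const JCOEF *block, const int *order,
                                          int Sl, int Al, JCOEF *absvalues,
                                          size_t *bits)
    {
      size_t nonzero = 0, negative = 0;
      int k;

      for (k = 0; k < Sl; k++) {
        JCOEF t = block[order[k]];
        JCOEF a = (JCOEF)(abs(t) >> Al);
        absvalues[k] = a;
        if (a)
          nonzero |= (size_t)1 << k;   /* REDUCE0: bit k set when nonzero */
        if (t < 0)
          negative |= (size_t)1 << k;  /* SIGN accumulator, before the mvn */
      }
      for (; k < DCTSIZE2; k++)
        absvalues[k] = 0;

      bits[0] = nonzero;               /* zero/nonzero bitmap */
      bits[1] = ~negative;             /* complemented sign bitmap */
      /* The real routine also returns EOB in w0; omitted here. */
    }
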
diff --git a/simd/i386/jchuff-sse2.asm b/simd/i386/jchuff-sse2.asm
index 79f0ca5..278cf5e 100644
--- a/simd/i386/jchuff-sse2.asm
+++ b/simd/i386/jchuff-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,134 +16,255 @@
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
 
 %include "jsimdext.inc"
 
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
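
The two NASM strucs above let the assembly address fields of the encoder's C
structures by symbolic offset (resp is pointer-sized; resq, resd, and resb
reserve 8, 4, and 1 bytes).  Their C-side counterparts look roughly like the
following; this is a simplified mirror of the struc layout, not the library's
actual declarations:

    /* struct-sketch.c -- simplified C mirror of the strucs above. */
    #include <stddef.h>

    typedef unsigned char JOCTET;

    typedef struct {
      JOCTET *next_output_byte;          /* => next byte to write in buffer */
      size_t free_in_buffer;             /* # of byte spaces remaining */
      struct {
        unsigned long long put_buffer;   /* current bit accumulation buffer */
        int free_bits;                   /* # of bits available in it */
        int last_dc_val[4];              /* last DC coef for each component */
      } cur;
      void *cinfo;                       /* dump_buffer needs access to this */
    } working_state_sketch;

    typedef struct {
      unsigned int ehufco[256];          /* code for each symbol */
      char ehufsi[256];                  /* length of code for each symbol */
      /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
    } c_derived_tbl_sketch;
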
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
     GLOBAL_DATA(jconst_huff_encode_one_block)
 
 EXTN(jconst_huff_encode_one_block):
 
-%include "jpeg_nbits_table.inc"
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
 
     alignz      32
 
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
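
The data block above replaces the old jpeg_nbits_table.inc include:
jpeg_mask_bits[n] holds an n-bit mask, and jpeg_nbits_table[v] holds the JPEG
"size" category of v, i.e. the number of bits needed to represent it.  The
mirrored copy laid out just below the label appears to allow the table to be
indexed with the biased (v - 1) value that the code stores for negative
coefficients, so no separate absolute-value step is needed.  The scalar
computation the lookup short-circuits is simply:

    /* nbits-sketch.c -- what jpeg_nbits_table[] encodes; illustrative only.
     * The table above covers magnitudes 0..32767. */
    #include <assert.h>

    static int jpeg_nbits_scalar(unsigned int v)
    {
      int nbits = 0;

      while (v) {            /* nbits = floor(log2(v)) + 1, and 0 for v == 0 */
        nbits++;
        v >>= 1;
      }
      return nbits;
    }

    int main(void)
    {
      /* Spot checks against the table layout: one 0 entry, then 1<<0 entries
       * of 1, 1<<1 entries of 2, ..., 1<<14 entries of 15. */
      assert(jpeg_nbits_scalar(0) == 0);
      assert(jpeg_nbits_scalar(1) == 1);
      assert(jpeg_nbits_scalar(3) == 2);
      assert(jpeg_nbits_scalar(4) == 3 && jpeg_nbits_scalar(7) == 3);
      assert(jpeg_nbits_scalar(16384) == 15 && jpeg_nbits_scalar(32767) == 15);
      return 0;
    }
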
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        32
 
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
 
-%macro EMIT_BYTE 0
-    sub         put_bits, 8             ; put_bits -= 8;
-    mov         edx, put_buffer
-    mov         ecx, put_bits
-    shr         edx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-    mov         byte [eax], dl          ; *buffer++ = c;
-    add         eax, 1
-    cmp         dl, 0xFF                ; need to stuff a zero byte?
-    jne         %%.EMIT_BYTE_END
-    mov         byte [eax], 0           ; *buffer++ = 0;
-    add         eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
 
-%macro PUT_BITS 1
-    add         put_bits, ecx           ; put_bits += size;
-    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
-    or          put_buffer, %1
-%endmacro
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
 
-%macro CHECKBUF15 0
-    cmp         put_bits, 16            ; if (put_bits > 31) {
-    jl          %%.CHECKBUF15_END
-    mov         eax, POINTER [esp+buffer]
-    EMIT_BYTE
-    EMIT_BYTE
-    mov         POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
-    PUT_BITS    %1
-    CHECKBUF15
-%endmacro
-
-%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor        xmm4, xmm4              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm5, xmm5              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm6, xmm6              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm7, xmm7              ; __m128i neg = _mm_setzero_si128();
-    pinsrw      %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw      %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw      %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw      %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw      %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw      %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw      %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw      %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw      %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw      %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw      %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw      %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw      %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw      %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw      %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw      %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw      %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw      %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw      %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw      %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw      %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw      %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw      %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw      %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw      %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw      %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw      %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw      %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw      %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw      %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw      %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw      %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw      %37, ecx, 7             ; xmm_shadow[31] = block[jno31];
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
 %endif
-    pcmpgtw     xmm4, %34               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm5, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm6, %36               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm7, %37               ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw       %34, xmm4               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %35, xmm5               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %36, xmm6               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %37, xmm7               ; x1 = _mm_add_epi16(x1, neg);
-    pxor        %34, xmm4               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %35, xmm5               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %36, xmm6               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %37, xmm7               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        xmm4, %34               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm5, %35               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm6, %36               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm7, %37               ; neg = _mm_xor_si128(neg, x1);
-    movdqa      XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34          ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35    ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36   ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37   ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa      XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4         ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5   ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
+%endmacro
+
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
+%endmacro
+
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
+%else
+    %3          0
+    %4
+%endif
 %endmacro
 
 ;
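
EMIT_QWORD above is the routine that drains the 64-bit bit buffer: it tops the
buffer up from code, writes the full qword big-endian with two byte-swapped
32-bit stores when none of its bytes is 0xFF, and otherwise falls into the
slow path that emits the bytes one at a time with a stuffed 0x00 after every
0xFF, as EMIT_BYTE() in jchuff.c does.  A portable sketch of just that flush
step (illustrative; the real macro keeps the accumulator in an MMX register
and interleaves the caller-supplied instructions %4-%8):

    /* emit-qword-sketch.c -- portable rendition of the EMIT_QWORD fast and
     * slow paths above.  Illustrative only. */
    #include <stdint.h>
    #include <string.h>

    static uint8_t *flush_qword(uint8_t *buffer, uint64_t qword)
    {
      uint8_t bytes[8];
      int i, stuff = 0;

      /* Most significant byte of the accumulator is emitted first. */
      for (i = 0; i < 8; i++)
        bytes[i] = (uint8_t)(qword >> (56 - 8 * i));
      for (i = 0; i < 8; i++)
        stuff |= (bytes[i] == 0xFF);

      if (!stuff) {
        /* Fast path: no byte stuffing needed, store all 8 bytes directly. */
        memcpy(buffer, bytes, 8);
        return buffer + 8;
      }
      /* Slow path: JPEG requires a 0x00 stuff byte after every 0xFF in the
       * entropy-coded segment. */
      for (i = 0; i < 8; i++) {
        *buffer++ = bytes[i];
        if (bytes[i] == 0xFF)
          *buffer++ = 0;
      }
      return buffer;
    }
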
@@ -153,272 +275,487 @@
 ;                                  JCOEFPTR block, int last_dc_val,
 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 ;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
 
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
 
-%define pad         6 * SIZEOF_DWORD    ; Align to 16 bytes
-%define t1          pad
-%define t2          t1 + (DCTSIZE2 * SIZEOF_WORD)
-%define block       t2 + (DCTSIZE2 * SIZEOF_WORD)
-%define actbl       block + SIZEOF_DWORD
-%define buffer      actbl + SIZEOF_DWORD
-%define temp        buffer + SIZEOF_DWORD
-%define temp2       temp + SIZEOF_DWORD
-%define temp3       temp2 + SIZEOF_DWORD
-%define temp4       temp3 + SIZEOF_DWORD
-%define temp5       temp4 + SIZEOF_DWORD
-%define gotptr      temp5 + SIZEOF_DWORD  ; void *gotptr
-%define put_buffer  ebx
-%define put_bits    edi
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
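
Step 1 above gathers the coefficients in zigzag (natural) order and biases
negative values before they reach the Huffman loop: the value bits emitted
for a negative coefficient are the low bits of (v - 1), and the mirrored
nbits table can be indexed with that same biased value.  A scalar sketch of
the rearrangement that the register shuffles above compute without long
pinsrw chains (jpeg_natural_order[] is the library's zigzag map):

    /* step1-sketch.c -- scalar rendition of "Step 1"; illustrative only. */
    #define DCTSIZE2 64
    typedef short JCOEF;

    /* The standard zigzag -> natural-order map from the library, declared
     * extern here to keep the sketch short. */
    extern const int jpeg_natural_order[];

    static void reorder_and_bias(const JCOEF *block, JCOEF *t)
    {
      int k;

      /* k = 0 is the DC coefficient, which is coded separately from its
       * predicted difference; t[] receives the 63 AC coefficients. */
      for (k = 1; k < DCTSIZE2; k++) {
        JCOEF v = block[jpeg_natural_order[k]];
        t[k - 1] = (JCOEF)((v < 0) ? v - 1 : v);
      }
      /* The real code also records which coefficients are zero (the
       * pcmpeqw/packsswb steps) as it performs this gather; omitted here. */
    }
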
 
     align       32
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
-    push        ebp
-    mov         eax, esp                     ; eax = original ebp
-    sub         esp, byte 4
-    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [esp], eax
-    mov         ebp, esp                     ; ebp = aligned ebp
-    sub         esp, temp5+9*SIZEOF_DWORD-pad
-    push        ebx
-    push        ecx
-;   push        edx                     ; need not be preserved
-    push        esi
-    push        edi
-    push        ebp
 
-    mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
-    push        esi                        ; esi is now scratch
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
 
-    get_GOT     edx                        ; get GOT address
-    movpic      POINTER [esp+gotptr], edx  ; save GOT address
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
 
-    mov         ecx, POINTER [eax+28]
-    mov         edx, POINTER [eax+16]
-    mov         esi, POINTER [eax+12]
-    mov         POINTER [esp+actbl], ecx
-    mov         POINTER [esp+block], edx
-    mov         POINTER [esp+buffer], esi
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
 
-    ; Encode the DC coefficient difference per section F.1.2.1
-    mov         esi, POINTER [esp+block]  ; block
-    movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, dword [eax+20]
-    mov         esi, ecx
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    ; This is a well-known technique for obtaining the absolute value
-    ; without a branch.  It is derived from an assembly language technique
-    ; presented in "How to Optimize for the Pentium Processors",
-    ; Copyright (c) 1996, 1997 by Agner Fog.
-    mov         edx, ecx
-    sar         edx, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-    xor         ecx, edx                ; temp ^= temp3;
-    sub         ecx, edx                ; temp -= temp3;
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
 
-    ; For a negative input, want temp2 = bitwise complement of abs(input)
-    ; This code assumes we are on a two's complement machine
-    add         esi, edx                ; temp2 += temp3;
-    mov         dword [esp+temp], esi   ; backup temp2 in temp
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
 
-    ; Find the number of bits needed for the magnitude of the coefficient
-    movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
-    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
 
-    ; Emit the Huffman-coded symbol for the number of bits
-    mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
-    mov         eax,  INT [ebp + edx * 4]     ; code = dctbl->ehufco[nbits];
-    movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
-    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    mov         ecx, dword [esp+temp2]        ; restore nbits
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;
+%undef index_temp
+%define free_bits  edi
+%endmacro
 
-    ; Mask off any extra bits in code
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
 
-    ; Emit that number of bits of the value, if positive,
-    ; or the complement of its magnitude, if negative.
-    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
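+; The pcmpeqw/packsswb/pmovmskb sequences above gather one bit per
+; coefficient into index (bit set = coefficient is zero).  After
+; "not index", a set bit marks a nonzero coefficient, so tzcnt returns
+; the length of each zero run directly in the loops below.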
 
-    ; Prepare data
-    xor         ecx, ecx
-    mov         esi, POINTER [esp+block]
-    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                   xmm0, xmm1, xmm2, xmm3
-    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                   xmm0, xmm1, xmm2, xmm3
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is non-negative
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
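+; (The cmp/adc pair is a branchless "if (code_temp < 0) code_temp--":
+;  CF is set only for non-negative values, and x - 1 == ~(-x) yields the
+;  complemented form used to emit negative DC differences.)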
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
 
-    pxor        xmm7, xmm7
-    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
 
-    lea         esi, [esp+t1]
-    mov         ebp, POINTER [esp+actbl]  ; ebp = actbl
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
 
-.BLOOP:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP
-.ERLOOP:
-    movsx       eax, word [esi]                                  ; temp = t1[k];
-    movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
-    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
 
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp        .BEGIN
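+; Emitted code/size pairs accumulate in the 64-bit MMX put_buffer;
+; free_bits tracks how many bits it can still take.  Whenever free_bits
+; drops to zero or below, one of the EMIT_* paths below uses EMIT_QWORD
+; (defined earlier in this file) to flush the buffer to the output
+; before the loop continues.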
 
-    jmp         .BLOOP
-.ELOOP:
-    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]
-    sub         eax, esi
-    shr         eax, 1
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    shr         edx, cl                 ; index >>= r;
-    add         ecx, eax
-    lea         esi, [esi+ecx*2]        ; k += r;
-    mov         dword [esp+temp3], edx
-    jmp         .BRLOOP2
-.BLOOP2:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP2:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP2
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP2
-.ERLOOP2:
-    movsx       eax, word [esi]         ; temp = t1[k];
-    bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
-    inc         eax
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
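+; A run of more than 15 zero coefficients cannot be coded by a single
+; run/size symbol, so .BRLOOP1 emits the ZRL symbol (run/size byte 0xf0,
+; standing for 16 zeros) and subtracts 16 from the remaining run length.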
 
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    jmp         .BLOOP2
-.ELOOP2:
-    ; If the last coef(s) were zero, emit an end-of-block code
-    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-    cmp         edx, esi                            ; if (r > 0) {
-    je          .EFN
-    mov         eax,  INT [ebp]                     ; code = actbl->ehufco[0];
-    movzx       ecx, byte [ebp + 1024]              ; size = actbl->ehufsi[0];
-    EMIT_BITS   eax
-.EFN:
-    mov         eax, [esp+buffer]
-    pop         esi
-    ; Save put_buffer & put_bits
-    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; If size == 32, index remains unchanged.  This is corrected in .BRLOOP1.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
+    jmp        .BRLOOP2                                   ; } while (1);
 
-    pop         ebp
-    pop         edi
-    pop         esi
-;   pop         edx                     ; need not be preserved
-    pop         ecx
-    pop         ebx
-    mov         esp, ebp                ; esp <- aligned ebp
-    pop         esp                     ; esp <- original ebp
-    pop         ebp
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align      16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
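+; Once both halves of the block have been scanned, the end-of-block
+; symbol (actbl entry 0) is emitted unless the block's final AC
+; coefficient (position 63) was nonzero.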
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
     ret
 
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
     align       32
diff --git a/simd/jsimd.h b/simd/jsimd.h
index a9fc812..97a0062 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -6,7 +6,7 @@
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -285,6 +285,28 @@
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_altivec
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -616,6 +638,9 @@
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
@@ -806,6 +831,50 @@
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf, JSAMPLE *range);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
 EXTERN(void) jsimd_h2v1_merged_upsample_altivec
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf);
@@ -909,6 +978,8 @@
 
 EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
 
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
 EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
 
 /* Floating Point Forward DCT */
@@ -1040,6 +1111,10 @@
   (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
    const int *idct_coefs);
 
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
 EXTERN(void) jsimd_idct_ifast_altivec
   (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
    JDIMENSION output_col);
@@ -1078,6 +1153,14 @@
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *values, size_t *zerobits);
 
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
deleted file mode 100644
index 6cdeb5e..0000000
--- a/simd/loongson/jccolext-mmi.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           ZhangLixia  <zhanglixia-hf@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jccolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
-                               JSAMPIMAGE output_buf, JDIMENSION output_row,
-                               int num_rows)
-{
-  JSAMPROW inptr, outptr0, outptr1, outptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 wk[7];
-  __m64 Y_BG, Cb_RG, Cr_BG;
-
-  while (--num_rows >= 0) {
-    inptr = *input_buf++;
-    outptr0 = output_buf[0][output_row];
-    outptr1 = output_buf[1][output_row];
-    outptr2 = output_buf[2][output_row];
-    output_row++;
-
-    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
-         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
-
-#if RGB_PIXELSIZE == 3
-
-      if (num_cols < 8) {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %3\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "xor    $12, $12, $12\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lbu    $12, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "xor    $11, $11, $11\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lhu    $11, 0($13)\r\n"
-            "sll    $12, $12, 16\r\n"
-            "or     $12, $12, $11\r\n"
-
-            "2:     \r\n"
-            "dmtc1  $12, %0\r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 4\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lwu    $14, 0($13)\r\n"
-            "dmtc1  $14, %1\r\n"
-            "dsll32 $12, $12, 0\r\n"
-            "or     $12, $12, $14\r\n"
-            "dmtc1  $12, %0\r\n"
-
-            "3:     \r\n"
-            "li     $8, 8\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 4f\r\n"
-            "nop    \r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "li     $9, 8\r\n"
-            "j      5f\r\n"
-            "nop    \r\n"
-
-            "4:     \r\n"
-            "li     $8, 16\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 5f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "5:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
-            : "r" (col), "r" (num_rows), "r" (inptr)
-            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
-              "$14", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmG = _mm_load_si64((__m64 *)&inptr[8]);
-          mmF = _mm_load_si64((__m64 *)&inptr[16]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmD = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmG);
-      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
-
-      mmD = _mm_unpacklo_pi8(mmD, mmF);
-      mmG = _mm_unpackhi_pi8(mmG, mmF);
-
-      mmE = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmD);
-      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
-
-      mmE = _mm_unpacklo_pi8(mmE, mmG);
-      mmD = _mm_unpackhi_pi8(mmD, mmG);
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmB = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmB = _mm_loadhi_pi8_f(mmB);
-
-      mmF = mmD;
-      mmD = _mm_loadlo_pi8_f(mmD);
-      mmF = _mm_loadhi_pi8_f(mmF);
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-      if (num_cols < 8) {
-        col = num_cols;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "lwc1   %0, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0($13)\r\n"
-
-            "2:     \r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "mov.s  %3, %1\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "3:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
-            : "r" (col), "r" (inptr)
-            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmF = _mm_load_si64((__m64 *)&inptr[8]);
-          mmD = _mm_load_si64((__m64 *)&inptr[16]);
-          mmC = _mm_load_si64((__m64 *)&inptr[24]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
-          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmB = mmA;
-      mmA = _mm_unpacklo_pi8(mmA, mmF);
-      mmB = _mm_unpackhi_pi8(mmB, mmF);
-
-      mmG = mmD;
-      mmD = _mm_unpacklo_pi8(mmD, mmC);
-      mmG = _mm_unpackhi_pi8(mmG, mmC);
-
-      mmE = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmD);
-      mmE = _mm_unpackhi_pi16(mmE, mmD);
-
-      mmH = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmG);
-      mmH = _mm_unpackhi_pi16(mmH, mmG);
-
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmD = mmB;
-      mmB = _mm_loadlo_pi8_f(mmB);
-      mmD = _mm_loadhi_pi8_f(mmD);
-
-      mmG = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmG = _mm_loadhi_pi8_f(mmG);
-
-      mmF = mmH;
-      mmF = _mm_unpacklo_pi8(mmF, mmH);
-      mmH = _mm_unpackhi_pi8(mmH, mmH);
-      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
-      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
-
-#endif
-
-      wk[0] = mm0;
-      wk[1] = mm1;
-      wk[2] = mm4;
-      wk[3] = mm5;
-
-      mm6 = mm1;
-      mm1 = _mm_unpacklo_pi16(mm1, mm3);
-      mm6 = _mm_unpackhi_pi16(mm6, mm3);
-      mm7 = mm1;
-      mm4 = mm6;
-      mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[4] = mm1;
-      wk[5] = mm6;
-
-      mm1 = _mm_loadlo_pi16_f(mm5);
-      mm6 = _mm_loadhi_pi16_f(mm5);
-      mm1 = _mm_srli_pi32(mm1, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm5 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm7 = _mm_add_pi32(mm7, mm5);
-      mm4 = _mm_add_pi32(mm4, mm5);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm4);
-
-      mm1 = wk[2];
-      mm6 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm2);
-      mm6 = _mm_unpackhi_pi16(mm6, mm2);
-      mm5 = mm0;
-      mm4 = mm6;
-      mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[6] = mm0;
-      wk[7] = mm6;
-      mm0 = _mm_loadlo_pi16_f(mm1);
-      mm6 = _mm_loadhi_pi16_f(mm1);
-      mm0 = _mm_srli_pi32(mm0, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm4 = _mm_add_pi32(mm4, mm1);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm5 = _mm_packs_pi32(mm5, mm4);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm5  = _mm_or_si64(mm5, mm7);
-      Cb_RG = mm5;
-
-      mm0 = wk[3];
-      mm6 = wk[2];
-      mm1 = wk[1];
-
-      mm4 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm3);
-      mm4 = _mm_unpackhi_pi16(mm4, mm3);
-      mm7 = mm0;
-      mm5 = mm4;
-      mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm3 = PD_ONEHALF;
-      mm0 = _mm_add_pi32(mm0, wk[4]);
-      mm4 = _mm_add_pi32(mm4, wk[5]);
-      mm0 = _mm_add_pi32(mm0, mm3);
-      mm4 = _mm_add_pi32(mm4, mm3);
-      mm0 = _mm_srli_pi32(mm0, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm0 = _mm_packs_pi32(mm0, mm4);
-
-      mm3 = _mm_loadlo_pi16_f(mm1);
-      mm4 = _mm_loadhi_pi16_f(mm1);
-      mm3 = _mm_srli_pi32(mm3, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm3);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm5);
-
-      mm3 = wk[0];
-      mm4 = mm6;
-      mm6 = _mm_unpacklo_pi16(mm6, mm2);
-      mm4 = _mm_unpackhi_pi16(mm4, mm2);
-      mm1 = mm6;
-      mm5 = mm4;
-      mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm2 = PD_ONEHALF;
-      mm6 = _mm_add_pi32(mm6, wk[6]);
-      mm4 = _mm_add_pi32(mm4, wk[7]);
-      mm6 = _mm_add_pi32(mm6, mm2);
-      mm4 = _mm_add_pi32(mm4, mm2);
-      mm6 = _mm_srli_pi32(mm6, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm6 = _mm_packs_pi32(mm6, mm4);
-
-      mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
-      mm6 = _mm_or_si64(mm6, mm0);
-      Y_BG = mm6;
-
-      mm2 = _mm_loadlo_pi16_f(mm3);
-      mm4 = _mm_loadhi_pi16_f(mm3);
-      mm2 = _mm_srli_pi32(mm2, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm0 = PD_ONEHALFM1_CJ;
-      mm1 = _mm_add_pi32(mm1, mm2);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm1 = _mm_add_pi32(mm1, mm0);
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm1 = _mm_srli_pi32(mm1, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm1 = _mm_packs_pi32(mm1, mm5);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm1 = _mm_or_si64(mm1, mm7);
-      Cr_BG = mm1;
-
-      _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
-      _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
-      _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c
deleted file mode 100644
index 2f2d851..0000000
--- a/simd/loongson/jcsample-mmi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA DOWNSAMPLING */
-
-#include "jsimd_mmi.h"
-#include "jcsample.h"
-
-
-void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_in_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  int inrow, outrow, outcol, bias;
-  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
-  JSAMPROW inptr0, inptr1, outptr;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
-
-  expand_right_edge(input_data, max_v_samp_factor, image_width,
-                    output_cols * 2);
-
-  bias = (1 << 17) + 1;                      /* 0x00020001 (bias pattern) */
-  mm7 = _mm_set1_pi32(bias);                 /* mm7={1, 2, 1, 2} */
-  mm6 = _mm_cmpeq_pi16(mm6, mm6);
-  mm6 = _mm_srli_pi16(mm6, BYTE_BIT);        /* mm6={0xFF 0x00 0xFF 0x00 ..} */
-
-  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
-       inrow += 2, outrow++) {
-
-    inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow + 1];
-    outptr = output_data[outrow];
-
-    for (outcol = output_cols; outcol > 0;
-         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
-
-      mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
-      mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
-      mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
-      mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
-
-      mm4 = mm0;
-      mm5 = mm1;
-      mm0 = _mm_and_si64(mm0, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm1 = _mm_and_si64(mm1, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm0 = _mm_add_pi16(mm0, mm4);
-      mm1 = _mm_add_pi16(mm1, mm5);
-
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_and_si64(mm2, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm3 = _mm_and_si64(mm3, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm2 = _mm_add_pi16(mm2, mm4);
-      mm3 = _mm_add_pi16(mm3, mm5);
-
-      mm0 = _mm_add_pi16(mm0, mm1);
-      mm2 = _mm_add_pi16(mm2, mm3);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm2 = _mm_add_pi16(mm2, mm7);
-      mm0 = _mm_srli_pi16(mm0, 2);
-      mm2 = _mm_srli_pi16(mm2, 2);
-
-      mm0 = _mm_packs_pu16(mm0, mm2);
-
-      _mm_store_si64((__m64 *)&outptr[0], mm0);
-    }
-  }
-}
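
For reference, the deleted jsimd_h2v2_downsample_mmi() above computes a plain 2x2 average of the input samples; the alternating {1, 2} bias words loaded into mm7 (the 0x00020001 pattern) keep the rounding from drifting in one direction. A minimal scalar sketch of the same arithmetic, assuming the row pointers are already expanded to an even width; the name h2v2_downsample_ref is illustrative, not part of the library:

#include <stddef.h>

typedef unsigned char sample_t;               /* stand-in for JSAMPLE */

static void h2v2_downsample_ref(const sample_t *row0, const sample_t *row1,
                                sample_t *out, size_t out_cols)
{
  for (size_t i = 0; i < out_cols; i++) {
    int bias = (i & 1) ? 2 : 1;               /* 1, 2, 1, 2, ... as in mm7 */
    int sum = row0[2 * i] + row0[2 * i + 1] +
              row1[2 * i] + row1[2 * i + 1] + bias;
    out[i] = (sample_t)(sum >> 2);            /* average of the 2x2 block */
  }
}
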
diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c
deleted file mode 100644
index 560d9b0..0000000
--- a/simd/loongson/jdcolext-mmi.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdcolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
-                               JDIMENSION input_row, JSAMPARRAY output_buf,
-                               int num_rows)
-{
-  JSAMPROW outptr, inptr0, inptr1, inptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 mm8, wk[2];
-
-  while (--num_rows >= 0) {
-    inptr0 = input_buf[0][input_row];
-    inptr1 = input_buf[1][input_row];
-    inptr2 = input_buf[2][input_row];
-    input_row++;
-    outptr = *output_buf++;
-
-    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
-         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
-
-      mm5 = _mm_load_si64((__m64 *)inptr1);
-      mm1 = _mm_load_si64((__m64 *)inptr2);
-      mm8 = _mm_load_si64((__m64 *)inptr0);
-      mm4 = 0;
-      mm7 = 0;
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm7 = _mm_cmpeq_pi16(mm7, mm7);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm7 = _mm_slli_pi16(mm7, 7);      /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
-      mm0 = mm4;                        /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
-
-      mm4 = _mm_and_si64(mm4, mm5);           /* mm4=Cb(0246)=CbE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);     /* mm5=Cb(1357)=CbO */
-      mm0 = _mm_and_si64(mm0, mm1);           /* mm0=Cr(0246)=CrE */
-      mm1 = _mm_srli_pi16(mm1, BYTE_BIT);     /* mm1=Cr(1357)=CrO */
-      mm4 = _mm_add_pi16(mm4, mm7);
-      mm5 = _mm_add_pi16(mm5, mm7);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm1 = _mm_add_pi16(mm1, mm7);
-
-      /* (Original)
-       * R = Y                + 1.40200 * Cr
-       * G = Y - 0.34414 * Cb - 0.71414 * Cr
-       * B = Y + 1.77200 * Cb
-       *
-       * (This implementation)
-       * R = Y                + 0.40200 * Cr + Cr
-       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-       * B = Y - 0.22800 * Cb + Cb + Cb
-       */
-
-      mm2 = mm4;                              /* mm2 = CbE */
-      mm3 = mm5;                              /* mm3 = CbO */
-      mm4 = _mm_add_pi16(mm4, mm4);           /* mm4 = 2*CbE */
-      mm5 = _mm_add_pi16(mm5, mm5);           /* mm5 = 2*CbO */
-      mm6 = mm0;                              /* mm6 = CrE */
-      mm7 = mm1;                              /* mm7 = CrO */
-      mm0 = _mm_add_pi16(mm0, mm0);           /* mm0 = 2*CrE */
-      mm1 = _mm_add_pi16(mm1, mm1);           /* mm1 = 2*CrO */
-
-      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);   /* mm4=(2*CbE * -FIX(0.22800)) */
-      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);   /* mm5=(2*CbO * -FIX(0.22800)) */
-      mm0 = _mm_mulhi_pi16(mm0, PW_F0402);    /* mm0=(2*CrE * FIX(0.40200)) */
-      mm1 = _mm_mulhi_pi16(mm1, PW_F0402);    /* mm1=(2*CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, PW_ONE);
-      mm5 = _mm_add_pi16(mm5, PW_ONE);
-      mm4 = _mm_srai_pi16(mm4, 1);            /* mm4=(CbE * -FIX(0.22800)) */
-      mm5 = _mm_srai_pi16(mm5, 1);            /* mm5=(CbO * -FIX(0.22800)) */
-      mm0 = _mm_add_pi16(mm0, PW_ONE);
-      mm1 = _mm_add_pi16(mm1, PW_ONE);
-      mm0 = _mm_srai_pi16(mm0, 1);            /* mm0=(CrE * FIX(0.40200)) */
-      mm1 = _mm_srai_pi16(mm1, 1);            /* mm1=(CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, mm2);
-      mm5 = _mm_add_pi16(mm5, mm3);
-      mm4 = _mm_add_pi16(mm4, mm2);       /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
-      mm5 = _mm_add_pi16(mm5, mm3);       /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
-      mm0 = _mm_add_pi16(mm0, mm6);       /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
-      mm1 = _mm_add_pi16(mm1, mm7);       /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
-
-      wk[0] = mm4;                            /* wk(0)=(B-Y)E */
-      wk[1] = mm5;                            /* wk(1)=(B-Y)O */
-
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_unpacklo_pi16(mm2, mm6);
-      mm4 = _mm_unpackhi_pi16(mm4, mm6);
-      mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
-      mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
-      mm3 = _mm_unpacklo_pi16(mm3, mm7);
-      mm5 = _mm_unpackhi_pi16(mm5, mm7);
-      mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
-      mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
-
-      mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
-      mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
-      mm2 = _mm_srai_pi32(mm2, SCALEBITS);
-      mm4 = _mm_srai_pi32(mm4, SCALEBITS);
-      mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
-      mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
-      mm3 = _mm_srai_pi32(mm3, SCALEBITS);
-      mm5 = _mm_srai_pi32(mm5, SCALEBITS);
-
-      mm2 = _mm_packs_pi32(mm2, mm4);  /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
-      mm3 = _mm_packs_pi32(mm3, mm5);  /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
-      mm2 = _mm_sub_pi16(mm2, mm6);  /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
-      mm3 = _mm_sub_pi16(mm3, mm7);  /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
-
-      mm5 = mm8;                              /* mm5=Y(01234567) */
-
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);    /* mm4={0xFF 0x00 0xFF 0x00 ..} */
-      mm4 = _mm_and_si64(mm4, mm5);          /* mm4=Y(0246)=YE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);    /* mm5=Y(1357)=YO */
-
-      mm0 = _mm_add_pi16(mm0, mm4);      /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
-      mm1 = _mm_add_pi16(mm1, mm5);      /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
-      mm0 = _mm_packs_pu16(mm0, mm0);    /* mm0=(R0 R2 R4 R6 ** ** ** **) */
-      mm1 = _mm_packs_pu16(mm1, mm1);    /* mm1=(R1 R3 R5 R7 ** ** ** **) */
-
-      mm2 = _mm_add_pi16(mm2, mm4);      /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
-      mm3 = _mm_add_pi16(mm3, mm5);      /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
-      mm2 = _mm_packs_pu16(mm2, mm2);    /* mm2=(G0 G2 G4 G6 ** ** ** **) */
-      mm3 = _mm_packs_pu16(mm3, mm3);    /* mm3=(G1 G3 G5 G7 ** ** ** **) */
-
-      mm4 = _mm_add_pi16(mm4, wk[0]);    /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
-      mm5 = _mm_add_pi16(mm5, wk[1]);    /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
-      mm4 = _mm_packs_pu16(mm4, mm4);    /* mm4=(B0 B2 B4 B6 ** ** ** **) */
-      mm5 = _mm_packs_pu16(mm5, mm5);    /* mm5=(B1 B3 B5 B7 ** ** ** **) */
-
-#if RGB_PIXELSIZE == 3
-
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmB);     /* mmE=(20 01 22 03 24 05 26 07) */
-      mmD = _mm_unpacklo_pi8(mmD, mmF);     /* mmD=(11 21 13 23 15 25 17 27) */
-
-      mmG = mmA;
-      mmH = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 01 02 12 22 03) */
-      mmG = _mm_unpackhi_pi16(mmG, mmE);    /* mmG=(04 14 24 05 06 16 26 07) */
-
-      mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
-
-      mmC = mmD;
-      mmB = mmD;
-      mmD = _mm_unpacklo_pi16(mmD, mmH);    /* mmD=(11 21 02 12 13 23 04 14) */
-      mmC = _mm_unpackhi_pi16(mmC, mmH);    /* mmC=(15 25 06 16 17 27 -- --) */
-
-      mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
-
-      mmF = mmE;
-      mmE = _mm_unpacklo_pi16(mmE, mmB);    /* mmE=(22 03 13 23 24 05 15 25) */
-      mmF = _mm_unpackhi_pi16(mmF, mmB);    /* mmF=(26 07 17 27 -- -- -- --) */
-
-      mmA = _mm_unpacklo_pi32(mmA, mmD);    /* mmA=(00 10 20 01 11 21 02 12) */
-      mmE = _mm_unpacklo_pi32(mmE, mmG);    /* mmE=(22 03 13 23 04 14 24 05) */
-      mmC = _mm_unpacklo_pi32(mmC, mmF);    /* mmC=(15 25 06 16 26 07 17 27) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmE);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li      $8, 16\r\n"
-            "move    $9, %4\r\n"
-            "mov.s   $f4, %1\r\n"
-            "mov.s   $f6, %3\r\n"
-            "move    $10, %5\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %2\r\n"
-            "subu    $9, $9, 16\r\n"
-            "daddu   $10, $10, 16\r\n"
-            "b       2f\r\n"
-            "nop     \r\n"
-
-            "1:      \r\n"
-            "li      $8, 8\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "subu    $9, $9, 8\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 4\r\n"               /* st4 */
-            "mfc1    $11, $f4\r\n"
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "swl     $11, 3($10)\r\n"
-            "swr     $11, 0($10)\r\n"
-            "li      $8, 32\r\n"
-            "mtc1    $8, $f6\r\n"
-            "dsrl    $f4, $f4, $f6\r\n"
-            "mfc1    $11, $f4\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 4\r\n"
-
-            "3:      \r\n"
-            "li      $8, 2\r\n"               /* st2 */
-            "bltu    $9, $8, 4f\r\n"
-            "nop     \r\n"
-            "ush     $11, 0($10)\r\n"
-            "srl     $11, 16\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 2\r\n"
-
-            "4:      \r\n"
-            "li      $8, 1\r\n"               /* st1 */
-            "bltu    $9, $8, 5f\r\n"
-            "nop     \r\n"
-            "sb      $11, 0($10)\r\n"
-
-            "5:      \r\n"
-            "nop     \r\n"                    /* end */
-            : "=m" (*outptr)
-            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
-           );
-      }
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-#ifdef RGBX_FILLER_0XFF
-      mm6 = _mm_cmpeq_pi8(mm6, mm6);
-      mm7 = _mm_cmpeq_pi8(mm7, mm7);
-#else
-      mm6 = _mm_xor_si64(mm6, mm6);
-      mm7 = _mm_xor_si64(mm7, mm7);
-#endif
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
-      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
-
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmG);     /* mmE=(20 30 22 32 24 34 26 36) */
-      mmB = _mm_unpacklo_pi8(mmB, mmD);     /* mmB=(01 11 03 13 05 15 07 17) */
-      mmF = _mm_unpacklo_pi8(mmF, mmH);     /* mmF=(21 31 23 33 25 35 27 37) */
-
-      mmC = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 30 02 12 22 32) */
-      mmC = _mm_unpackhi_pi16(mmC, mmE);    /* mmC=(04 14 24 34 06 16 26 36) */
-      mmG = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmF);    /* mmB=(01 11 21 31 03 13 23 33) */
-      mmG = _mm_unpackhi_pi16(mmG, mmF);    /* mmG=(05 15 25 35 07 17 27 37) */
-
-      mmD = mmA;
-      mmA = _mm_unpacklo_pi32(mmA, mmB);    /* mmA=(00 10 20 30 01 11 21 31) */
-      mmD = _mm_unpackhi_pi32(mmD, mmB);    /* mmD=(02 12 22 32 03 13 23 33) */
-      mmH = mmC;
-      mmC = _mm_unpacklo_pi32(mmC, mmG);    /* mmC=(04 14 24 34 05 15 25 35) */
-      mmH = _mm_unpackhi_pi32(mmH, mmG);    /* mmH=(06 16 26 36 07 17 27 37) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmD);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        _mm_store_si64((__m64 *)(outptr + 24), mmH);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols;
-        asm(".set noreorder\r\n"              /* st16 */
-
-            "li      $8, 4\r\n"
-            "move    $9, %6\r\n"
-            "move    $10, %7\r\n"
-            "mov.s   $f4, %2\r\n"
-            "mov.s   $f6, %4\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "mov.s   $f6, %5\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 16\r\n"
-
-            "1:      \r\n"
-            "li      $8, 2\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "mov.s   $f4, $f6\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 1\r\n"               /* st4 */
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "gsswlc1 $f4, 3($10)\r\n"
-            "gsswrc1 $f4, 0($10)\r\n"
-
-            "3:      \r\n"
-            "li      %1, 0\r\n"               /* end */
-            : "=m" (*outptr), "=r" (col)
-            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
-              "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "memory"
-           );
-      }
-
-#endif
-
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
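
The decomposition documented in the deleted jsimd_ycc_rgb_convert_mmi() above (R = Y + 0.402*Cr + Cr, and so on) keeps every multiplier below 1.0 so the products fit the signed 16-bit multiply-high instructions. A minimal scalar sketch of the same fixed-point arithmetic, assuming SCALEBITS == 16, Cb/Cr already centered on zero, and an arithmetic right shift for negative values (as the SIMD code's srai provides); the function name is illustrative:

#include <stdint.h>

#define SCALEBITS  16
#define ONE_HALF   ((int32_t)1 << (SCALEBITS - 1))
#define FIX(x)     ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

static uint8_t clamp255(int v)
{
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void ycc_to_rgb_ref(int y, int cb, int cr,      /* cb, cr in -128..127 */
                           uint8_t *r, uint8_t *g, uint8_t *b)
{
  /* R = Y + 0.40200 * Cr + Cr                  (== Y + 1.40200 * Cr) */
  *r = clamp255(y + ((FIX(0.40200) * cr + ONE_HALF) >> SCALEBITS) + cr);
  /* G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr   (== Y - 0.34414*Cb - 0.71414*Cr) */
  *g = clamp255(y + ((FIX(0.28586) * cr - FIX(0.34414) * cb + ONE_HALF)
                     >> SCALEBITS) - cr);
  /* B = Y - 0.22800 * Cb + Cb + Cb             (== Y + 1.77200 * Cb) */
  *b = clamp255(y + ((-FIX(0.22800) * cb + ONE_HALF) >> SCALEBITS) + 2 * cb);
}
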
diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c
deleted file mode 100644
index 00a6265..0000000
--- a/simd/loongson/jdsample-mmi.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_mmi.h"
-
-
-enum const_index {
-  index_PW_THREE,
-  index_PW_SEVEN,
-  index_PW_EIGHT,
-};
-
-static uint64_t const_value[] = {
-  _uint64_set_pi16(3, 3, 3, 3),
-  _uint64_set_pi16(7, 7, 7, 7),
-  _uint64_set_pi16(8, 8, 8, 8),
-};
-
-#define PW_THREE  get_const_value(index_PW_THREE)
-#define PW_SEVEN  get_const_value(index_PW_SEVEN)
-#define PW_EIGHT  get_const_value(index_PW_EIGHT)
-
-
-#define PROCESS_ROW(r) { \
-  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
-  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
-  \
-  mm0 = mm7; \
-  mm4 = mm3; \
-  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                   /* mm0=( 1 2 3 -) */ \
-  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
-  mm5 = mm7; \
-  mm6 = mm3; \
-  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
-  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                   /* mm6=( - 4 5 6) */ \
-  \
-  mm0 = _mm_or_si64(mm0, mm4);                /* mm0=( 1 2 3 4) */ \
-  mm5 = _mm_or_si64(mm5, mm6);                /* mm5=( 3 4 5 6) */ \
-  \
-  mm1 = mm7; \
-  mm2 = mm3; \
-  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);     /* mm1=( - 0 1 2) */ \
-  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);     /* mm2=( 5 6 7 -) */ \
-  mm4 = mm3; \
-  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
-  \
-  mm1 = _mm_or_si64(mm1, wk[r]);              /* mm1=(-1 0 1 2) */ \
-  mm2 = _mm_or_si64(mm2, wk[r + 2]);          /* mm2=( 5 6 7 8) */ \
-  \
-  wk[r] = mm4; \
-  \
-  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
-  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
-  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
-  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
-  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
-  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
-  \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  mm5 = _mm_add_pi16(mm5, mm3); \
-  mm1 = _mm_srli_pi16(mm1, 4);                /* mm1=OutrLE=( 0  2  4  6) */ \
-  mm5 = _mm_srli_pi16(mm5, 4);                /* mm5=OutrHE=( 8 10 12 14) */ \
-  mm0 = _mm_add_pi16(mm0, mm7); \
-  mm2 = _mm_add_pi16(mm2, mm3); \
-  mm0 = _mm_srli_pi16(mm0, 4);                /* mm0=OutrLO=( 1  3  5  7) */ \
-  mm2 = _mm_srli_pi16(mm2, 4);                /* mm2=OutrHO=( 9 11 13 15) */ \
-  \
-  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
-  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
-  mm1 = _mm_or_si64(mm1, mm0);     /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
-  mm5 = _mm_or_si64(mm5, mm2);     /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
-  \
-  _mm_store_si64((__m64 *)outptr##r, mm1); \
-  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
-}
-
-void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
-  int inrow, outrow, incol, tmp, tmp1;
-  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
-  __m64 wk[4], mm_tmp;
-
-  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
-    inptr_1 = input_data[inrow - 1];
-    inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow + 1];
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-
-    if (downsampled_width & 7) {
-      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
-      tmp1 =  downsampled_width * sizeof(JSAMPLE);
-      asm("daddu  $8, %3, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %3, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %4, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %4, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %5, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %5, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
-          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
-          : "$8", "$9"
-         );
-    }
-
-    /* process the first column block */
-    mm0 = _mm_load_si64((__m64 *)inptr0);     /* mm0 = row[ 0][0] */
-    mm1 = _mm_load_si64((__m64 *)inptr_1);    /* mm1 = row[-1][0] */
-    mm2 = _mm_load_si64((__m64 *)inptr1);     /* mm2 = row[ 1][0] */
-
-    mm3 = _mm_xor_si64(mm3, mm3);             /* mm3 = (all 0's) */
-    mm4 = mm0;
-    mm0 = _mm_unpacklo_pi8(mm0, mm3);         /* mm0 = row[ 0][0]( 0 1 2 3) */
-    mm4 = _mm_unpackhi_pi8(mm4, mm3);         /* mm4 = row[ 0][0]( 4 5 6 7) */
-    mm5 = mm1;
-    mm1 = _mm_unpacklo_pi8(mm1, mm3);         /* mm1 = row[-1][0]( 0 1 2 3) */
-    mm5 = _mm_unpackhi_pi8(mm5, mm3);         /* mm5 = row[-1][0]( 4 5 6 7) */
-    mm6 = mm2;
-    mm2 = _mm_unpacklo_pi8(mm2, mm3);         /* mm2 = row[+1][0]( 0 1 2 3) */
-    mm6 = _mm_unpackhi_pi8(mm6, mm3);         /* mm6 = row[+1][0]( 4 5 6 7) */
-
-    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-    mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-    mm7 = _mm_cmpeq_pi8(mm7, mm7);
-    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-
-    mm1 = _mm_add_pi16(mm1, mm0);             /* mm1=Int0L=( 0 1 2 3) */
-    mm5 = _mm_add_pi16(mm5, mm4);             /* mm5=Int0H=( 4 5 6 7) */
-    mm2 = _mm_add_pi16(mm2, mm0);             /* mm2=Int1L=( 0 1 2 3) */
-    mm6 = _mm_add_pi16(mm6, mm4);             /* mm6=Int1H=( 4 5 6 7) */
-
-    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
-    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
-    _mm_store_si64((__m64 *)outptr1, mm2);
-    _mm_store_si64((__m64 *)outptr1 + 1, mm6);
-
-    mm1 = _mm_and_si64(mm1, mm7);             /* mm1=( 0 - - -) */
-    mm2 = _mm_and_si64(mm2, mm7);             /* mm2=( 0 - - -) */
-
-    wk[0] = mm1;
-    wk[1] = mm2;
-
-    for (incol = downsampled_width; incol > 0;
-         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
-         outptr0 += 16, outptr1 += 16) {
-
-      if (incol > 8) {
-        /* process the next column block */
-        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0 = row[ 0][1] */
-        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1 = row[-1][1] */
-        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2 = row[+1][1] */
-
-        mm3 = _mm_setzero_si64();             /* mm3 = (all 0's) */
-        mm4 = mm0;
-        mm0 = _mm_unpacklo_pi8(mm0, mm3);     /* mm0 = row[ 0][1]( 0 1 2 3) */
-        mm4 = _mm_unpackhi_pi8(mm4, mm3);     /* mm4 = row[ 0][1]( 4 5 6 7) */
-        mm5 = mm1;
-        mm1 = _mm_unpacklo_pi8(mm1, mm3);     /* mm1 = row[-1][1]( 0 1 2 3) */
-        mm5 = _mm_unpackhi_pi8(mm5, mm3);     /* mm5 = row[-1][1]( 4 5 6 7) */
-        mm6 = mm2;
-        mm2 = _mm_unpacklo_pi8(mm2, mm3);     /* mm2 = row[+1][1]( 0 1 2 3) */
-        mm6 = _mm_unpackhi_pi8(mm6, mm3);     /* mm6 = row[+1][1]( 4 5 6 7) */
-
-        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-        mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-        mm1 = _mm_add_pi16(mm1, mm0);         /* mm1 = Int0L = ( 0 1 2 3) */
-        mm5 = _mm_add_pi16(mm5, mm4);         /* mm5 = Int0H = ( 4 5 6 7) */
-        mm2 = _mm_add_pi16(mm2, mm0);         /* mm2 = Int1L = ( 0 1 2 3) */
-        mm6 = _mm_add_pi16(mm6, mm4);         /* mm6 = Int1H = ( 4 5 6 7) */
-
-        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
-        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
-        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
-        _mm_store_si64((__m64 *)outptr1 + 3, mm6);
-
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
-        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      } else {
-        /* process the last column block */
-        mm1 = _mm_cmpeq_pi8(mm1, mm1);
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-        mm2 = mm1;
-
-        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
-        mm1 = _mm_and_si64(mm1, mm_tmp);      /* mm1=( - - - 7) */
-        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
-        mm2 = _mm_and_si64(mm2, mm_tmp);      /* mm2=( - - - 7) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      }
-
-      /* process the upper row */
-      PROCESS_ROW(0)
-
-      /* process the lower row */
-      PROCESS_ROW(1)
-    }
-  }
-}
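
The deleted fancy upsampler above is the usual triangular filter: each output sample is a 3:1 blend of the two nearest input samples, applied vertically first (the 3*row[0] + row[-1]/row[+1] intermediates saved to the output rows) and then horizontally inside PROCESS_ROW(), with rounding offsets 8 and 7 so even and odd columns round in opposite directions. A scalar sketch of one output row, ignoring the edge handling; the function name is illustrative:

/* `above` and `cur` hold raw 8-bit samples widened to int; `out` receives
 * the 2x-upsampled row corresponding to `cur` blended toward `above`. */
static void h2v2_fancy_upsample_ref(const int *above, const int *cur,
                                    int *out, int width)
{
  for (int i = 1; i < width - 1; i++) {
    int v  = 3 * cur[i]     + above[i];       /* vertical 3:1 blend (4x scale) */
    int vl = 3 * cur[i - 1] + above[i - 1];
    int vr = 3 * cur[i + 1] + above[i + 1];
    out[2 * i]     = (3 * v + vl + 8) >> 4;   /* even column: nearer neighbor left  */
    out[2 * i + 1] = (3 * v + vr + 7) >> 4;   /* odd column:  nearer neighbor right */
  }
}
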
diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c
deleted file mode 100644
index f9a3f81..0000000
--- a/simd/loongson/jquanti-mmi.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * Copyright (C) 2018, D. R. Commander.  All Rights Reserved.
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define DO_QUANT() { \
-  mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
-  mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
-  \
-  mm0 = mm2; \
-  mm1 = mm3; \
-  \
-  mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1));   /* -1 if value < 0, */ \
-                                              /* 0 otherwise */ \
-  mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
-  corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
-  \
-  mm0 = _mm_add_pi16(mm0, corr0);             /* correction + roundfactor */ \
-  mm1 = _mm_add_pi16(mm1, corr1); \
-  \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
-  recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, recip0); \
-  mm1 = _mm_mulhi_pi16(mm1, recip1); \
-  \
-  mm0 = _mm_add_pi16(mm0, mm4);  /* reciprocal is always negative */ \
-  mm1 = _mm_add_pi16(mm1, mm5);  /* (MSB=1), so we always need to add the */ \
-                                 /* initial value (input value is never */ \
-                                 /* negative as we inverted it at the */ \
-                                 /* start of this routine) */ \
-  \
-  scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
-  scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
-  \
-  mm6 = scale0; \
-  mm7 = scale1; \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, mm6); \
-  mm1 = _mm_mulhi_pi16(mm1, mm7); \
-  \
-  mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1));   /* determine if scale... */ \
-                                              /* is negative */ \
-  mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
-  \
-  mm6 = _mm_and_si64(mm6, mm4);               /* and add input if it is */ \
-  mm7 = _mm_and_si64(mm7, mm5); \
-  mm0 = _mm_add_pi16(mm0, mm6); \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  \
-  mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1));   /* then check if... */ \
-  mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1));   /* negative input */ \
-  \
-  mm4 = _mm_and_si64(mm4, scale0);            /* and add scale if it is */ \
-  mm5 = _mm_and_si64(mm5, scale1); \
-  mm0 = _mm_add_pi16(mm0, mm4); \
-  mm1 = _mm_add_pi16(mm1, mm5); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
-  _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
-  \
-  workspace += DCTSIZE; \
-  divisors += DCTSIZE; \
-  output_ptr += DCTSIZE; \
-}
-
-
-void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
-                        DCTELEM *workspace)
-{
-  JCOEFPTR output_ptr = coef_block;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 corr0, corr1, recip0, recip1, scale0, scale1;
-
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-}
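
The deleted DO_QUANT() macro divides each DCT coefficient by its quantization step without a hardware divide: it works on the absolute value, adds the correction/round factor row of divisors[], multiplies by the precomputed reciprocal and scale rows with _mm_mulhi_pi16(), and finally restores the sign. A simplified scalar model of the result, with a plain rounded divide standing in for the 16-bit reciprocal/scale encoding; the function name is illustrative and the rounding shown here is only approximately what the correction row encodes:

#include <stdint.h>

static int16_t quantize_ref(int16_t coef, uint16_t qval)   /* qval >= 1 */
{
  int sign = coef >> 15;                      /* -1 if negative, 0 otherwise */
  int mag  = (coef ^ sign) - sign;            /* |coef| */
  int q    = (mag + qval / 2) / qval;         /* rounded divide; the SIMD code
                                                 replaces this with mulhi by a
                                                 precomputed reciprocal */
  return (int16_t)((q ^ sign) - sign);        /* restore the sign */
}
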
diff --git a/simd/mips64/jccolext-mmi.c b/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000..558eb2a
--- /dev/null
+++ b/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+      cb = _mm_or_si64(cbe, cbo);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);
+      cr = _mm_or_si64(cre, cro);
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);
+      _mm_store_si64((__m64 *)&outptr1[0], cb);
+      _mm_store_si64((__m64 *)&outptr2[0], cr);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
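
The same decomposition idea appears on the compression side above: 0.58700*G is split into 0.33700*G + 0.25000*G so the constants fit the 16-bit multiply-add pairs, and the 0.50000*B / 0.50000*R terms are produced by the halfb*/halfr* right shifts rather than a multiply. A minimal scalar sketch of the per-pixel arithmetic, using the same SCALEBITS == 16 and FIX() conventions as the decode-side sketch earlier; the function name is illustrative:

#include <stdint.h>

#define SCALEBITS      16
#define CENTERJSAMPLE  128
#define ONE_HALF       ((int32_t)1 << (SCALEBITS - 1))
#define FIX(x)         ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

static void rgb_to_ycc_ref(int r, int g, int b,         /* r, g, b in 0..255 */
                           uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  /* Y  =  0.29900*R + 0.33700*G + 0.11400*B + 0.25000*G */
  int32_t y32  = FIX(0.29900) * r + FIX(0.33700) * g +
                 FIX(0.11400) * b + FIX(0.25000) * g + ONE_HALF;
  /* Cb = -0.16874*R - 0.33126*G + 0.50000*B + CENTERJSAMPLE; the 0.5*B term
   * is the halfb* shift above, and the rounding constant matches
   * PD_ONEHALFM1_CJ = ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS). */
  int32_t cb32 = -FIX(0.16874) * r - FIX(0.33126) * g + (b << (SCALEBITS - 1)) +
                 ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);
  /* Cr =  0.50000*R - 0.41869*G - 0.08131*B + CENTERJSAMPLE */
  int32_t cr32 = (r << (SCALEBITS - 1)) - FIX(0.41869) * g - FIX(0.08131) * b +
                 ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS);

  *y  = (uint8_t)(y32  >> SCALEBITS);
  *cb = (uint8_t)(cb32 >> SCALEBITS);
  *cr = (uint8_t)(cr32 >> SCALEBITS);
}
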
diff --git a/simd/loongson/jccolor-mmi.c b/simd/mips64/jccolor-mmi.c
similarity index 100%
rename from simd/loongson/jccolor-mmi.c
rename to simd/mips64/jccolor-mmi.c
diff --git a/simd/mips64/jcgray-mmi.c b/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000..9c7b833
--- /dev/null
+++ b/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
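
jcgryext-mmi.c, included repeatedly above with different RGB_* macros, computes only the luma plane using the constants defined in this file: PW_F0299_F0337 pairs R with G and PW_F0114_F0250 pairs B with G, so the full 0.58700 weight on G is reassembled from 0.33700 + 0.25000. A scalar sketch of the per-pixel arithmetic (SCALEBITS == 16 assumed; the function name is illustrative):

static unsigned char rgb_to_gray_ref(int r, int g, int b)   /* 0..255 inputs */
{
  const int one_half = 1 << 15;
  /* 19595 = FIX(0.299), 22086 = FIX(0.587) - FIX(0.250), 7471 = FIX(0.114),
   * 16384 = FIX(0.250); the pairs mirror PW_F0299_F0337 and PW_F0114_F0250. */
  int y = 19595 * r + 22086 * g +            /* the (R, G) multiply-add pair */
          7471 * b + 16384 * g +             /* the (B, G) multiply-add pair */
          one_half;
  return (unsigned char)(y >> 16);
}
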
diff --git a/simd/mips64/jcgryext-mmi.c b/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000..08a83d6
--- /dev/null
+++ b/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
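+      /* Illustrative note (an editorial sketch, not in the original source):
+       * in scalar form the code below computes, with ONE_HALF assumed to be
+       * 1 << (SCALEBITS - 1),
+       *
+       *   Y = (FIX(0.29900) * R + FIX(0.33700) * G +
+       *        FIX(0.11400) * B + FIX(0.25000) * G + ONE_HALF) >> SCALEBITS
+       *
+       * 0.58700 is split into 0.33700 + 0.25000 so that every coefficient
+       * fits in a signed 16-bit word for _mm_madd_pi16() (FIX(0.58700)
+       * would be 38470, which exceeds 32767.)
+       */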
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/mips64/jcsample-mmi.c b/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000..0354dac
--- /dev/null
+++ b/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
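+  /* Illustrative note (an editorial sketch, not in the original source):
+   * the alternating {1, 2, 1, 2} bias mirrors the bias = 1,2,1,2,...
+   * scheme of the unaccelerated h2v2_downsample(), so successive output
+   * columns round alternately down and up and no systematic rounding bias
+   * is introduced by the final ">> 2".
+   */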
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
+
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
+
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
+
+      avg = _mm_packs_pu16(thisavg, nextavg);
+
+      _mm_store_si64((__m64 *)&outptr[0], avg);
+    }
+  }
+}
diff --git a/simd/loongson/jcsample.h b/simd/mips64/jcsample.h
similarity index 90%
rename from simd/loongson/jcsample.h
rename to simd/mips64/jcsample.h
index 2ac4816..bd07fcc 100644
--- a/simd/loongson/jcsample.h
+++ b/simd/mips64/jcsample.h
@@ -20,7 +20,7 @@
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/simd/mips64/jdcolext-mmi.c b/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000..3b5b2f2
--- /dev/null
+++ b/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
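+      /* Illustrative note (an editorial sketch, not in the original source):
+       * the folded form exists because FIX(1.40200), FIX(1.77200), and
+       * FIX(0.71414) all exceed the signed 16-bit range, so the constants
+       * are decomposed as
+       *
+       *   1.40200 =  1 + 0.40200
+       *   1.77200 =  2 - 0.22800
+       *  -0.71414 = -1 + 0.28586
+       *
+       * and the whole-integer parts are restored by adding or subtracting
+       * Cr and Cb directly after the fractional multiplies.
+       */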
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800)) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800)) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/loongson/jdcolor-mmi.c b/simd/mips64/jdcolor-mmi.c
similarity index 100%
rename from simd/loongson/jdcolor-mmi.c
rename to simd/mips64/jdcolor-mmi.c
diff --git a/simd/mips64/jdmerge-mmi.c b/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000..0a39bd5
--- /dev/null
+++ b/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
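+
+/* Illustrative derivation (an editorial sketch, not in the original source):
+ * with FIX(x) == (int)(x * 65536 + 0.5), the folded constants above follow
+ * as, e.g.,
+ *
+ *   F_0_402 = FIX(1.40200) - FIX(1) =  91881 - 65536 = 26345
+ *   F_0_228 = FIX(2) - FIX(1.77200) = 131072 - 116130 = 14942
+ */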
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/simd/mips64/jdmrgext-mmi.c b/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000..be09ff2
--- /dev/null
+++ b/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
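+    /* Illustrative note (an editorial sketch, not in the original source):
+     * Cb and Cr are doubled before _mm_mulhi_pi16() and halved again with a
+     * +1 bias afterwards, e.g. for the red channel
+     *
+     *   ((((2 * CrL) * FIX(0.40200)) >> 16) + 1) >> 1  ~=  CrL * 0.40200
+     *
+     * which recovers the rounding bit that the high-half multiply would
+     * otherwise discard.
+     */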
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800)) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800)) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 6 + 3;
+      else
+        col = num_cols * 6;
+
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 2 + 1;
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800)) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/simd/mips64/jdsample-mmi.c b/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000..8ae94e7
--- /dev/null
+++ b/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(2, 2, 2, 2),
+  _uint64_set_pi16(3, 3, 3, 3),
+  _uint64_set_pi16(7, 7, 7, 7),
+  _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+
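+/* Expand one block of 8 input values (samples for h2v1, column sums for h2v2)
+ * into 16 output values.  Each output is (3 * nearer input + further input +
+ * bias) >> shift, i.e. the triangular "fancy" upsampling filter.  wk[] carries
+ * the edge values shared with the neighboring blocks. */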
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  \
+  wk[row] = samp7XXX; \
+  \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
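+      /* Width is not a multiple of 8: replicate the last sample of each input
+       * row one position to the right so that the vector code can safely read
+       * one sample past the edge. */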
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][0]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/simd/mips64/jfdctfst-mmi.c b/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000..f7caf09
--- /dev/null
+++ b/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
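+/* Multiply a vector of 16-bit elements by a fixed-point constant scaled by
+ * 2^CONST_BITS: form the full 32-bit products from the low/high halves, shift
+ * right by CONST_BITS, and repack to 16 bits with saturation. */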
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/simd/loongson/jfdctint-mmi.c b/simd/mips64/jfdctint-mmi.c
similarity index 100%
rename from simd/loongson/jfdctint-mmi.c
rename to simd/mips64/jfdctint-mmi.c
diff --git a/simd/mips64/jidctfst-mmi.c b/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000..503bb35
--- /dev/null
+++ b/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(3) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
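+/* Reinterpret an MMI value as a plain integer so that an all-zero coefficient
+ * column can be detected with a scalar compare (used for the DC-only shortcut
+ * in pass 1). */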
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow of the 16-bit fixed-point constant... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (3 - 2.613125930) * z10 - 3 * z10 + z5; \
+   *       = 0.386874070 * z10 - 3 * z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/simd/loongson/jidctint-mmi.c b/simd/mips64/jidctint-mmi.c
similarity index 100%
rename from simd/loongson/jidctint-mmi.c
rename to simd/mips64/jidctint-mmi.c
diff --git a/simd/mips64/jquanti-mmi.c b/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000..339002f
--- /dev/null
+++ b/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  \
+  /* and add scale if it is */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+
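+  /* Each DO_QUANT() invocation quantizes one row of 8 coefficients and
+   * advances the workspace/divisor/output pointers, so eight invocations
+   * cover the full 8x8 block. */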
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/simd/loongson/jsimd.c b/simd/mips64/jsimd.c
similarity index 66%
rename from simd/loongson/jsimd.c
rename to simd/mips64/jsimd.c
index e8b1832..e8f1af5 100644
--- a/simd/loongson/jsimd.c
+++ b/simd/mips64/jsimd.c
@@ -1,11 +1,11 @@
 /*
- * jsimd_loongson.c
+ * jsimd_mips64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,7 +13,7 @@
  *
  * This file contains the interface between the "normal" portions
  * of the library and the SIMD implementations when running on a
- * Loongson architecture.
+ * 64-bit MIPS architecture.
  */
 
 #define JPEG_INTERNALS
@@ -24,8 +24,76 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
 static unsigned int simd_support = ~0;
 
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
 /*
  * Check what SIMD accelerations are supported.
  *
@@ -37,14 +105,32 @@
 #ifndef NO_GETENV
   char *env = NULL;
 #endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
 
+  simd_support = 0;
+
+#if defined(__linux__)
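+  /* parse_proc_cpuinfo() fails if a line of /proc/cpuinfo does not fit in the
+   * buffer, so retry with a doubled buffer size up to a sane limit. */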
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
   simd_support |= JSIMD_MMI;
+#endif
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
@@ -73,6 +159,19 @@
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -150,6 +249,37 @@
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -311,6 +441,17 @@
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -327,17 +468,42 @@
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -345,12 +511,74 @@
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -397,6 +625,17 @@
 GLOBAL(int)
 jsimd_can_fdct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -415,6 +654,7 @@
 GLOBAL(void)
 jsimd_fdct_ifast(DCTELEM *data)
 {
+  jsimd_fdct_ifast_mmi(data);
 }
 
 GLOBAL(void)
@@ -537,6 +777,25 @@
 GLOBAL(int)
 jsimd_can_idct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -559,6 +818,7 @@
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
 }
 
 GLOBAL(void)
diff --git a/simd/loongson/jsimd_mmi.h b/simd/mips64/jsimd_mmi.h
similarity index 83%
rename from simd/loongson/jsimd_mmi.h
rename to simd/mips64/jsimd_mmi.h
index 2506aa8..a5ffc35 100644
--- a/simd/loongson/jsimd_mmi.h
+++ b/simd/mips64/jsimd_mmi.h
@@ -1,11 +1,12 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
  *           CaiWanwei   <caiwanwei@loongson.cn>
  *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -32,6 +33,13 @@
 
 
 /* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
 
 #define SIZEOF_MMWORD  8
 #define BYTE_BIT  8
@@ -47,10 +55,12 @@
    ((uint64_t)(uint8_t)f << 16) | \
    ((uint64_t)(uint8_t)g << 8)  | \
    ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
 #define _uint64_set_pi16(a, b, c, d)  (((uint64_t)(uint16_t)a << 48) | \
                                        ((uint64_t)(uint16_t)b << 32) | \
                                        ((uint64_t)(uint16_t)c << 16) | \
                                        ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
 #define _uint64_set_pi32(a, b)  (((uint64_t)(uint32_t)a << 32) | \
                                  ((uint64_t)(uint32_t)b))
 
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/mips64/loongson-mmintrin.h
similarity index 98%
rename from simd/loongson/loongson-mmintrin.h
rename to simd/mips64/loongson-mmintrin.h
index 50d166b..db9b35a 100644
--- a/simd/loongson/loongson-mmintrin.h
+++ b/simd/mips64/loongson-mmintrin.h
@@ -1217,14 +1217,24 @@
 extern __inline void FUNCTION_ATTRIBS
 _mm_store_si64(__m64 *dest, __m64 src)
 {
-  asm("gssdlc1 %1, 7+%0\n\t"
-      "gssdrc1 %1, %0\n\t"
+  asm("sdc1 %1, %0 \n\t"
       : "=m" (*dest)
       : "f" (src)
       : "memory"
      );
 }
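+
+/* The new _mm_storeu_si64() below is the unaligned counterpart of
+ * _mm_store_si64(): the gssdlc1/gssdrc1 pair stores the left/right parts of
+ * the doubleword, so the destination need not be 8-byte aligned. */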
 
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
 extern __inline __m64 FUNCTION_ATTRIBS
 _mm_load_si32(const __m32 *src)
 {
diff --git a/simd/nasm/jpeg_nbits_table.inc b/simd/nasm/jpeg_nbits_table.inc
deleted file mode 100644
index 2ce6c28..0000000
--- a/simd/nasm/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db \
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4, \
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-  <deleted table rows for values 8 and above elided here: each value v fills the 2^(v-1) consecutive entries at indices 2^(v-1) through 2^v - 1, continuing the pattern of the rows shown above>
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,