IJG R6b with x86SIMD V1.02
Independent JPEG Group's JPEG software release 6b
with x86 SIMD extension for IJG JPEG library version 1.02
diff --git a/aclocal.m4 b/aclocal.m4
new file mode 100644
index 0000000..54e986b
--- /dev/null
+++ b/aclocal.m4
@@ -0,0 +1,3655 @@
+# generated automatically by aclocal 1.8.5 -*- Autoconf -*-
+
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
+# Free Software Foundation, Inc.
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+# libtool.m4 - Configure libtool for the host system. -*-Shell-script-*-
+
+# serial 46 AC_PROG_LIBTOOL
+
+AC_DEFUN([AC_PROG_LIBTOOL],
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+
+# Path of ltmain.sh; this can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool, the script in the top build directory.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Prevent multiple expansion by redefining this macro as empty
+define([AC_PROG_LIBTOOL], [])
+])
+
+AC_DEFUN([AC_LIBTOOL_SETUP],
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([LT_AC_PROG_SED])dnl
+
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+_LT_AC_PROG_ECHO_BACKSLASH
+# Only perform the check for file if the file_magic check method requires it
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    AC_PATH_MAGIC
+  fi
+  ;;
+esac
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN], enable_dlopen=yes, enable_dlopen=no)
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+enable_win32_dll=yes, enable_win32_dll=no)
+
+AC_ARG_ENABLE(libtool-lock,
+  [  --disable-libtool-lock  avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+  # Find out which ABI we are using and pass the matching flag to the linker.
+  echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+  if AC_TRY_EVAL(ac_compile); then
+    case `/usr/bin/file conftest.$ac_objext` in
+    *32-bit*)
+      LD="${LD-ld} -32"
+      ;;
+    *N32*)
+      LD="${LD-ld} -n32"
+      ;;
+    *64-bit*)
+      LD="${LD-ld} -64"
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+    [AC_LANG_SAVE
+     AC_LANG_C
+     AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+     AC_LANG_RESTORE])
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+  AC_CHECK_TOOL(AS, as, false)
+  AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+  # recent cygwin and mingw systems supply a stub DllMain which the user
+  # can override, but on older systems we have to supply one
+  AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+    [AC_TRY_LINK([],
+      [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+      DllMain (0, 0, 0);],
+      [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+  case $host/$CC in
+  *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+    # old mingw systems require "-dll" to link a DLL, while more recent ones
+    # require "-mdll"
+    SAVE_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS -mdll"
+    AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+      [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+    CFLAGS="$SAVE_CFLAGS" ;;
+  *-*-cygwin* | *-*-pw32*)
+    # cygwin systems need to pass --dll to the linker, and not link
+    # crt.o which will require a WinMain@16 definition.
+    lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+  esac
+  ;;
+  ])
+esac
+
+_LT_AC_LTCONFIG_HACK
+
+])
+
+# AC_LIBTOOL_HEADER_ASSERT - check for assert.h if assert works without backlinking
+# ------------------------
+AC_DEFUN([AC_LIBTOOL_HEADER_ASSERT],
+[AC_CACHE_CHECK([whether $CC supports assert without backlinking],
+    [lt_cv_func_assert_works],
+    [case $host in
+    *-*-solaris*)
+      if test "$GCC" = yes && test "$with_gnu_ld" != yes; then
+        case `$CC --version 2>/dev/null` in
+        [[12]].*) lt_cv_func_assert_works=no ;;
+        *)        lt_cv_func_assert_works=yes ;;
+        esac
+      fi
+      ;;
+    esac])
+
+if test "x$lt_cv_func_assert_works" = xyes; then
+  AC_CHECK_HEADERS(assert.h)
+fi
+])# AC_LIBTOOL_HEADER_ASSERT
+
+# _LT_AC_CHECK_DLFCN - check whether the dlfcn.h header is available
+# ------------------
+AC_DEFUN([_LT_AC_CHECK_DLFCN],
+[AC_CHECK_HEADERS(dlfcn.h)
+])# _LT_AC_CHECK_DLFCN
+
+# AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE - find a sed pipe that parses $NM output
+# ---------------------------------
+AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE],
+[AC_REQUIRE([AC_CANONICAL_HOST])
+AC_REQUIRE([AC_PROG_NM])
+AC_REQUIRE([AC_OBJEXT])
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+AC_MSG_CHECKING([command to parse $NM output])
+AC_CACHE_VAL([lt_cv_sys_global_symbol_pipe], [dnl
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[[BCDEGRST]]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([[_A-Za-z]][[_A-Za-z0-9]]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+lt_cv_global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Transform an extracted symbol line into symbol name and symbol address
+lt_cv_global_symbol_to_c_name_address="sed -n -e 's/^: \([[^ ]]*\) $/  {\\\"\1\\\", (lt_ptr) 0},/p' -e 's/^$symcode \([[^ ]]*\) \([[^ ]]*\)$/  {\"\2\", (lt_ptr) \&\2},/p'"
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+  symcode='[[BCDT]]'
+  ;;
+cygwin* | mingw* | pw32*)
+  symcode='[[ABCDGISTW]]'
+  ;;
+hpux*) # Its linker distinguishes data from code symbols
+  lt_cv_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+  lt_cv_global_symbol_to_c_name_address="sed -n -e 's/^: \([[^ ]]*\) $/  {\\\"\1\\\", (lt_ptr) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"\2\", (lt_ptr) \&\2},/p'"
+  ;;
+irix* | nonstopux*)
+  symcode='[[BCDEGRST]]'
+  ;;
+osf*)
+  symcode='[[BCDEGQRST]]'
+  ;;
+solaris* | sysv5*)
+  symcode='[[BDT]]'
+  ;;
+sysv4)
+  symcode='[[DFNSTU]]'
+  ;;
+esac
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $host_os in
+mingw*)
+  opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+  ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+  symcode='[[ABCDGISTW]]'
+fi
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+  # Write the raw and C identifiers.
+lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[[ 	]]\($symcode$symcode*\)[[ 	]][[ 	]]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+  # Check to see that the pipe works correctly.
+  pipe_works=no
+  rm -f conftest*
+  cat > conftest.$ac_ext <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+  if AC_TRY_EVAL(ac_compile); then
+    # Now try to grab the symbols.
+    nlist=conftest.nm
+    if AC_TRY_EVAL(NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \> $nlist) && test -s "$nlist"; then
+      # Try sorting and uniquifying the output.
+      if sort "$nlist" | uniq > "$nlist"T; then
+	mv -f "$nlist"T "$nlist"
+      else
+	rm -f "$nlist"T
+      fi
+
+      # Make sure that we snagged all the symbols we need.
+      if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+	if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+	  cat <<EOF > conftest.$ac_ext
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+	  # Now generate the symbol file.
+	  eval "$lt_cv_global_symbol_to_cdecl"' < "$nlist" >> conftest.$ac_ext'
+
+	  cat <<EOF >> conftest.$ac_ext
+#if defined (__STDC__) && __STDC__
+# define lt_ptr void *
+#else
+# define lt_ptr char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+  const char *name;
+  lt_ptr address;
+}
+lt_preloaded_symbols[[]] =
+{
+EOF
+	  sed "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (lt_ptr) \&\2},/" < "$nlist" >> conftest.$ac_ext
+	  cat <<\EOF >> conftest.$ac_ext
+  {0, (lt_ptr) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+	  # Now try linking the two files.
+	  mv conftest.$ac_objext conftstm.$ac_objext
+	  save_LIBS="$LIBS"
+	  save_CFLAGS="$CFLAGS"
+	  LIBS="conftstm.$ac_objext"
+	  CFLAGS="$CFLAGS$no_builtin_flag"
+	  if AC_TRY_EVAL(ac_link) && test -s conftest$ac_exeext; then
+	    pipe_works=yes
+	  fi
+	  LIBS="$save_LIBS"
+	  CFLAGS="$save_CFLAGS"
+	else
+	  echo "cannot find nm_test_func in $nlist" >&AC_FD_CC
+	fi
+      else
+	echo "cannot find nm_test_var in $nlist" >&AC_FD_CC
+      fi
+    else
+      echo "cannot run $lt_cv_sys_global_symbol_pipe" >&AC_FD_CC
+    fi
+  else
+    echo "$progname: failed program was:" >&AC_FD_CC
+    cat conftest.$ac_ext >&AC_FD_CC
+  fi
+  rm -f conftest* conftst*
+
+  # Do not use the global_symbol_pipe unless it works.
+  if test "$pipe_works" = yes; then
+    break
+  else
+    lt_cv_sys_global_symbol_pipe=
+  fi
+done
+])
+global_symbol_pipe="$lt_cv_sys_global_symbol_pipe"
+if test -z "$lt_cv_sys_global_symbol_pipe"; then
+  global_symbol_to_cdecl=
+  global_symbol_to_c_name_address=
+else
+  global_symbol_to_cdecl="$lt_cv_global_symbol_to_cdecl"
+  global_symbol_to_c_name_address="$lt_cv_global_symbol_to_c_name_address"
+fi
+if test -z "$global_symbol_pipe$global_symbol_to_cdecl$global_symbol_to_c_name_address";
+then
+  AC_MSG_RESULT(failed)
+else
+  AC_MSG_RESULT(ok)
+fi
+]) # AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE
+
+# _LT_AC_LIBTOOL_SYS_PATH_SEPARATOR - set PATH_SEPARATOR unless already set
+# ---------------------------------
+AC_DEFUN([_LT_AC_LIBTOOL_SYS_PATH_SEPARATOR],
+[# Find the correct PATH separator from the uname output.  Usually this
+# is `:', but DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+  UNAME=${UNAME-`uname 2>/dev/null`}
+  case X$UNAME in
+    *-DOS) lt_cv_sys_path_separator=';' ;;
+    *)     lt_cv_sys_path_separator=':' ;;
+  esac
+  PATH_SEPARATOR=$lt_cv_sys_path_separator
+fi
+])# _LT_AC_LIBTOOL_SYS_PATH_SEPARATOR
+
+# _LT_AC_PROG_ECHO_BACKSLASH
+# --------------------------
+# Add some code to the start of the generated configure script which
+# will find an echo command which doesn't interpret backslashes.
+AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH],
+[ifdef([AC_DIVERSION_NOTICE], [AC_DIVERT_PUSH(AC_DIVERSION_NOTICE)],
+			      [AC_DIVERT_PUSH(NOTICE)])
+_LT_AC_LIBTOOL_SYS_PATH_SEPARATOR
+
+# Check that we are running under the correct shell.
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+case X$ECHO in
+X*--fallback-echo)
+  # Remove one level of quotation (which was required for Make).
+  ECHO=`echo "$ECHO" | sed 's,\\\\\[$]\\[$]0,'[$]0','`
+  ;;
+esac
+
+echo=${ECHO-echo}
+if test "X[$]1" = X--no-reexec; then
+  # Discard the --no-reexec flag, and continue.
+  shift
+elif test "X[$]1" = X--fallback-echo; then
+  # Avoid a here-document at this point; it may be left over
+  :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+  # Yippee, $echo works!
+  :
+else
+  # Restart under the correct shell.
+  exec $SHELL "[$]0" --no-reexec ${1+"[$]@"}
+fi
+
+if test "X[$]1" = X--fallback-echo; then
+  # Used as the fallback echo: print the remaining arguments and exit.
+  shift
+  cat <<EOF
+$*
+EOF
+  exit 0
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+if test -z "$ECHO"; then
+if test "X${echo_test_string+set}" != Xset; then
+# Find a string as large as possible, as long as the shell can cope with it.
+  for cmd in 'sed 50q "[$]0"' 'sed 20q "[$]0"' 'sed 10q "[$]0"' 'sed 2q "[$]0"' 'echo test'; do
+    # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+    if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+       echo_test_string="`eval $cmd`" &&
+       (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null
+    then
+      break
+    fi
+  done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+   echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+   test "X$echo_testing_string" = "X$echo_test_string"; then
+  :
+else
+  # The Solaris, AIX, and Digital Unix default echo programs unquote
+  # backslashes.  This makes it impossible to quote backslashes using
+  #   echo "$something" | sed 's/\\/\\\\/g'
+  #
+  # So, first we look for a working echo in the user's PATH.
+
+  IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for dir in $PATH /usr/ucb; do
+    if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+       test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      echo="$dir/echo"
+      break
+    fi
+  done
+  IFS="$save_ifs"
+
+  if test "X$echo" = Xecho; then
+    # We didn't find a better echo, so look for alternatives.
+    if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      # This shell has a builtin print -r that does the trick.
+      echo='print -r'
+    elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+	 test "X$CONFIG_SHELL" != X/bin/ksh; then
+      # If we have ksh, try running configure again with it.
+      ORIGINAL_CONFIG_SHELL=${CONFIG_SHELL-/bin/sh}
+      export ORIGINAL_CONFIG_SHELL
+      CONFIG_SHELL=/bin/ksh
+      export CONFIG_SHELL
+      exec $CONFIG_SHELL "[$]0" --no-reexec ${1+"[$]@"}
+    else
+      # Try using printf.
+      echo='printf %s\n'
+      if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+	 echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+	 test "X$echo_testing_string" = "X$echo_test_string"; then
+	# Cool, printf works
+	:
+      elif echo_testing_string=`($ORIGINAL_CONFIG_SHELL "[$]0" --fallback-echo '\t') 2>/dev/null` &&
+	   test "X$echo_testing_string" = 'X\t' &&
+	   echo_testing_string=`($ORIGINAL_CONFIG_SHELL "[$]0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+	   test "X$echo_testing_string" = "X$echo_test_string"; then
+	CONFIG_SHELL=$ORIGINAL_CONFIG_SHELL
+	export CONFIG_SHELL
+	SHELL="$CONFIG_SHELL"
+	export SHELL
+	echo="$CONFIG_SHELL [$]0 --fallback-echo"
+      elif echo_testing_string=`($CONFIG_SHELL "[$]0" --fallback-echo '\t') 2>/dev/null` &&
+	   test "X$echo_testing_string" = 'X\t' &&
+	   echo_testing_string=`($CONFIG_SHELL "[$]0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+	   test "X$echo_testing_string" = "X$echo_test_string"; then
+	echo="$CONFIG_SHELL [$]0 --fallback-echo"
+      else
+	# maybe with a smaller string...
+	prev=:
+
+	for cmd in 'echo test' 'sed 2q "[$]0"' 'sed 10q "[$]0"' 'sed 20q "[$]0"' 'sed 50q "[$]0"'; do
+	  if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null
+	  then
+	    break
+	  fi
+	  prev="$cmd"
+	done
+
+	if test "$prev" != 'sed 50q "[$]0"'; then
+	  echo_test_string=`eval $prev`
+	  export echo_test_string
+	  exec ${ORIGINAL_CONFIG_SHELL-${CONFIG_SHELL-/bin/sh}} "[$]0" ${1+"[$]@"}
+	else
+	  # Oops.  We lost completely, so just stick with echo.
+	  echo=echo
+	fi
+      fi
+    fi
+  fi
+fi
+fi
+
+# Copy echo and quote the copy suitably for passing to libtool from
+# the Makefile, instead of quoting the original, which is used later.
+ECHO=$echo
+if test "X$ECHO" = "X$CONFIG_SHELL [$]0 --fallback-echo"; then
+   ECHO="$CONFIG_SHELL \\\$\[$]0 --fallback-echo"
+fi
+
+AC_SUBST(ECHO)
+AC_DIVERT_POP
+])# _LT_AC_PROG_ECHO_BACKSLASH
+
+# _LT_AC_TRY_DLOPEN_SELF (ACTION-IF-TRUE, ACTION-IF-TRUE-W-USCORE,
+#                           ACTION-IF-FALSE, ACTION-IF-CROSS-COMPILING)
+# ------------------------------------------------------------------
+AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF],
+[if test "$cross_compiling" = yes; then :
+  [$4]
+else
+  AC_REQUIRE([_LT_AC_CHECK_DLFCN])dnl
+  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<EOF
+[#line __oline__ "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+#ifdef __cplusplus
+extern "C" void exit (int);
+#endif
+
+void fnord() { int i=42;}
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
+      else if (dlsym( self,"_fnord")) status = $lt_dlneed_uscore;
+      /* dlclose (self); */
+    }
+
+  return status;
+}]
+EOF
+  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) $1 ;;
+      x$lt_dlneed_uscore) $2 ;;
+      x$lt_dlunknown|x*) $3 ;;
+    esac
+  else :
+    # compiling or linking the test program failed
+    $3
+  fi
+fi
+rm -fr conftest*
+])# _LT_AC_TRY_DLOPEN_SELF
+
+# AC_LIBTOOL_DLOPEN_SELF - detect how to dlopen and whether self-dlopen works
+# ----------------------
+AC_DEFUN([AC_LIBTOOL_DLOPEN_SELF],
+[if test "x$enable_dlopen" != xyes; then
+  enable_dlopen=unknown
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+else
+  lt_cv_dlopen=no
+  lt_cv_dlopen_libs=
+
+  case $host_os in
+  beos*)
+    lt_cv_dlopen="load_add_on"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    lt_cv_dlopen="LoadLibrary"
+    lt_cv_dlopen_libs=
+   ;;
+
+  *)
+    AC_CHECK_FUNC([shl_load],
+          [lt_cv_dlopen="shl_load"],
+      [AC_CHECK_LIB([dld], [shl_load],
+            [lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"],
+	[AC_CHECK_FUNC([dlopen],
+	      [lt_cv_dlopen="dlopen"],
+	  [AC_CHECK_LIB([dl], [dlopen],
+	        [lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],
+	    [AC_CHECK_LIB([svld], [dlopen],
+	          [lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"],
+	      [AC_CHECK_LIB([dld], [dld_link],
+	            [lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"])
+	      ])
+	    ])
+	  ])
+	])
+      ])
+    ;;
+  esac
+
+  if test "x$lt_cv_dlopen" != xno; then
+    enable_dlopen=yes
+  else
+    enable_dlopen=no
+  fi
+
+  case $lt_cv_dlopen in
+  dlopen)
+    save_CPPFLAGS="$CPPFLAGS"
+    AC_REQUIRE([_LT_AC_CHECK_DLFCN])dnl
+    test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+
+    save_LDFLAGS="$LDFLAGS"
+    eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+
+    save_LIBS="$LIBS"
+    LIBS="$lt_cv_dlopen_libs $LIBS"
+
+    AC_CACHE_CHECK([whether a program can dlopen itself],
+	  lt_cv_dlopen_self, [dnl
+	  _LT_AC_TRY_DLOPEN_SELF(
+	    lt_cv_dlopen_self=yes, lt_cv_dlopen_self=yes,
+	    lt_cv_dlopen_self=no, lt_cv_dlopen_self=cross)
+    ])
+
+    if test "x$lt_cv_dlopen_self" = xyes; then
+      LDFLAGS="$LDFLAGS $link_static_flag"
+      AC_CACHE_CHECK([whether a statically linked program can dlopen itself],
+    	  lt_cv_dlopen_self_static, [dnl
+	  _LT_AC_TRY_DLOPEN_SELF(
+	    lt_cv_dlopen_self_static=yes, lt_cv_dlopen_self_static=yes,
+	    lt_cv_dlopen_self_static=no,  lt_cv_dlopen_self_static=cross)
+      ])
+    fi
+
+    CPPFLAGS="$save_CPPFLAGS"
+    LDFLAGS="$save_LDFLAGS"
+    LIBS="$save_LIBS"
+    ;;
+  esac
+
+  case $lt_cv_dlopen_self in
+  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+  *) enable_dlopen_self=unknown ;;
+  esac
+
+  case $lt_cv_dlopen_self_static in
+  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+  *) enable_dlopen_self_static=unknown ;;
+  esac
+fi
+])# AC_LIBTOOL_DLOPEN_SELF
+
+AC_DEFUN([_LT_AC_LTCONFIG_HACK],
+[AC_REQUIRE([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])dnl
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([[\\"\\`$\\\\]]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([[\\"\\`\\\\]]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# Constants:
+rm="rm -f"
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+libext=a
+ltmain="$ac_aux_dir/ltmain.sh"
+ofile="$default_ofile"
+with_gnu_ld="$lt_cv_prog_gnu_ld"
+need_locks="$enable_libtool_lock"
+
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AR_FLAGS" && AR_FLAGS=cru
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$ac_objext" && ac_objext=o
+
+if test x"$host" != x"$build"; then
+  ac_tool_prefix=${host_alias}-
+else
+  ac_tool_prefix=
+fi
+
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
+case $host_os in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
+esac
+
+case $host_os in
+aix3*)
+  # AIX sometimes has problems with the GCC collect2 program.  For some
+  # reason, if we set the COLLECT_NAMES environment variable, the problems
+  # vanish in a puff of smoke.
+  if test "X${COLLECT_NAMES+set}" != Xset; then
+    COLLECT_NAMES=
+    export COLLECT_NAMES
+  fi
+  ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+  case $host_os in
+  openbsd*)
+    old_postinstall_cmds="\$RANLIB -t \$oldlib~$old_postinstall_cmds"
+    ;;
+  *)
+    old_postinstall_cmds="\$RANLIB \$oldlib~$old_postinstall_cmds"
+    ;;
+  esac
+  old_archive_cmds="$old_archive_cmds~\$RANLIB \$oldlib"
+fi
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="[$]2"
+
+AC_MSG_CHECKING([for objdir])
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+  objdir=.libs
+else
+  # MS-DOS does not allow filenames that begin with a dot.
+  objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+AC_MSG_RESULT($objdir)
+
+
+AC_ARG_WITH(pic,
+[  --with-pic              try to use only PIC/non-PIC objects [default=use both]],
+pic_mode="$withval", pic_mode=default)
+test -z "$pic_mode" && pic_mode=default
+
+# We assume here that the value for lt_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+AC_MSG_CHECKING([for $compiler option to produce PIC])
+AC_CACHE_VAL(lt_cv_prog_cc_pic,
+[ lt_cv_prog_cc_pic=
+  lt_cv_prog_cc_shlib=
+  lt_cv_prog_cc_wl=
+  lt_cv_prog_cc_static=
+  lt_cv_prog_cc_no_builtin=
+  lt_cv_prog_cc_can_build_shared=$can_build_shared
+
+  if test "$GCC" = yes; then
+    lt_cv_prog_cc_wl='-Wl,'
+    lt_cv_prog_cc_static='-static'
+
+    case $host_os in
+    aix*)
+      # Below is a dirty hack to force normal static linking with -ldl.
+      # The problem is that libdl is dynamically linked with both libc and
+      # libC (the AIX C++ library), which gcc does not include in its list
+      # of libraries.  This causes undefined symbols with the -static flag.
+      # This hack allows C programs to be linked with "-static -ldl", but
+      # it is not known whether it works for C++ programs.
+      lt_cv_prog_cc_static="$lt_cv_prog_cc_static ${lt_cv_prog_cc_wl}-lC"
+      ;;
+    amigaos*)
+      # FIXME: we need at least 68020 code to build shared libraries, but
+      # adding the `-m68020' flag to GCC prevents building anything better,
+      # like `-m68040'.
+      lt_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+      ;;
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_cv_prog_cc_pic='-fno-common'
+      ;;
+    cygwin* | mingw* | pw32* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	 lt_cv_prog_cc_pic=-Kconform_pic
+      fi
+      ;;
+    *)
+      lt_cv_prog_cc_pic='-fPIC'
+      ;;
+    esac
+  else
+    # PORTME Check for PIC flags for the system compiler.
+    case $host_os in
+    aix3* | aix4* | aix5*)
+      lt_cv_prog_cc_wl='-Wl,'
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_cv_prog_cc_static='-Bstatic'
+      else
+	lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      # Is there a better lt_cv_prog_cc_static that works with the bundled CC?
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static="${lt_cv_prog_cc_wl}-a ${lt_cv_prog_cc_wl}archive"
+      lt_cv_prog_cc_pic='+Z'
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static='-non_shared'
+      # PIC (with -KPIC) is the default.
+      ;;
+
+    cygwin* | mingw* | pw32* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+
+    newsos6)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      # All OSF/1 code is PIC.
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static='-non_shared'
+      ;;
+
+    sco3.2v5*)
+      lt_cv_prog_cc_pic='-Kpic'
+      lt_cv_prog_cc_static='-dn'
+      lt_cv_prog_cc_shlib='-belf'
+      ;;
+
+    solaris*)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    sunos4*)
+      lt_cv_prog_cc_pic='-PIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Qoption ld '
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    uts4*)
+      lt_cv_prog_cc_pic='-pic'
+      lt_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+	lt_cv_prog_cc_pic='-Kconform_pic'
+	lt_cv_prog_cc_static='-Bstatic'
+      fi
+      ;;
+
+    *)
+      lt_cv_prog_cc_can_build_shared=no
+      ;;
+    esac
+  fi
+])
+if test -z "$lt_cv_prog_cc_pic"; then
+  AC_MSG_RESULT([none])
+else
+  AC_MSG_RESULT([$lt_cv_prog_cc_pic])
+
+  # Check to make sure the pic_flag actually works.
+  AC_MSG_CHECKING([if $compiler PIC flag $lt_cv_prog_cc_pic works])
+  AC_CACHE_VAL(lt_cv_prog_cc_pic_works, [dnl
+    save_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS $lt_cv_prog_cc_pic -DPIC"
+    AC_TRY_COMPILE([], [], [dnl
+      case $host_os in
+      hpux9* | hpux10* | hpux11*)
+	# On HP-UX, both CC and GCC only warn that PIC is supported... then
+	# they create non-PIC objects.  So, if there were any warnings, we
+	# assume that PIC is not supported.
+	if test -s conftest.err; then
+	  lt_cv_prog_cc_pic_works=no
+	else
+	  lt_cv_prog_cc_pic_works=yes
+	fi
+	;;
+      *)
+	lt_cv_prog_cc_pic_works=yes
+	;;
+      esac
+    ], [dnl
+      lt_cv_prog_cc_pic_works=no
+    ])
+    CFLAGS="$save_CFLAGS"
+  ])
+
+  if test "X$lt_cv_prog_cc_pic_works" = Xno; then
+    lt_cv_prog_cc_pic=
+    lt_cv_prog_cc_can_build_shared=no
+  else
+    lt_cv_prog_cc_pic=" $lt_cv_prog_cc_pic"
+  fi
+
+  AC_MSG_RESULT([$lt_cv_prog_cc_pic_works])
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$lt_cv_prog_cc_shlib"; then
+  AC_MSG_WARN([\`$CC' requires \`$lt_cv_prog_cc_shlib' to build shared libraries])
+  if echo "$old_CC $old_CFLAGS " | egrep -e "[[ 	]]$lt_cv_prog_cc_shlib[[ 	]]" >/dev/null; then :
+  else
+   AC_MSG_WARN([add \`$lt_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure])
+    lt_cv_prog_cc_can_build_shared=no
+  fi
+fi
+
+AC_MSG_CHECKING([if $compiler static flag $lt_cv_prog_cc_static works])
+AC_CACHE_VAL([lt_cv_prog_cc_static_works], [dnl
+  lt_cv_prog_cc_static_works=no
+  save_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $lt_cv_prog_cc_static"
+  AC_TRY_LINK([], [], [lt_cv_prog_cc_static_works=yes])
+  LDFLAGS="$save_LDFLAGS"
+])
+
+# Belt *and* braces to stop my trousers falling down:
+test "X$lt_cv_prog_cc_static_works" = Xno && lt_cv_prog_cc_static=
+AC_MSG_RESULT([$lt_cv_prog_cc_static_works])
+
+pic_flag="$lt_cv_prog_cc_pic"
+special_shlib_compile_flags="$lt_cv_prog_cc_shlib"
+wl="$lt_cv_prog_cc_wl"
+link_static_flag="$lt_cv_prog_cc_static"
+no_builtin_flag="$lt_cv_prog_cc_no_builtin"
+can_build_shared="$lt_cv_prog_cc_can_build_shared"
+
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+AC_MSG_CHECKING([if $compiler supports -c -o file.$ac_objext])
+AC_CACHE_VAL([lt_cv_compiler_c_o], [
+$rm -r conftest 2>/dev/null
+mkdir conftest
+cd conftest
+echo "int some_variable = 0;" > conftest.$ac_ext
+mkdir out
+# According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+# that will create temporary files in the current directory regardless of
+# the output directory.  Thus, making CWD read-only will cause this test
+# to fail, enabling locking or at least warning the user not to do parallel
+# builds.
+chmod -w .
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
+compiler_c_o=no
+if { (eval echo configure:__oline__: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
+  # The compiler can only warn and ignore the option if not recognized
+  # So say no if there are warnings
+  if test -s out/conftest.err; then
+    lt_cv_compiler_c_o=no
+  else
+    lt_cv_compiler_c_o=yes
+  fi
+else
+  # Append any errors to the config.log.
+  cat out/conftest.err 1>&AC_FD_CC
+  lt_cv_compiler_c_o=no
+fi
+CFLAGS="$save_CFLAGS"
+chmod u+w .
+$rm conftest* out/*
+rmdir out
+cd ..
+rmdir conftest
+$rm -r conftest 2>/dev/null
+])
+compiler_c_o=$lt_cv_compiler_c_o
+AC_MSG_RESULT([$compiler_c_o])
+
+if test x"$compiler_c_o" = x"yes"; then
+  # Check to see if we can write to a .lo
+  AC_MSG_CHECKING([if $compiler supports -c -o file.lo])
+  AC_CACHE_VAL([lt_cv_compiler_o_lo], [
+  lt_cv_compiler_o_lo=no
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -c -o conftest.lo"
+  save_objext="$ac_objext"
+  ac_objext=lo
+  AC_TRY_COMPILE([], [int some_variable = 0;], [dnl
+    # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+    if test -s conftest.err; then
+      lt_cv_compiler_o_lo=no
+    else
+      lt_cv_compiler_o_lo=yes
+    fi
+  ])
+  ac_objext="$save_objext"
+  CFLAGS="$save_CFLAGS"
+  ])
+  compiler_o_lo=$lt_cv_compiler_o_lo
+  AC_MSG_RESULT([$compiler_o_lo])
+else
+  compiler_o_lo=no
+fi
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  AC_MSG_CHECKING([if we can lock with hard links])
+  hard_links=yes
+  $rm conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  AC_MSG_RESULT([$hard_links])
+  if test "$hard_links" = no; then
+    AC_MSG_WARN([\`$CC' does not support \`-c -o', so \`make -j' may be unsafe])
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+if test "$GCC" = yes; then
+  # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+  AC_MSG_CHECKING([if $compiler supports -fno-rtti -fno-exceptions])
+  echo "int some_variable = 0;" > conftest.$ac_ext
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.$ac_ext"
+  compiler_rtti_exceptions=no
+  AC_TRY_COMPILE([], [int some_variable = 0;], [dnl
+    # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+    if test -s conftest.err; then
+      compiler_rtti_exceptions=no
+    else
+      compiler_rtti_exceptions=yes
+    fi
+  ])
+  CFLAGS="$save_CFLAGS"
+  AC_MSG_RESULT([$compiler_rtti_exceptions])
+
+  if test "$compiler_rtti_exceptions" = "yes"; then
+    no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+  else
+    no_builtin_flag=' -fno-builtin'
+  fi
+fi
+
+# See if the linker supports building shared libraries.
+AC_MSG_CHECKING([whether the linker ($LD) supports shared libraries])
+
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude
+# it will be wrapped by ` (' and `)$', so one must not match beginning or
+# end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid C symbol name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced.  Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+case $host_os in
+cygwin* | mingw* | pw32*)
+  # FIXME: the MSVC++ port hasn't been tested in a loooong time
+  # When not using gcc, we currently assume that we are using
+  # Microsoft Visual C++.
+  if test "$GCC" != yes; then
+    with_gnu_ld=no
+  fi
+  ;;
+openbsd*)
+  with_gnu_ld=no
+  ;;
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+  # If archive_cmds runs LD, not CC, wlarc should be empty
+  wlarc='${wl}'
+
+  # See if GNU ld supports shared libraries.
+  case $host_os in
+  aix3* | aix4* | aix5*)
+    # On AIX, the GNU linker is very broken
+    # Note: check the GNU linker on AIX 5-IA64 when/if it becomes available.
+    ld_shlibs=no
+    cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+
+    # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+    # that the semantics of dynamic libraries on AmigaOS, at least up
+    # to version 4, is to share data among multiple programs linked
+    # with the same dynamic library.  Since this doesn't match the
+    # behavior of shared libraries on other platforms, we cannot use
+    # them.
+    ld_shlibs=no
+    ;;
+
+  beos*)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      allow_undefined_flag=unsupported
+      # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+      # support --undefined.  This deserves some investigation.  FIXME
+      archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec='-L$libdir'
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+
+    extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+      sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //;s/^# *$//; p; }" -e d < $''0 > $output_objdir/impgen.c~
+      test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+      if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+      else $CC -o impgen impgen.c ; fi)~
+      $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+    old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+    # cygwin and mingw dlls have different entry points and sets of symbols
+    # to exclude.
+    # FIXME: what about values for MSVC?
+    dll_entry=__cygwin_dll_entry@12
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+    case $host_os in
+    mingw*)
+      # mingw values
+      dll_entry=_DllMainCRTStartup@12
+      dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+      ;;
+    esac
+
+    # mingw and cygwin differ, and it's simplest to just exclude the union
+    # of the two symbol sets.
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+    # recent cygwin and mingw systems supply a stub DllMain which the user
+    # can override, but on older systems we have to supply one (in ltdll.c)
+    if test "x$lt_cv_need_dllmain" = "xyes"; then
+      ltdll_obj='$output_objdir/$soname-ltdll.'"$ac_objext "
+      ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $''0 > $output_objdir/$soname-ltdll.c~
+	test -f $output_objdir/$soname-ltdll.$ac_objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+    else
+      ltdll_obj=
+      ltdll_cmds=
+    fi
+
+    # Extract the symbol export list from an `--export-all' def file,
+    # then regenerate the def file from the symbol export list, so that
+    # the compiled dll only exports the symbol export list.
+    # Be careful not to strip the DATA tag left by newer dlltools.
+    export_symbols_cmds="$ltdll_cmds"'
+      $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+      sed -e "1,/EXPORTS/d" -e "s/ @ [[0-9]]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+    # If the export-symbols file already is a .def file (1st line
+    # is EXPORTS), use it as is.
+    # If DATA tags from a recent dlltool are present, honour them!
+    archive_expsym_cmds='if test "x`sed 1q $export_symbols`" = xEXPORTS; then
+	cp $export_symbols $output_objdir/$soname-def;
+      else
+	echo EXPORTS > $output_objdir/$soname-def;
+	_lt_hint=1;
+	cat $export_symbols | while read symbol; do
+	 set dummy \$symbol;
+	 case \[$]# in
+	   2) echo "   \[$]2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+	   4) echo "   \[$]2 \[$]3 \[$]4 ; " >> $output_objdir/$soname-def; _lt_hint=`expr \$_lt_hint - 1`;;
+	   *) echo "     \[$]2 @ \$_lt_hint \[$]3 ; " >> $output_objdir/$soname-def;;
+	 esac;
+	 _lt_hint=`expr 1 + \$_lt_hint`;
+	done;
+      fi~
+      '"$ltdll_cmds"'
+      $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+      $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+      $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+      wlarc=
+    else
+      archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    fi
+    ;;
+
+  solaris* | sysv5*)
+    if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+      ld_shlibs=no
+      cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+    elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    wlarc=
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+  esac
+
+  if test "$ld_shlibs" = yes; then
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+    export_dynamic_flag_spec='${wl}--export-dynamic'
+    case $host_os in
+    cygwin* | mingw* | pw32*)
+      # dlltool doesn't understand --whole-archive et al.
+      whole_archive_flag_spec=
+      ;;
+    *)
+      # ancient GNU ld didn't support --whole-archive et al.
+      if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+	whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+      else
+	whole_archive_flag_spec=
+      fi
+      ;;
+    esac
+  fi
+else
+  # PORTME fill in a description of your system's linker (not GNU ld)
+  case $host_os in
+  aix3*)
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+    archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+    # Note: this linker hardcodes the directories in LIBPATH if there
+    # are no directories specified by -L.
+    hardcode_minus_L=yes
+    if test "$GCC" = yes && test -z "$link_static_flag"; then
+      # Neither direct hardcoding nor static linking is supported with a
+      # broken collect2.
+      hardcode_direct=unsupported
+    fi
+    ;;
+
+  aix4* | aix5*)
+    if test "$host_cpu" = ia64; then
+      # On IA64, the linker does run time linking by default, so we don't
+      # have to do anything special.
+      aix_use_runtimelinking=no
+      exp_sym_flag='-Bexport'
+      no_entry_flag=""
+    else
+      aix_use_runtimelinking=no
+
+      # Test if we are trying to use run time linking or normal
+      # AIX style linking. If -brtl is somewhere in LDFLAGS, we
+      # need to do runtime linking.
+      case $host_os in aix4.[[23]]|aix4.[[23]].*|aix5*)
+	for ld_flag in $LDFLAGS; do
+	  case $ld_flag in
+	  *-brtl*)
+	    aix_use_runtimelinking=yes
+	    break
+	  ;;
+	  esac
+	done
+      esac
+
+      exp_sym_flag='-bexport'
+      no_entry_flag='-bnoentry'
+    fi
+
+    # When large executables or shared objects are built, AIX ld can
+    # have problems creating the table of contents.  If linking a library
+    # or program results in "error TOC overflow" add -mminimal-toc to
+    # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+    # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+    hardcode_direct=yes
+    archive_cmds=''
+    hardcode_libdir_separator=':'
+    if test "$GCC" = yes; then
+      case $host_os in aix4.[[012]]|aix4.[[012]].*)
+	collect2name=`${CC} -print-prog-name=collect2`
+	if test -f "$collect2name" && \
+	  strings "$collect2name" | grep resolve_lib_name >/dev/null
+	then
+	  # We have reworked collect2
+	  hardcode_direct=yes
+	else
+	  # We have old collect2
+	  hardcode_direct=unsupported
+	  # It fails to find uninstalled libraries when the uninstalled
+	  # path is not listed in the libpath.  Setting hardcode_minus_L to
+	  # yes (with hardcode_direct=unsupported above) forces relinking
+	  hardcode_minus_L=yes
+	  hardcode_libdir_flag_spec='-L$libdir'
+	  hardcode_libdir_separator=
+	fi
+      esac
+
+      shared_flag='-shared'
+    else
+      # not using gcc
+      if test "$host_cpu" = ia64; then
+	shared_flag='${wl}-G'
+      else
+	if test "$aix_use_runtimelinking" = yes; then
+	  shared_flag='${wl}-G'
+	else
+	  shared_flag='${wl}-bM:SRE'
+	fi
+      fi
+    fi
+
+    # It seems that -bexpall can do strange things, so it is better to
+    # generate a list of symbols to export.
+    always_export_symbols=yes
+    if test "$aix_use_runtimelinking" = yes; then
+      # Warning - without using the other runtime loading flags (-brtl),
+      # -berok will link without error, but may produce a broken library.
+      allow_undefined_flag='-berok'
+      hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+      archive_expsym_cmds="\$CC"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then echo "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+    else
+      if test "$host_cpu" = ia64; then
+	hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+	allow_undefined_flag="-z nodefs"
+	archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname ${wl}-h$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+      else
+	hardcode_libdir_flag_spec='${wl}-bnolibpath ${wl}-blibpath:$libdir:/usr/lib:/lib'
+	# Warning - without using the other run time loading flags,
+	# -berok will link without error, but may produce a broken library.
+	allow_undefined_flag='${wl}-berok'
+	# This is a bit strange, but is similar to how AIX traditionally builds
+	# its shared libraries.
+	archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"' ~$AR -crlo $objdir/$libname$release.a $objdir/$soname'
+      fi
+    fi
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    # see comment about different semantics on the GNU ld section
+    ld_shlibs=no
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec=' '
+    allow_undefined_flag=unsupported
+    # Tell ltmain to make .lib files, not .a files.
+    libext=lib
+    # FIXME: Setting linknames here is a bad hack.
+    archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+    # The linker will automatically build a .lib file if we build a DLL.
+    old_archive_from_new_cmds='true'
+    # FIXME: Should let the user specify the lib program.
+    old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+    fix_srcfile_path='`cygpath -w "$srcfile"`'
+    ;;
+
+  darwin* | rhapsody*)
+    case "$host_os" in
+    rhapsody* | darwin1.[[012]])
+      allow_undefined_flag='-undefined suppress'
+      ;;
+    *) # Darwin 1.3 on
+      allow_undefined_flag='-flat_namespace -undefined suppress'
+      ;;
+    esac
+    # FIXME: Relying on posixy $() will cause problems for
+    #        cross-compilation, but unfortunately the echo tests do not
+    #        yet detect zsh echo's removal of \ escapes.  Also zsh mangles
+    #	     `"' quotes if we put them in here... so don't!
+    archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)'
+    # We need to add '_' to the symbols in $export_symbols first
+    #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    whole_archive_flag_spec='-all_load $convenience'
+    ;;
+
+  freebsd1*)
+    ld_shlibs=no
+    ;;
+
+  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+  # support.  Future versions do this automatically, but an explicit c++rt0.o
+  # does not break anything, and helps significantly (at the cost of a little
+  # extra space).
+  freebsd2.2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+  freebsd2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+  freebsd*)
+    archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  hpux9* | hpux10* | hpux11*)
+    case $host_os in
+    hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+    *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+    esac
+    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_direct=yes
+    hardcode_minus_L=yes # Not in the search PATH, but as the default
+			 # location of the library.
+    export_dynamic_flag_spec='${wl}-E'
+    ;;
+
+  irix5* | irix6* | nonstopux*)
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    link_all_deplibs=yes
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+    else
+      archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+    fi
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  newsos6)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_shlibpath_var=no
+    ;;
+
+  openbsd*)
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+      archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+      hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+      export_dynamic_flag_spec='${wl}-E'
+    else
+      case "$host_os" in
+      openbsd[[01]].* | openbsd2.[[0-7]] | openbsd2.[[0-7]].*)
+	archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+	hardcode_libdir_flag_spec='-R$libdir'
+        ;;
+      *)
+        archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+        hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+        ;;
+      esac
+    fi
+    ;;
+
+  os2*)
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    allow_undefined_flag=unsupported
+    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+    old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+    ;;
+
+  osf3*)
+    if test "$GCC" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    ;;
+
+  osf4* | osf5*)	# as osf3* with the addition of -msym flag
+    if test "$GCC" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+      archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+      $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+      #Both c and cxx compiler support -rpath directly
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    ;;
+
+  sco3.2v5*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    runpath_var=LD_RUN_PATH
+    hardcode_runpath_var=yes
+    export_dynamic_flag_spec='${wl}-Bexport'
+    ;;
+
+  solaris*)
+    # gcc --version < 3.0 without binutils cannot create self contained
+    # shared libraries reliably, requiring libgcc.a to resolve some of
+    # the object symbols generated in some cases.  Libraries that use
+    # assert need libgcc.a to resolve __eprintf, for example.  Linking
+    # a copy of libgcc.a into every shared library to guarantee resolving
+    # such symbols causes other problems:  According to Tim Van Holder
+    # <tim.van.holder@pandora.be>, C++ libraries end up with a separate
+    # (to the application) exception stack for one thing.
+    no_undefined_flag=' -z defs'
+    if test "$GCC" = yes; then
+      case `$CC --version 2>/dev/null` in
+      [[12]].*)
+	cat <<EOF 1>&2
+
+*** Warning: Releases of GCC earlier than version 3.0 cannot reliably
+*** create self contained shared libraries on Solaris systems, without
+*** introducing a dependency on libgcc.a.  Therefore, libtool is disabling
+*** -no-undefined support, which will at least allow you to build shared
+*** libraries.  However, you may find that when you link such libraries
+*** into an application without using GCC, you have to manually add
+*** \`gcc --print-libgcc-file-name\` to the link command.  We urge you to
+*** upgrade to a newer version of GCC.  Another option is to rebuild your
+*** current GCC to use the GNU linker from GNU binutils 2.9.1 or newer.
+
+EOF
+        no_undefined_flag=
+	;;
+      esac
+    fi
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+		$LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_shlibpath_var=no
+    case $host_os in
+    solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
+    *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+      whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+    esac
+    link_all_deplibs=yes
+    ;;
+
+  sunos4*)
+    if test "x$host_vendor" = xsequent; then
+      # Use $CC to link under sequent, because it throws in some extra .o
+      # files that make .init and .fini sections work.
+      archive_cmds='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+    fi
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4)
+    case $host_vendor in
+      sni)
+        archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+        hardcode_direct=yes # is this really true???
+        ;;
+      siemens)
+        ## LD is ld it makes a PLAMLIB
+        ## CC just makes a GrossModule.
+        archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+        reload_cmds='$CC -r -o $output$reload_objs'
+        hardcode_direct=no
+        ;;
+      motorola)
+        archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+        hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+        ;;
+    esac
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4.3*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    export_dynamic_flag_spec='-Bexport'
+    ;;
+
+  sysv5*)
+    no_undefined_flag=' -z text'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+		$LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec=
+    hardcode_shlibpath_var=no
+    runpath_var='LD_RUN_PATH'
+    ;;
+
+  uts4*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  dgux*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4*MP*)
+    if test -d /usr/nec; then
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var=no
+      runpath_var=LD_RUN_PATH
+      hardcode_runpath_var=yes
+      ld_shlibs=yes
+    fi
+    ;;
+
+  sysv4.2uw2*)
+    archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=no
+    hardcode_shlibpath_var=no
+    hardcode_runpath_var=yes
+    runpath_var=LD_RUN_PATH
+    ;;
+
+  sysv5uw7* | unixware7*)
+    no_undefined_flag='${wl}-z ${wl}text'
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    fi
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    ld_shlibs=no
+    ;;
+  esac
+fi
+AC_MSG_RESULT([$ld_shlibs])
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes.
+AC_MSG_CHECKING([how to hardcode library paths into programs])
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+   test -n "$runpath_var"; then
+
+  # We can hardcode nonexistent directories.
+  if test "$hardcode_direct" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$hardcode_shlibpath_var" != no &&
+     test "$hardcode_minus_L" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexisting dirs.
+    hardcode_action=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action=unsupported
+fi
+AC_MSG_RESULT([$hardcode_action])
+
+striplib=
+old_striplib=
+AC_MSG_CHECKING([whether stripping libraries is possible])
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+  AC_MSG_RESULT([yes])
+else
+  AC_MSG_RESULT([no])
+fi
+
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+AC_MSG_CHECKING([dynamic linker characteristics])
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+case $host_os in
+aix3*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}.so$major'
+  ;;
+
+aix4* | aix5*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}.so$major ${libname}${release}.so$versuffix $libname.so'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[[01]] | aix4.[[01]].*)
+	if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	     echo ' yes '
+	     echo '#endif'; } | ${CC} -E - | grep yes > /dev/null; then
+	  :
+	else
+	  can_build_shared=no
+	fi
+	;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can
+    # not hardcode correct soname into executable. Probably we can
+    # add versioning support to collect2, so additional links can
+    # be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}.so$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  hardcode_into_libs=yes
+  ;;
+
+amigaos*)
+  library_names_spec='$libname.ixlibrary $libname.a'
+  # Create ${libname}_ixlibrary.a entries in /sys/libs.
+  finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+  ;;
+
+beos*)
+  library_names_spec='${libname}.so'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi4*)
+  version_type=linux
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  export_dynamic_flag_spec=-rdynamic
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32*)
+  version_type=windows
+  need_version=no
+  need_lib_prefix=no
+  case $GCC,$host_os in
+  yes,cygwin*)
+    library_names_spec='$libname.dll.a'
+    soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | sed -e 's/[[.]]/-/g'`${versuffix}.dll'
+    postinstall_cmds='dlpath=`bash 2>&1 -c '\''. $dir/${file}i;echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog .libs/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`bash 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $rm \$dlpath'
+    ;;
+  yes,mingw*)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[[.]]/-/g'`${versuffix}.dll'
+    sys_lib_search_path_spec=`$CC -print-search-dirs | grep "^libraries:" | sed -e "s/^libraries://" -e "s/;/ /g" -e "s,=/,/,g"`
+    ;;
+  yes,pw32*)  # doubled brackets survive m4 quote stripping as a literal-dot class
+    library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | sed -e 's/[[.]]/-/g'`${versuffix}.dll'
+    ;;
+  *)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[[.]]/-/g'`${versuffix}.dll $libname.lib'
+    ;;
+  esac
+  dynamic_linker='Win32 ld.exe'
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  # FIXME: Relying on posixy $() will cause problems for
+  #        cross-compilation, but unfortunately the echo tests do not
+  #        yet detect zsh echo's removal of \ escapes.
+  library_names_spec='${libname}${release}${versuffix}.$(test .$module = .yes && echo so || echo dylib) ${libname}${release}${major}.$(test .$module = .yes && echo so || echo dylib) ${libname}.$(test .$module = .yes && echo so || echo dylib)'
+  soname_spec='${libname}${release}${major}.$(test .$module = .yes && echo so || echo dylib)'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  ;;
+
+freebsd1*)
+  dynamic_linker=no
+  ;;
+
+freebsd*)
+  objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  *)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  dynamic_linker="$host_os dld.sl"
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  shlibpath_var=SHLIB_PATH
+  shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+  library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+  soname_spec='${libname}${release}.sl$major'
+  # HP-UX runs *really* slowly unless shared libraries are mode 555.
+  postinstall_cmds='chmod 555 $lib'
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)          version_type=irix ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so $libname.so'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+  dynamic_linker=no
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+
+  # Find out which ABI we are using (multilib Linux x86_64 hack).
+  libsuff=
+  case "$host_cpu" in
+  x86_64*|s390x*)
+    echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+    if AC_TRY_EVAL(ac_compile); then
+      case `/usr/bin/file conftest.$ac_objext` in
+      *64-bit*)
+        libsuff=64
+        ;;
+      esac
+    fi
+    rm -rf conftest*
+    ;;
+  *)
+    ;;
+  esac
+  sys_lib_dlsearch_path_spec="/lib${libsuff} /usr/lib${libsuff}"
+  sys_lib_search_path_spec="/lib${libsuff} /usr/lib${libsuff} /usr/local/lib${libsuff}"
+  ;;
+
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+    soname_spec='${libname}${release}.so$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+openbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case "$host_os" in
+    openbsd2.[[89]] | openbsd2.[[89]].*)
+      shlibpath_overrides_runpath=no
+      ;;
+    *)
+      shlibpath_overrides_runpath=yes
+      ;;
+    esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+os2*)
+  libname_spec='$name'
+  need_lib_prefix=no
+  library_names_spec='$libname.dll $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_version=no
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  hardcode_into_libs=yes
+  ;;
+
+sco3.2v5*)
+  version_type=osf
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+solaris*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      export_dynamic_flag_spec='${wl}-Blargedynsym'
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+uts4*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+dgux*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux
+    library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+    soname_spec='$libname.so.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+AC_MSG_RESULT([$dynamic_linker])
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Report the final consequences.
+AC_MSG_CHECKING([if libtool supports shared libraries])
+AC_MSG_RESULT([$can_build_shared])
+
+AC_MSG_CHECKING([whether to build shared libraries])
+test "$can_build_shared" = "no" && enable_shared=no
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC, so building shared libraries turns static ones off.
+case "$host_os" in
+aix3*)
+  test "$enable_shared" = yes && enable_static=no
+  if test -n "$RANLIB"; then
+    archive_cmds="$archive_cmds~\$RANLIB \$lib"
+    postinstall_cmds='$RANLIB $lib'
+  fi
+  ;;
+
+aix4* | aix5*)
+  if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+    test "$enable_shared" = yes && enable_static=no
+  fi
+  ;;
+esac
+AC_MSG_RESULT([$enable_shared])
+
+AC_MSG_CHECKING([whether to build static libraries])
+# Make sure either enable_shared or enable_static is yes.
+test "$enable_shared" = yes || enable_static=yes
+AC_MSG_RESULT([$enable_static])
+
+if test "$hardcode_action" = relink; then
+  # Fast installation is not supported
+  enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+     test "$enable_shared" = no; then
+  # Fast installation is not necessary
+  enable_fast_install=needless
+fi
+
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+if test "$GCC" = yes; then
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+fi
+
+AC_LIBTOOL_DLOPEN_SELF
+
+if test "$enable_shared" = yes && test "$GCC" = yes; then
+  case $archive_cmds in
+  *'~'*)
+    # FIXME: we may have to deal with multi-command sequences.
+    ;;
+  '$CC '*)
+    # Test whether the compiler implicitly links with -lc since on some
+    # systems, -lgcc has to come before -lc. If gcc already passes -lc
+    # to ld, don't add -lc before -lgcc.
+    AC_MSG_CHECKING([whether -lc should be explicitly linked in])
+    AC_CACHE_VAL([lt_cv_archive_cmds_need_lc],
+    [$rm conftest*
+    echo 'static int dummy;' > conftest.$ac_ext
+
+    if AC_TRY_EVAL(ac_compile); then
+      soname=conftest
+      lib=conftest
+      libobjs=conftest.$ac_objext
+      deplibs=
+      wl=$lt_cv_prog_cc_wl
+      compiler_flags=-v
+      linker_flags=-v
+      verstring=
+      output_objdir=.
+      libname=conftest
+      save_allow_undefined_flag=$allow_undefined_flag
+      allow_undefined_flag=
+      if AC_TRY_EVAL(archive_cmds 2\>\&1 \| grep \" -lc \" \>/dev/null 2\>\&1)
+      then
+	lt_cv_archive_cmds_need_lc=no
+      else
+	lt_cv_archive_cmds_need_lc=yes
+      fi
+      allow_undefined_flag=$save_allow_undefined_flag
+    else
+      cat conftest.err 1>&5
+    fi
+    $rm conftest*])
+    AC_MSG_RESULT([$lt_cv_archive_cmds_need_lc])
+    ;;
+  esac
+fi
+need_lc=${lt_cv_archive_cmds_need_lc-yes}
+
+# The second clause should only fire when bootstrapping the
+# libtool distribution, otherwise you forgot to ship ltmain.sh
+# with your package, and you will get complaints that there are
+# no rules to generate ltmain.sh.
+if test -f "$ltmain"; then
+  :
+else
+  # If there is no Makefile yet, we rely on a make rule to execute
+  # `config.status --recheck' to rerun these tests and create the
+  # libtool script then.
+  test -f Makefile && make "$ltmain"
+fi
+
+if test -f "$ltmain"; then
+  trap "$rm \"${ofile}T\"; exit 1" 1 2 15
+  $rm -f "${ofile}T"
+
+  echo creating $ofile
+
+  # Now quote all the things that may contain metacharacters while being
+  # careful not to overquote the AC_SUBSTed values.  We take copies of the
+  # variables and quote the copies for generation of the libtool script.
+  for var in echo old_CC old_CFLAGS SED \
+    AR AR_FLAGS CC LD LN_S NM SHELL \
+    reload_flag reload_cmds wl \
+    pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+    thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+    library_names_spec soname_spec \
+    RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+    old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+    postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+    old_striplib striplib file_magic_cmd export_symbols_cmds \
+    deplibs_check_method allow_undefined_flag no_undefined_flag \
+    finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+    global_symbol_to_c_name_address \
+    hardcode_libdir_flag_spec hardcode_libdir_separator  \
+    sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+    compiler_c_o compiler_o_lo need_locks exclude_expsyms include_expsyms; do
+
+    case $var in
+    reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+    old_postinstall_cmds | old_postuninstall_cmds | \
+    export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+    extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+    postinstall_cmds | postuninstall_cmds | \
+    finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+      # Double-quote double-evaled strings.
+      eval "lt_$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\""
+      ;;
+    *)
+      eval "lt_$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\""
+      ;;
+    esac
+  done
+
+  cat <<__EOF__ > "${ofile}T"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A sed that does not truncate output.
+SED=$lt_SED
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="${SED} -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+# ### BEGIN LIBTOOL CONFIG
+
+# Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+
+# Shell to use when invoking shell scripts.
+SHELL=$lt_SHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$need_lc
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$lt_echo
+
+# The archiver.
+AR=$lt_AR
+AR_FLAGS=$lt_AR_FLAGS
+
+# The default C compiler.
+CC=$lt_CC
+
+# Is the compiler the GNU C compiler?
+with_gcc=$GCC
+
+# The linker used to build libraries.
+LD=$lt_LD
+
+# Whether we need hard or soft links.
+LN_S=$lt_LN_S
+
+# A BSD-compatible nm program.
+NM=$lt_NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC_CMD=$MAGIC_CMD
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$lt_reload_flag
+reload_cmds=$lt_reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$lt_wl
+
+# Object file suffix (normally "o").
+objext="$ac_objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$lt_pic_flag
+pic_mode=$pic_mode
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$lt_compiler_c_o
+
+# Can we write directly to a .lo ?
+compiler_o_lo=$lt_compiler_o_lo
+
+# Must we lock files when doing compilation ?
+need_locks=$lt_need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$lt_link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$lt_no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$lt_export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$lt_whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$lt_thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$lt_libname_spec
+
+# List of archive names.  First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$lt_library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$lt_soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$lt_RANLIB
+old_archive_cmds=$lt_old_archive_cmds
+old_postinstall_cmds=$lt_old_postinstall_cmds
+old_postuninstall_cmds=$lt_old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$lt_old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$lt_old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$lt_archive_cmds
+archive_expsym_cmds=$lt_archive_expsym_cmds
+postinstall_cmds=$lt_postinstall_cmds
+postuninstall_cmds=$lt_postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$lt_old_striplib
+striplib=$lt_striplib
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$lt_deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$lt_file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$lt_allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$lt_no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$lt_finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$lt_finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$lt_global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$lt_global_symbol_to_cdecl
+
+# Transform the output of nm in a C name address pair
+global_symbol_to_c_name_address=$lt_global_symbol_to_c_name_address
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$lt_hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$lt_hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Variables whose values should be saved in libtool wrapper scripts and
+# restored at relink time.
+variables_saved_for_relink="$variables_saved_for_relink"
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$lt_sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$lt_sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$lt_export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$lt_extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$lt_exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$lt_include_expsyms
+
+# ### END LIBTOOL CONFIG
+
+__EOF__
+
+  case $host_os in
+  aix3*)
+    cat <<\EOF >> "${ofile}T"
+
+# AIX sometimes has problems with the GCC collect2 program.  For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+  COLLECT_NAMES=
+  export COLLECT_NAMES
+fi
+EOF
+    ;;
+  esac
+
+  case $host_os in
+  cygwin* | mingw* | pw32* | os2*)
+    cat <<'EOF' >> "${ofile}T"
+      # This is a source program that is used to create dlls on Windows
+      # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# #  ifdef __CYGWIN32__
+# #    define __CYGWIN__ __CYGWIN32__
+# #  endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+#   __hDllInstance_base = hInst;
+#   return TRUE;
+# }
+# /* ltdll.c ends here */
+	# This is a source program that is used to create import libraries
+	# on Windows for dlls which lack them. Don't remove nor modify the
+	# starting and closing comments
+# /* impgen.c starts here */
+# /*   Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+#  This file is part of GNU libtool.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#  */
+#
+# #include <stdio.h>		/* for printf() */
+# #include <unistd.h>		/* for open(), lseek(), read() */
+# #include <fcntl.h>		/* for O_RDONLY, O_BINARY */
+# #include <string.h>		/* for strdup() */
+#
+# /* O_BINARY isn't required (or even defined sometimes) under Unix */
+# #ifndef O_BINARY
+# #define O_BINARY 0
+# #endif
+#
+# static unsigned int
+# pe_get16 (fd, offset)
+#      int fd;
+#      int offset;
+# {
+#   unsigned char b[2];
+#   lseek (fd, offset, SEEK_SET);
+#   read (fd, b, 2);
+#   return b[0] + (b[1]<<8);
+# }
+#
+# static unsigned int
+# pe_get32 (fd, offset)
+#     int fd;
+#     int offset;
+# {
+#   unsigned char b[4];
+#   lseek (fd, offset, SEEK_SET);
+#   read (fd, b, 4);
+#   return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# static unsigned int
+# pe_as32 (ptr)
+#      void *ptr;
+# {
+#   unsigned char *b = ptr;
+#   return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# int
+# main (argc, argv)
+#     int argc;
+#     char *argv[];
+# {
+#     int dll;
+#     unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+#     unsigned long export_rva, export_size, nsections, secptr, expptr;
+#     unsigned long name_rvas, nexp;
+#     unsigned char *expdata, *erva;
+#     char *filename, *dll_name;
+#
+#     filename = argv[1];
+#
+#     dll = open(filename, O_RDONLY|O_BINARY);
+#     if (dll < 1)
+# 	return 1;
+#
+#     dll_name = filename;
+#
+#     for (i=0; filename[i]; i++)
+# 	if (filename[i] == '/' || filename[i] == '\\'  || filename[i] == ':')
+# 	    dll_name = filename + i +1;
+#
+#     pe_header_offset = pe_get32 (dll, 0x3c);
+#     opthdr_ofs = pe_header_offset + 4 + 20;
+#     num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+#     if (num_entries < 1) /* no exports */
+# 	return 1;
+#
+#     export_rva = pe_get32 (dll, opthdr_ofs + 96);
+#     export_size = pe_get32 (dll, opthdr_ofs + 100);
+#     nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+#     secptr = (pe_header_offset + 4 + 20 +
+# 	      pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+#     expptr = 0;
+#     for (i = 0; i < nsections; i++)
+#     {
+# 	char sname[8];
+# 	unsigned long secptr1 = secptr + 40 * i;
+# 	unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+# 	unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+# 	unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+# 	lseek(dll, secptr1, SEEK_SET);
+# 	read(dll, sname, 8);
+# 	if (vaddr <= export_rva && vaddr+vsize > export_rva)
+# 	{
+# 	    expptr = fptr + (export_rva - vaddr);
+# 	    if (export_rva + export_size > vaddr + vsize)
+# 		export_size = vsize - (export_rva - vaddr);
+# 	    break;
+# 	}
+#     }
+#
+#     expdata = (unsigned char*)malloc(export_size);
+#     lseek (dll, expptr, SEEK_SET);
+#     read (dll, expdata, export_size);
+#     erva = expdata - export_rva;
+#
+#     nexp = pe_as32 (expdata+24);
+#     name_rvas = pe_as32 (expdata+32);
+#
+#     printf ("EXPORTS\n");
+#     for (i = 0; i<nexp; i++)
+#     {
+# 	unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+# 	printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+#     }
+#
+#     return 0;
+# }
+# /* impgen.c ends here */
+
+EOF
+    ;;
+  esac
+
+  # We use sed instead of cat because bash on DJGPP gets confused if
+  # it finds mixed CR/LF and LF-only lines.  Since sed operates in
+  # text mode, it properly converts lines to CR/LF.  This bash problem
+  # is reportedly fixed, but why not run on old versions too?
+  sed '$q' "$ltmain" >> "${ofile}T" || (rm -f "${ofile}T"; exit 1)
+
+  mv -f "${ofile}T" "$ofile" || \
+    (rm -f "$ofile" && cp "${ofile}T" "$ofile" && rm -f "${ofile}T")
+  chmod +x "$ofile"
+fi
+
+])# _LT_AC_LTCONFIG_HACK
+
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+AC_DEFUN([AC_LIBTOOL_DLOPEN], [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_SHARED],
+[define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<<  --enable-shared[=PKGS]  build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+  enable_shared=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_shared=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+AC_DEFUN([AC_DISABLE_SHARED],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_STATIC],
+[define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<<  --enable-static[=PKGS]  build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+  enable_static=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_static=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+AC_DEFUN([AC_DISABLE_STATIC],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+#   Where DEFAULT is either `yes' or `no'.  If omitted, it defaults to
+#   `yes'.
+AC_DEFUN([AC_ENABLE_FAST_INSTALL],
+[define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<<  --enable-fast-install[=PKGS]  optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case $enableval in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+  enable_fast_install=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then
+      enable_fast_install=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_DISABLE_FAST_INSTALL - set the default to --disable-fast-install
+AC_DEFUN([AC_DISABLE_FAST_INSTALL],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+# AC_LIBTOOL_PICMODE - implement the --with-pic flag
+# Usage: AC_LIBTOOL_PICMODE[(MODE)]
+#   Where MODE is either `yes' or `no'.  It is stored in the shell variable
+#   pic_mode; if MODE is omitted, pic_mode is set to `default'.
+AC_DEFUN([AC_LIBTOOL_PICMODE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+pic_mode=ifelse($#,1,$1,default)])
+
+
+# AC_PATH_TOOL_PREFIX(PROG, PATH) - find PROG, a file program which can recognise a shared library, in PATH (or in $PATH if PATH is empty); sets MAGIC_CMD
+AC_DEFUN([AC_PATH_TOOL_PREFIX],
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
+[case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word.  This closes a longstanding sh security hole.
+  ac_dummy="ifelse([$2], , $PATH, [$2])"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$1"; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    egrep "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac])
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  AC_MSG_RESULT($MAGIC_CMD)
+else
+  AC_MSG_RESULT(no)
+fi
+])
+
+
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+AC_DEFUN([AC_PATH_MAGIC],
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+  else
+    MAGIC_CMD=:
+  fi
+fi
+])
+
+
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+AC_DEFUN([AC_PROG_LD],
+[AC_ARG_WITH(gnu-ld,
+[  --with-gnu-ld           assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([_LT_AC_LIBTOOL_SYS_PATH_SEPARATOR])dnl
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  AC_MSG_CHECKING([for ld used by GCC])
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [[\\/]]* | [[A-Za-z]]:[[\\/]]*)
+      re_direlt='/[[^/]][[^/]]*/\.\./'
+      # Canonicalize the path of ld
+      ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+      while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+	ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  AC_MSG_CHECKING([for GNU ld])
+else
+  AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(lt_cv_path_LD,
+[if test -z "$LD"; then
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some GNU ld's only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+	test "$with_gnu_ld" != no && break
+      else
+	test "$with_gnu_ld" != yes && break
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  AC_MSG_RESULT($LD)
+else
+  AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+
+# AC_PROG_LD_GNU - check whether the linker is GNU ld; sets with_gnu_ld
+AC_DEFUN([AC_PROG_LD_GNU],
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+  lt_cv_prog_gnu_ld=yes
+else
+  lt_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$lt_cv_prog_gnu_ld
+])
+
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+#   -- PORTME Some linkers may need a different reload flag.
+AC_DEFUN([AC_PROG_LD_RELOAD_FLAG],
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+#  -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN([AC_DEPLIBS_CHECK_METHOD],
+[AC_CACHE_CHECK([how to recognise dependent libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [[regex]]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix4* | aix5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi4*)
+  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib)'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin* | mingw* | pw32*)
+  lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  case "$host_os" in
+  rhapsody* | darwin1.[[012]])
+    lt_cv_file_magic_test_file=`echo /System/Library/Frameworks/System.framework/Versions/*/System | head -1`
+    ;;
+  *) # Darwin 1.3 on
+    lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+    ;;
+  esac
+  ;;
+
+freebsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD)/i[[3-9]]86 (compact )?demand paged shared library'
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20*|hpux11*)
+  lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|PA-RISC[[0-9]].[[0-9]]) shared library'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libc.sl
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+  irix5* | nonstopux*)
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+    ;;
+  *)
+    case $LD in
+    *-32|*"-32 ") libmagic=32-bit;;
+    *-n32|*"-n32 ") libmagic=N32;;
+    *-64|*"-64 ") libmagic=64-bit;;
+    *) libmagic=never-match;;
+    esac
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[[1234]] dynamic lib MIPS - version 1"
+    ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  case $host_cpu in
+  alpha* | hppa* | i*86 | mips | mipsel | powerpc* | sparc* | ia64* | s390* | x86_64*)
+    lt_cv_deplibs_check_method=pass_all ;;
+  *)
+    # glibc up to 2.1.1 does not perform some relocations on ARM
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB (shared object|dynamic lib )' ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+  ;;
+
+netbsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/\.]]+\.so\.[[0-9]]+\.[[0-9]]+$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[[^/\.]]+\.so$'
+  fi
+  ;;
+
+newos6*)
+  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (executable|dynamic lib)'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+openbsd*)
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB shared object'
+  else
+    lt_cv_deplibs_check_method='file_magic OpenBSD.* shared library'
+  fi
+  ;;
+
+osf3* | osf4* | osf5*)
+  # this will be overridden with pass_all, but let us keep it just in case
+  lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sco3.2v5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  lt_cv_file_magic_test_file=/lib/libc.so
+  ;;
+
+sysv5uw[[78]]* | sysv4*uw2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  case $host_vendor in
+  motorola)
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib) M[[0-9]][[0-9]]* Version [[0-9]]'
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  sequent)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB (shared object|dynamic lib )'
+    ;;
+  sni)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method="file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB dynamic lib"
+    lt_cv_file_magic_test_file=/lib/libc.so
+    ;;
+  siemens)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  esac
+  ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+AC_DEFUN([AC_PROG_NM],
+[AC_REQUIRE([_LT_AC_LIBTOOL_SYS_PATH_SEPARATOR])dnl
+AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(lt_cv_path_NM,
+[if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+    test -z "$ac_dir" && ac_dir=.
+    tmp_nm=$ac_dir/${ac_tool_prefix}nm
+    if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+      # Check to see if the nm accepts a BSD-compat flag.
+      # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+      #   nm: unknown option "B" ignored
+      # Tru64's nm complains that /dev/null is an invalid object file
+      if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+	lt_cv_path_NM="$tmp_nm -B"
+	break
+      elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+	lt_cv_path_NM="$tmp_nm -p"
+	break
+      else
+	lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+	continue # so that we can try to find one that supports BSD flags
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi])
+NM="$lt_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library; sets LIBM to the link flags found
+AC_DEFUN([AC_CHECK_LIBM],
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case $host in
+*-*-beos* | *-*-cygwin* | *-*-pw32*)
+  # These systems don't have libm
+  ;;
+*-ncr-sysv4.3*)
+  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+  AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+  ;;
+*)
+  AC_CHECK_LIB(m, main, LIBM="-lm")
+  ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library and LTDLINCL to the include flags for
+# the libltdl header and adds --enable-ltdl-convenience to the
+# configure arguments.  Note that LIBLTDL and LTDLINCL are not
+# AC_SUBSTed, nor is AC_CONFIG_SUBDIRS called.  If DIR is not
+# provided, it is assumed to be `libltdl'.  LIBLTDL will be prefixed
+# with '${top_builddir}/' and LTDLINCL will be prefixed with
+# '${top_srcdir}/' (note the single quotes!).  If your package is not
+# flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+AC_DEFUN([AC_LIBLTDL_CONVENIENCE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  case $enable_ltdl_convenience in
+  no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+  "") enable_ltdl_convenience=yes
+      ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+  esac
+  LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdlc.la
+  LTDLINCL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+  # For backwards non-gettext consistent compatibility...
+  INCLTDL="$LTDLINCL"
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library and LTDLINCL to the include flags for
+# the libltdl header and adds --enable-ltdl-install to the configure
+# arguments.  Note that LIBLTDL and LTDLINCL are not AC_SUBSTed, nor is
+# AC_CONFIG_SUBDIRS called.  If DIR is not provided and an installed
+# libltdl is not found, it is assumed to be `libltdl'.  LIBLTDL will
+# be prefixed with '${top_builddir}/' and LTDLINCL will be prefixed
+# with '${top_srcdir}/' (note the single quotes!).  If your package is
+# not flat and you're not using automake, define top_builddir and
+# top_srcdir appropriately in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+AC_DEFUN([AC_LIBLTDL_INSTALLABLE],
+[AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+  AC_CHECK_LIB(ltdl, main,
+  [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+  [if test x"$enable_ltdl_install" = xno; then
+     AC_MSG_WARN([libltdl not installed, but installation disabled])
+   else
+     enable_ltdl_install=yes
+   fi
+  ])
+  if test x"$enable_ltdl_install" = x"yes"; then
+    ac_configure_args="$ac_configure_args --enable-ltdl-install"
+    LIBLTDL='${top_builddir}/'ifelse($#,1,[$1],['libltdl'])/libltdl.la
+    LTDLINCL='-I${top_srcdir}/'ifelse($#,1,[$1],['libltdl'])
+  else
+    ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+    LIBLTDL="-lltdl"
+    LTDLINCL=
+  fi
+  # For backwards non-gettext consistent compatibility...
+  INCLTDL="$LTDLINCL"
+])
+
+# old names
+AC_DEFUN([AM_PROG_LIBTOOL],   [AC_PROG_LIBTOOL])
+AC_DEFUN([AM_ENABLE_SHARED],  [AC_ENABLE_SHARED($@)])
+AC_DEFUN([AM_ENABLE_STATIC],  [AC_ENABLE_STATIC($@)])
+AC_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
+AC_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
+AC_DEFUN([AM_PROG_LD],        [AC_PROG_LD])
+AC_DEFUN([AM_PROG_NM],        [AC_PROG_NM])
+
+# This is just to silence aclocal about the macro not being used
+ifelse([AC_DISABLE_FAST_INSTALL])
+
+# NOTE: This macro has been submitted for inclusion into   #
+#  GNU Autoconf as AC_PROG_SED.  When it is available in   #
+#  a released version of Autoconf we should remove this    #
+#  macro and use it instead.                               #
+# LT_AC_PROG_SED
+# --------------
+# Check for a fully-functional sed program, that truncates
+# as few characters as possible.  Prefer GNU sed if found.
+AC_DEFUN([LT_AC_PROG_SED],
+[AC_MSG_CHECKING([for a sed that does not truncate output])
+AC_CACHE_VAL(lt_cv_path_SED,
+[# Loop through the user's path and test for sed and gsed.
+# Then use that list of sed's as ones to test for truncation.
+as_executable_p="test -f"
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in sed gsed; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      if $as_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
+        _sed_list="$_sed_list $as_dir/$ac_prog$ac_exec_ext"
+      fi
+    done
+  done
+done
+
+  # Create a temporary directory, and hook for its removal unless debugging.
+$debug ||
+{
+  trap 'exit_status=$?; rm -rf $tmp && exit $exit_status' 0
+  trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+
+# Create a (secure) tmp directory for tmp files.
+: ${TMPDIR=/tmp}
+{
+  tmp=`(umask 077 && mktemp -d -q "$TMPDIR/sedXXXXXX") 2>/dev/null` &&
+  test -n "$tmp" && test -d "$tmp"
+}  ||
+{
+  tmp=$TMPDIR/sed$$-$RANDOM
+  (umask 077 && mkdir $tmp)
+} ||
+{
+   echo "$me: cannot create a temporary directory in $TMPDIR" >&2
+   { (exit 1); exit 1; }
+}
+  _max=0
+  _count=0
+  # Add /usr/xpg4/bin/sed as it is typically found on Solaris
+  # along with /bin/sed that truncates output.
+  for _sed in $_sed_list /usr/xpg4/bin/sed; do
+    test ! -f ${_sed} && break
+    cat /dev/null > "$tmp/sed.in"
+    _count=0
+    echo ${ECHO_N-$ac_n} "0123456789${ECHO_C-$ac_c}" >"$tmp/sed.in"
+    # Check for GNU sed and select it if it is found.
+    if "${_sed}" --version 2>&1 < /dev/null | egrep '(GNU)' > /dev/null; then
+      lt_cv_path_SED=${_sed}
+      break
+    fi
+    while true; do
+      cat "$tmp/sed.in" "$tmp/sed.in" >"$tmp/sed.tmp"
+      mv "$tmp/sed.tmp" "$tmp/sed.in"
+      cp "$tmp/sed.in" "$tmp/sed.nl"
+      echo >>"$tmp/sed.nl"
+      ${_sed} -e 's/a$//' < "$tmp/sed.nl" >"$tmp/sed.out" || break
+      cmp -s "$tmp/sed.out" "$tmp/sed.nl" || break
+      # 40000 chars as input seems more than enough
+      test $_count -gt 10 && break
+      _count=`expr $_count + 1`
+      if test $_count -gt $_max; then
+        _max=$_count
+        lt_cv_path_SED=$_sed
+      fi
+    done
+  done
+  rm -rf "$tmp"
+])
+if test "X$SED" != "X"; then
+  lt_cv_path_SED=$SED
+else
+  SED=$lt_cv_path_SED
+fi
+AC_MSG_RESULT([$SED])
+])
+
diff --git a/altui/README.alt b/altui/README.alt
new file mode 100644
index 0000000..e0f31db
--- /dev/null
+++ b/altui/README.alt
@@ -0,0 +1,71 @@
+Here is an alternate command-line user interface for the IJG JPEG software.
+It is designed for use under MS-DOS, and may also be useful on other non-Unix
+operating systems.  (For that matter, this code works fine on Unix, but the
+standard command-line syntax is better on Unix because it is pipe-friendly.)
+
+With this user interface, cjpeg and djpeg accept multiple input file names
+on the command line; output file names are generated by substituting
+appropriate extensions.  The user is prompted before any already-existing
+file will be overwritten.  See usage.alt for details.
+
+Expansion of wild-card file specifications is useful but is not directly
+provided by this code.  Most DOS C compilers have the ability to do wild-card
+expansion "behind the scenes", and we rely on that feature.  On other systems,
+the shell may do it for you, as is done on Unix.
+
+Also, a DOS-specific routine is provided to determine available memory;
+this makes the -maxmemory switch unnecessary except in unusual cases.
+If you know how to determine available memory on a different system,
+you can easily add the necessary code.  (And please send it along to
+jpeg-info@uunet.uu.net so we can include it in future releases!)
+
+
+INSTALLATION
+============
+
+You need to have the main IJG JPEG distribution, release 6 or later.
+Replace the standard cjpeg.c and djpeg.c files with the ones provided here.
+Then build the software as described in the main distribution's install.doc
+file, with these exceptions:
+
+* Define PROGRESS_REPORT in jconfig.h if you want the percent-done display.
+* Define NO_OVERWRITE_CHECK if you *don't* want overwrite confirmation.
+* You may ignore the USE_SETMODE and TWO_FILE_COMMANDLINE symbols discussed
+  in install.doc; these files do not use them.
+* As given, djpeg.c defaults to GIF output (not PPM output as in the standard
+  djpeg.c).  If you want something different, modify DEFAULT_FMT.
+
+You may also need to do something special to enable filename wild-card
+expansion, assuming your compiler has that capability at all.
+
+Modify the standard usage.doc file as described in usage.alt.  (If you want
+to use the Unix-style manual pages cjpeg.1 and djpeg.1, better fix them too.)
+
+
+Here are some specific notes for popular MS-DOS compilers:
+
+Borland C:
+  Add "-DMSDOS" to CFLAGS to enable use of the DOS memory determination code.
+  Link with the standard library file WILDARGS.OBJ to get wild-card expansion.
+
+Microsoft C:
+  Add "-DMSDOS" to CFLAGS to enable use of the DOS memory determination code.
+  Link with the standard library file SETARGV.OBJ to get wild-card expansion.
+  In the versions I've used, you must also add /NOE to the linker switches to
+  avoid a duplicate-symbol error from including SETARGV.
+
+DJGPP (we recommend version 2.0 or later):
+  Add "-DFREE_MEM_ESTIMATE=0" to CFLAGS.  Wild-card expansion is automatic.
+
+
+LEGAL ISSUES
+============
+
+This software is copyright (C) 1991-1998, Thomas G. Lane.
+Terms of distribution and use are the same as for the free IJG JPEG software;
+see its README file for details.
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
diff --git a/altui/cjpeg.c b/altui/cjpeg.c
new file mode 100644
index 0000000..df1a4f8
--- /dev/null
+++ b/altui/cjpeg.c
@@ -0,0 +1,813 @@
+/*
+ * alternate cjpeg.c
+ *
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 6, 2006
+ * ---------------------------------------------------------------------
+ *
+ * This file contains an alternate user interface for the JPEG compressor.
+ * One or more input files are named on the command line, and output file
+ * names are created by substituting ".jpg" for the input file's extension.
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"		/* for version message */
+
+#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef __MWERKS__
+#include <SIOUX.h>              /* Metrowerks needs this */
+#include <console.h>		/* ... and this */
+#endif
+#ifdef THINK_C
+#include <console.h>		/* Think declares it here */
+#endif
+#endif
+
+#ifndef PATH_MAX		/* ANSI maximum-pathname-length constant */
+#define PATH_MAX 256
+#endif
+
+
+/* Create the add-on message string table.  JMESSAGE keeps only the string of
+ * each (code,string) pair that cderror.h presumably lists; NULL comes last. */
+#define JMESSAGE(code,string)	string ,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * SIMD Ext: compiler-specific hacks to enable filename wild-card expansion
+ * (presumably so setargv.obj / wildargs.obj need not be linked separately).
+ */
+#ifdef _MSC_VER		/* Microsoft Visual C++ */
+/* from setargv.c (setargv.obj): _setargv() just forwards to __setargv() */
+/* Tested under Visual C++ V6.0, Toolkit 2003, and 2005 Express Edition */
+int __cdecl _setargv(void) { int __cdecl __setargv(void); return __setargv(); }
+#endif
+#ifdef __BORLANDC__	/* Borland C++ */
+/* from wildargs.c (wildargs.obj): point _argv_expand_ptr at _expand_wild */
+/* Tested under Borland C++ Compiler 5.5 (win32) */
+#include <wildargs.h>
+typedef void _RTLENTRY (* _RTLENTRY _argv_expand_fnc)(char *, _PFN_ADDARG);
+_argv_expand_fnc _argv_expand_ptr = _expand_wild;
+#endif
+
+
+/*
+ * Automatic determination of available memory.
+ */
+
+static long default_maxmem;	/* saves value determined at startup, or 0 */
+
+#ifndef FREE_MEM_ESTIMATE	/* may be defined from command line */
+
+#ifdef MSDOS			/* For MS-DOS (unless flat-memory model) */
+
+#include <dos.h>		/* for access to intdos() call */
+
+LOCAL(long)
+unused_dos_memory (void)
+/* Report how many bytes of DOS memory are currently unallocated */
+{
+  union REGS r;
+  long paragraphs;
+
+  r.x.bx = 0xFFFF;		/* Ask for more memory than DOS can have */
+  r.h.ah = 0x48;		/* DOS function Allocate Memory Block */
+  (void) intdos(&r, &r);
+  /* The request fails, and BX then holds the # of paragraphs available. */
+  paragraphs = (unsigned int) r.x.bx;
+  /* Each paragraph is 16 bytes. */
+  return paragraphs * 16L;
+}
+
+/* The default memory setting is 95% of the available space. */
+#define FREE_MEM_ESTIMATE  ((unused_dos_memory() * 95L) / 100L)
+
+#endif /* MSDOS */
+
+#ifdef ATARI			/* For Atari ST/STE/TT, Pure C or Turbo C */
+
+#include <ext.h>
+
+/* The default memory setting is 90% of the available space. */
+#define FREE_MEM_ESTIMATE  (((long) coreleft() * 90L) / 100L)
+
+#endif /* ATARI */
+
+/* Add memory-estimation procedures for other operating systems here,
+ * with appropriate #ifdef's around them.
+ */
+
+#endif /* !FREE_MEM_ESTIMATE */
+
+
+/*
+ * This routine determines what format the input file is,
+ * and selects the appropriate input-reading module.
+ *
+ * To determine which family of input formats the file belongs to,
+ * we may look only at the first byte of the file, since C does not
+ * guarantee that more than one character can be pushed back with ungetc.
+ * Looking at additional bytes would require one of these approaches:
+ *     1) assume we can fseek() the input file (fails for piped input);
+ *     2) assume we can push back more than one character (works in
+ *        some C implementations, but unportable);
+ *     3) provide our own buffering (breaks input readers that want to use
+ *        stdio directly, such as the RLE library);
+ * or  4) don't put back the data, and modify the input_init methods to assume
+ *        they start reading after the start of file (also breaks RLE library).
+ * #1 is attractive for MS-DOS but is untenable on Unix.
+ *
+ * The most portable solution for file types that can't be identified by their
+ * first byte is to make the user tell us what they are.  This is also the
+ * only approach for "raw" file types that contain only arbitrary values.
+ * We presently apply this method for Targa files.  Most of the time Targa
+ * files start with 0x00, so we recognize that case.  Potentially, however,
+ * a Targa file could start with any byte value (byte 0 is the length of the
+ * seldom-used ID field), so we provide a switch to force Targa input mode.
+ */
+
+static boolean is_targa;	/* records user -targa switch */
+
+
+LOCAL(cjpeg_source_ptr)
+select_file_type (j_compress_ptr cinfo, FILE * infile)
+/* Choose an input reader from the -targa switch or the file's first byte.
+ * The byte examined is pushed back onto infile with ungetc() afterwards.
+ */
+{
+  int first_byte;
+
+  if (is_targa) {
+#ifdef TARGA_SUPPORTED
+    return jinit_read_targa(cinfo);
+#else
+    ERREXIT(cinfo, JERR_TGA_NOTCOMP);
+#endif
+  }
+
+  first_byte = getc(infile);
+  if (first_byte == EOF)
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+  if (ungetc(first_byte, infile) == EOF)
+    ERREXIT(cinfo, JERR_UNGETC_FAILED);
+
+#ifdef BMP_SUPPORTED
+  if (first_byte == 'B')
+    return jinit_read_bmp(cinfo);
+#endif
+#ifdef GIF_SUPPORTED
+  if (first_byte == 'G')
+    return jinit_read_gif(cinfo);
+#endif
+#ifdef PPM_SUPPORTED
+  if (first_byte == 'P')
+    return jinit_read_ppm(cinfo);
+#endif
+#ifdef RLE_SUPPORTED
+  if (first_byte == 'R')
+    return jinit_read_rle(cinfo);
+#endif
+#ifdef TARGA_SUPPORTED
+  if (first_byte == 0x00)
+    return jinit_read_targa(cinfo);
+#endif
+
+  ERREXIT(cinfo, JERR_UNKNOWN_FORMAT);
+  return NULL;			/* suppress compiler warnings */
+}
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, i.e., intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ */
+
+
+static const char * progname;	/* program name for error messages */
+static char * outfilename;	/* for -outfile switch */
+
+
+LOCAL(void)
+usage (void)
+/* Print the command-line summary on stderr, then exit with EXIT_FAILURE */
+{
+  fprintf(stderr, "usage: %s [switches] inputfile(s)\n", progname);
+  fputs("List of input files may use wildcards (* and ?)\n", stderr);
+  fputs("Output filename is same as input filename, but extension .jpg\n", stderr);
+
+  fputs("Switches (names may be abbreviated):\n", stderr);
+  fputs("  -quality N     Compression quality (0..100; 5-95 is useful range)\n", stderr);
+  fputs("  -grayscale     Create monochrome JPEG file\n", stderr);
+#ifdef ENTROPY_OPT_SUPPORTED
+  fputs("  -optimize      Optimize Huffman table (smaller file, but slow compression)\n", stderr);
+#endif
+#ifdef C_PROGRESSIVE_SUPPORTED
+  fputs("  -progressive   Create progressive JPEG file\n", stderr);
+#endif
+#ifdef TARGA_SUPPORTED
+  fputs("  -targa         Input file is Targa format (usually not needed)\n", stderr);
+#endif
+  fputs("Switches for advanced users:\n", stderr);
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
+	  JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : "");
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
+	  JDCT_DEFAULT == JDCT_IFAST ? " (default)" : "");
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
+	  JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : "");
+#endif
+  fputs("  -restart N     Set restart interval in rows, or in blocks with B\n", stderr);
+#ifdef INPUT_SMOOTHING_SUPPORTED
+  fputs("  -smooth N      Smooth dithered input (N=1..100 is strength)\n", stderr);
+#endif
+#ifndef FREE_MEM_ESTIMATE
+  fputs("  -maxmemory N   Maximum memory to use (in kbytes)\n", stderr);
+#endif
+  fputs("  -outfile name  Specify name for output file\n", stderr);
+  fputs("  -verbose  or  -debug   Emit debug output\n", stderr);
+  fputs("Switches for wizards:\n", stderr);
+#ifdef C_ARITH_CODING_SUPPORTED
+  fputs("  -arithmetic    Use arithmetic coding\n", stderr);
+#endif
+  fputs("  -baseline      Force baseline quantization tables\n", stderr);
+  fputs("  -qtables file  Use quantization tables given in file\n", stderr);
+  fputs("  -qslots N[,...]    Set component quantization tables\n", stderr);
+  fputs("  -sample HxV[,...]  Set component sampling factors\n", stderr);
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+  fputs("  -scans file    Create multi-scan JPEG per script file\n", stderr);
+#endif
+  exit(EXIT_FAILURE);
+}
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, char * labelstr, unsigned int simd)
+/* Write labelstr, then a name for each JSIMD_* bit set in simd (or NONE) */
+{
+  const char * mmx   = (simd & JSIMD_MMX)   ? " MMX"    : "";
+  const char * tdnow = (simd & JSIMD_3DNOW) ? " 3DNow!" : "";
+  const char * sse   = (simd & JSIMD_SSE)   ? " SSE"    : "";
+  const char * sse2  = (simd & JSIMD_SSE2)  ? " SSE2"   : "";
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr, mmx, tdnow, sse, sse2,
+	  (simd == JSIMD_NONE) ? " NONE" : "");
+}
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
+LOCAL(int)
+parse_switches (j_compress_ptr cinfo, int argc, char **argv,
+		int last_file_arg_seen, boolean for_real)
+/* Parse optional switches, applying them to cinfo and to the file-scope
+ * is_targa and outfilename settings (both are reset on every call).
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored as already
+ * processed.  (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; the deferred quality, table,
+ * sampling and scan settings are then parsed but not applied to cinfo.
+ */
+{
+  int argn;
+  char * arg;
+  int quality;			/* -quality parameter */
+  int q_scale_factor;		/* scaling percentage for -qtables */
+  boolean force_baseline;
+  boolean simple_progressive;
+  char * qtablefile = NULL;	/* saves -qtables filename if any */
+  char * qslotsarg = NULL;	/* saves -qslots parm if any */
+  char * samplearg = NULL;	/* saves -sample parm if any */
+  char * scansarg = NULL;	/* saves -scans parm if any */
+
+  /* Set up default JPEG parameters. */
+  /* Note that default -quality level need not, and does not,
+   * match the default scaling for an explicit -qtables argument.
+   */
+  quality = 75;			/* default -quality value */
+  q_scale_factor = 100;		/* default to no scaling for -qtables */
+  force_baseline = FALSE;	/* by default, allow 16-bit quantizers */
+  simple_progressive = FALSE;
+  is_targa = FALSE;
+  outfilename = NULL;
+  cinfo->err->trace_level = 0;
+  if (default_maxmem > 0)	/* override library's default value */
+    cinfo->mem->max_memory_to_use = default_maxmem;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+	outfilename = NULL;	/* -outfile applies to just one input file */
+	continue;		/* ignore this name if previously processed */
+      }
+      break;			/* else done parsing switches */
+    }
+    arg++;			/* advance past switch marker character */
+
+    if (keymatch(arg, "arithmetic", 1)) {
+      /* Use arithmetic coding. */
+#ifdef C_ARITH_CODING_SUPPORTED
+      cinfo->arith_code = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, arithmetic coding not supported\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "baseline", 1)) {
+      /* Force baseline-compatible output (8-bit quantizer values). */
+      force_baseline = TRUE;
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED	/* SIMD Ext: -no* call jpeg_simd_mask() */
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select DCT algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "int", 1)) {
+	cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+	cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+	cinfo->dct_method = JDCT_FLOAT;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts. */
+      /* On first -d or -v, print version identification (once per run) */
+      static boolean printed_version = FALSE;
+
+      if (! printed_version) {
+	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
+		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_forward_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_forward_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_forward_dct(cinfo, JDCT_FLOAT));
+#endif
+	print_simd_info(stderr, "Downsampling (-sample 2x2 or 2x1)  :",
+			jpeg_simd_downsampler(cinfo));
+	print_simd_info(stderr, "Colorspace conversion (RGB->YCbCr) :",
+			jpeg_simd_color_converter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+	printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;	/* repeated switches raise the level */
+
+    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+      /* Force a monochrome JPEG file to be generated. */
+      jpeg_set_colorspace(cinfo, JCS_GRAYSCALE);
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in units of 1000 bytes (1000000 bytes with 'm'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (ch == 'm' || ch == 'M')
+	lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
+      /* Enable entropy parm optimization. */
+#ifdef ENTROPY_OPT_SUPPORTED
+      cinfo->optimize_coding = TRUE;
+#else
+      fprintf(stderr, "%s: sorry, entropy optimization was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      outfilename = argv[argn];	/* save it away for later use */
+
+    } else if (keymatch(arg, "progressive", 1)) {
+      /* Select simple progressive mode. */
+#ifdef C_PROGRESSIVE_SUPPORTED
+      simple_progressive = TRUE;
+      /* We must postpone execution until num_components is known. */
+#else
+      fprintf(stderr, "%s: sorry, progressive output was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "quality", 1)) {
+      /* Quality factor (quantization table scaling factor). */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &quality) != 1)
+	usage();
+      /* Change scale factor in case -qtables is present. */
+      q_scale_factor = jpeg_quality_scaling(quality);
+
+    } else if (keymatch(arg, "qslots", 2)) {
+      /* Quantization table slot numbers. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      qslotsarg = argv[argn];
+      /* Must delay setting qslots until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default quant table numbers.
+       */
+
+    } else if (keymatch(arg, "qtables", 2)) {
+      /* Quantization tables fetched from file. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      qtablefile = argv[argn];
+      /* We postpone actually reading the file in case -quality comes later. */
+
+    } else if (keymatch(arg, "restart", 1)) {
+      /* Restart interval in MCU rows (or in MCUs with 'b'). */
+      long lval;
+      char ch = 'x';
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (lval < 0 || lval > 65535L)	/* accept only 0..65535 */
+	usage();
+      if (ch == 'b' || ch == 'B') {
+	cinfo->restart_interval = (unsigned int) lval;
+	cinfo->restart_in_rows = 0; /* else prior '-restart n' overrides me */
+      } else {
+	cinfo->restart_in_rows = (int) lval;
+	/* restart_interval will be computed during startup */
+      }
+
+    } else if (keymatch(arg, "sample", 2)) {
+      /* Set sampling factors. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      samplearg = argv[argn];
+      /* Must delay setting sample factors until after we have processed any
+       * colorspace-determining switches, since jpeg_set_colorspace sets
+       * default sampling factors.
+       */
+
+    } else if (keymatch(arg, "scans", 2)) {
+      /* Set scan script. */
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      scansarg = argv[argn];
+      /* We must postpone reading the file in case -progressive appears. */
+#else
+      fprintf(stderr, "%s: sorry, multi-scan output was not compiled\n",
+	      progname);
+      exit(EXIT_FAILURE);
+#endif
+
+    } else if (keymatch(arg, "smooth", 2)) {
+      /* Set input smoothing factor. */
+      int val;
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+	usage();
+      if (val < 0 || val > 100)
+	usage();
+      cinfo->smoothing_factor = val;
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Input file is Targa format. */
+      is_targa = TRUE;
+
+    } else {
+      usage();			/* bogus switch */
+    }
+  }
+
+  /* Post-switch-scanning cleanup: apply the settings that were deferred */
+
+  if (for_real) {
+
+    /* Set quantization tables for selected quality. */
+    /* Some or all may be overridden if -qtables is present. */
+    jpeg_set_quality(cinfo, quality, force_baseline);
+
+    if (qtablefile != NULL)	/* process -qtables if it was present */
+      if (! read_quant_tables(cinfo, qtablefile,
+			      q_scale_factor, force_baseline))
+	usage();
+
+    if (qslotsarg != NULL)	/* process -qslots if it was present */
+      if (! set_quant_slots(cinfo, qslotsarg))
+	usage();
+
+    if (samplearg != NULL)	/* process -sample if it was present */
+      if (! set_sample_factors(cinfo, samplearg))
+	usage();
+
+#ifdef C_PROGRESSIVE_SUPPORTED
+    if (simple_progressive)	/* process -progressive; -scans can override */
+      jpeg_simple_progression(cinfo);
+#endif
+
+#ifdef C_MULTISCAN_FILES_SUPPORTED
+    if (scansarg != NULL)	/* process -scans if it was present */
+      if (! read_scan_script(cinfo, scansarg))
+	usage();
+#endif
+  }
+
+  return argn;			/* return index of next arg (file name) */
+}
+
+
+/*
+ * Check for overwrite of an existing file; clear it with user.
+ * Returns TRUE if outfname does not exist or the user answers 'y'/'Y';
+ * returns FALSE if the user answers 'n'/'N' or stdin ends without a 'y'.
+ */
+
+#ifndef NO_OVERWRITE_CHECK
+
+LOCAL(boolean)
+is_write_ok (char * outfname)
+{
+  FILE * ofile;
+  int ch, c;
+
+  ofile = fopen(outfname, READ_BINARY);
+  if (ofile == NULL)
+    return TRUE;		/* not present */
+  fclose(ofile);		/* oops, it is present */
+  for (;;) {
+    fprintf(stderr, "%s already exists, overwrite it? [y/n] ", outfname);
+    fflush(stderr);
+    ch = c = getc(stdin);
+    while (c != '\n' && c != EOF)	/* flush rest of line */
+      c = getc(stdin);
+    if (c == EOF && ch != 'Y' && ch != 'y')
+      return FALSE;		/* input exhausted: stop asking, don't overwrite */
+    switch (ch) {
+    case 'Y':
+    case 'y':
+      return TRUE;
+    case 'N':
+    case 'n':
+      return FALSE;
+    /* otherwise, ask again */
+    }
+  }
+}
+
+#endif
+
+
+/*
+ * Process a single input file name, and return its index in argv[].
+ * File names at or to left of old_file_index have been processed already.
+ * If the input can't be opened, or the output can't be named or (over)written,
+ * the file is skipped; its index is still returned so the caller moves on.
+ */
+
+LOCAL(int)
+process_one_file (int argc, char **argv, int old_file_index)
+{
+  struct jpeg_compress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  char *infilename;
+  char workfilename[PATH_MAX];
+#ifdef PROGRESS_REPORT
+  struct cdjpeg_progress_mgr progress;
+#endif
+  int file_index;
+  cjpeg_source_ptr src_mgr;
+  FILE * input_file = NULL;
+  FILE * output_file = NULL;
+  JDIMENSION num_scanlines;
+
+  /* Initialize the JPEG compression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_compress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Now safe to enable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) &cinfo);
+#endif
+
+  /* Initialize JPEG parameters.
+   * Much of this may be overridden later.
+   * In particular, we don't yet know the input file's color space,
+   * but we need to provide some value for jpeg_set_defaults() to work.
+   */
+
+  cinfo.in_color_space = JCS_RGB; /* arbitrary guess */
+  jpeg_set_defaults(&cinfo);
+
+  /* Scan command line to find next file name.
+   * It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, FALSE);
+  if (file_index >= argc) {
+    fprintf(stderr, "%s: missing input file name\n", progname);
+    usage();
+  }
+
+  /* Open the input file. */
+  infilename = argv[file_index];
+  if ((input_file = fopen(infilename, READ_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, infilename);
+    goto fail;
+  }
+
+#ifdef PROGRESS_REPORT
+  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+#endif
+
+  /* Figure out the input file format, and set up to read it. */
+  src_mgr = select_file_type(&cinfo, input_file);
+  src_mgr->input_file = input_file;
+
+  /* Read the input file header to obtain file size & colorspace. */
+  (*src_mgr->start_input) (&cinfo, src_mgr);
+
+  /* Now that we know input colorspace, fix colorspace-dependent defaults */
+  jpeg_default_colorspace(&cinfo);
+
+  /* Adjust default compression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, TRUE);
+
+  /* If user didn't supply -outfile switch, select output file name. */
+  if (outfilename == NULL) {
+    int i;
+
+    /* workfilename must hold all of infilename plus ".jpg" and the NUL. */
+    if (strlen(infilename) + 5 > sizeof(workfilename)) {
+      fprintf(stderr, "%s: file name too long: %s\n", progname, infilename);
+      goto fail;
+    }
+    outfilename = workfilename;
+    /* Make outfilename be infilename with .jpg substituted for extension */
+    strcpy(outfilename, infilename);
+    for (i = (int) strlen(outfilename) - 1; i >= 0; i--) {
+      if (outfilename[i] == '.')
+	outfilename[i] = '\0';	/* lop off existing extension */
+      else if (outfilename[i] != ':' && outfilename[i] != '/' &&
+	       outfilename[i] != '\\')
+	continue;		/* ordinary character: keep scanning */
+      break;			/* stop at extension dot or path separator */
+    }
+    strcat(outfilename, ".jpg");
+  }
+
+  fprintf(stderr, "Compressing %s => %s\n", infilename, outfilename);
+#ifndef NO_OVERWRITE_CHECK
+  if (! is_write_ok(outfilename))
+    goto fail;
+#endif
+
+  /* Open the output file. */
+  if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't create %s\n", progname, outfilename);
+    goto fail;
+  }
+
+  /* Specify data destination for compression */
+  jpeg_stdio_dest(&cinfo, output_file);
+
+  /* Start compressor */
+  jpeg_start_compress(&cinfo, TRUE);
+
+  /* Process data */
+  while (cinfo.next_scanline < cinfo.image_height) {
+    num_scanlines = (*src_mgr->get_pixel_rows) (&cinfo, src_mgr);
+    (void) jpeg_write_scanlines(&cinfo, src_mgr->buffer, num_scanlines);
+  }
+
+  /* Finish compression and release memory */
+  (*src_mgr->finish_input) (&cinfo, src_mgr);
+  jpeg_finish_compress(&cinfo);
+
+  /* Clean up and exit */
+fail:
+  jpeg_destroy_compress(&cinfo);
+
+  if (input_file != NULL) fclose(input_file);
+  if (output_file != NULL) fclose(output_file);
+
+#ifdef PROGRESS_REPORT
+  end_progress_monitor((j_common_ptr) &cinfo);
+#endif
+
+  /* Disable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) NULL);
+#endif
+
+  return file_index;
+}
+
+
+/*
+ * The main program.
+ */
+
+int
+main (int argc, char **argv)
+/* Program entry: compress every input file named on the command line.
+ * Always finishes through exit(EXIT_SUCCESS) once the file loop ends.
+ */
+{
+  int last_done;		/* argv[] index of the file handled most recently */
+
+  /* On Mac, fetch a command line. */
+#ifdef USE_CCOMMAND
+  argc = ccommand(&argv);
+#endif
+
+#ifdef MSDOS
+  progname = "cjpeg";		/* DOS tends to be too verbose about argv[0] */
+#else
+  /* Use a fixed name in case C library doesn't provide argv[0] */
+  progname = (argv[0] != NULL && argv[0][0] != 0) ? argv[0] : "cjpeg";
+#endif
+
+  /* The default maxmem must be computed only once at program startup,
+   * since releasing memory with free() won't give it back to the OS.
+   */
+#ifdef FREE_MEM_ESTIMATE
+  default_maxmem = FREE_MEM_ESTIMATE;
+#else
+  default_maxmem = 0;
+#endif
+
+  if (argc < 2)
+    usage();			/* nothing on the command line?? */
+
+  /* Scan command line, parse switches and locate input file names; */
+  /* each call handles one file and returns where in argv[] it was found. */
+  for (last_done = 0; last_done < argc-1; )
+    last_done = process_one_file(argc, argv, last_done);
+
+  /* All done. */
+  exit(EXIT_SUCCESS);
+  return 0;			/* suppress no-return-value warnings */
+}
diff --git a/altui/djpeg.c b/altui/djpeg.c
new file mode 100644
index 0000000..a000d45
--- /dev/null
+++ b/altui/djpeg.c
@@ -0,0 +1,836 @@
+/*
+ * alternate djpeg.c
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 6, 2006
+ * ---------------------------------------------------------------------
+ *
+ * This file contains an alternate user interface for the JPEG decompressor.
+ * One or more input files are named on the command line, and output file
+ * names are created by substituting an appropriate extension.
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+#include "jversion.h"		/* for version message */
+
+#include <ctype.h>		/* to declare isprint() */
+
+#ifdef USE_CCOMMAND		/* command-line reader for Macintosh */
+#ifdef __MWERKS__
+#include <SIOUX.h>              /* Metrowerks needs this */
+#include <console.h>		/* ... and this */
+#endif
+#ifdef THINK_C
+#include <console.h>		/* Think declares it here */
+#endif
+#endif
+
+#ifndef PATH_MAX		/* ANSI maximum-pathname-length constant */
+#define PATH_MAX 256
+#endif
+
+
+/* Create the add-on message string table. */
+
+#define JMESSAGE(code,string)	string ,
+
+static const char * const cdjpeg_message_table[] = {
+#include "cderror.h"
+  NULL
+};
+
+
+/*
+ * SIMD Ext: compiler-specific hacks to enable filename wild-card expansion
+ */
+
+#ifdef _MSC_VER		/* Microsoft Visual C++ */
+/* from setargv.c (setargv.obj) */
+/* Tested under Visual C++ V6.0, Toolkit 2003, and 2005 Express Edition */
+int __cdecl _setargv(void) { int __cdecl __setargv(void); return __setargv(); }
+#endif
+#ifdef __BORLANDC__	/* Borland C++ */
+/* from wildargs.c (wildargs.obj) */
+/* Tested under Borland C++ Compiler 5.5 (win32) */
+#include <wildargs.h>
+typedef void _RTLENTRY (* _RTLENTRY _argv_expand_fnc)(char *, _PFN_ADDARG);
+_argv_expand_fnc _argv_expand_ptr = _expand_wild;
+#endif
+
+
+/*
+ * Automatic determination of available memory.
+ */
+
+static long default_maxmem;	/* saves value determined at startup, or 0 */
+
+#ifndef FREE_MEM_ESTIMATE	/* may be defined from command line */
+
+#ifdef MSDOS			/* For MS-DOS (unless flat-memory model) */
+
+#include <dos.h>		/* for access to intdos() call */
+
+LOCAL(long)
+unused_dos_memory (void)
+/* Return the amount of unallocated DOS memory, in bytes, as reported by DOS */
+{
+  union REGS regs;
+  long nparas;			/* available memory in 16-byte paragraphs */
+
+  regs.h.ah = 0x48;		/* DOS function Allocate Memory Block */
+  regs.x.bx = 0xFFFF;		/* Ask for more memory than DOS can have */
+  (void) intdos(&regs, &regs);
+  /* DOS will fail and return # of paragraphs actually available in BX. */
+  nparas = (unsigned int) regs.x.bx;
+  /* Times 16 to convert to bytes. */
+  return nparas << 4;
+}
+
+/* The default memory setting is 95% of the available space. */
+#define FREE_MEM_ESTIMATE  ((unused_dos_memory() * 95L) / 100L)
+
+#endif /* MSDOS */
+
+#ifdef ATARI			/* For Atari ST/STE/TT, Pure C or Turbo C */
+
+#include <ext.h>
+
+/* The default memory setting is 90% of the available space. */
+#define FREE_MEM_ESTIMATE  (((long) coreleft() * 90L) / 100L)
+
+#endif /* ATARI */
+
+/* Add memory-estimation procedures for other operating systems here,
+ * with appropriate #ifdef's around them.
+ */
+
+#endif /* !FREE_MEM_ESTIMATE */
+
+
+/*
+ * This list defines the known output image formats
+ * (not all of which need be supported by a given version).
+ * You can change the default output format by defining DEFAULT_FMT;
+ * indeed, you had better do so if you undefine PPM_SUPPORTED.
+ */
+
+typedef enum {
+	FMT_BMP,		/* BMP format (Windows flavor) */
+	FMT_GIF,		/* GIF format */
+	FMT_OS2,		/* BMP format (OS/2 flavor) */
+	FMT_PPM,		/* PPM/PGM (PBMPLUS formats) */
+	FMT_RLE,		/* RLE format */
+	FMT_TARGA,		/* Targa format */
+	FMT_TIFF		/* TIFF format */
+} IMAGE_FORMATS;
+
+#ifndef DEFAULT_FMT		/* so can override from CFLAGS in Makefile */
+#define DEFAULT_FMT	FMT_GIF
+#endif
+
+static IMAGE_FORMATS requested_fmt;
+
+
+/*
+ * Argument-parsing code.
+ * The switch parser is designed to be useful with DOS-style command line
+ * syntax, ie, intermixed switches and file names, where only the switches
+ * to the left of a given file name affect processing of that file.
+ */
+
+
+static const char * progname;	/* program name for error messages */
+static char * outfilename;	/* for -outfile switch */
+
+
+LOCAL(void)
+usage (void)
+/* Print the command-line synopsis and switch list to stderr, then exit */
+{
+  fprintf(stderr, "usage: %s [switches] inputfile(s)\n", progname);
+  fprintf(stderr, "List of input files may use wildcards (* and ?)\n");
+  fprintf(stderr, "Output filename is same as input filename except for extension\n");
+
+  fprintf(stderr, "Switches (names may be abbreviated):\n");
+  fprintf(stderr, "  -colors N      Reduce image to no more than N colors\n");
+  fprintf(stderr, "  -fast          Fast, low-quality processing\n");
+  fprintf(stderr, "  -grayscale     Force grayscale output\n");
+#ifdef IDCT_SCALING_SUPPORTED
+  fprintf(stderr, "  -scale M/N     Scale output image by fraction M/N, eg, 1/8\n");
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -bmp           Select BMP output format (Windows style)%s\n",
+	  (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
+#endif
+#ifdef GIF_SUPPORTED
+  fprintf(stderr, "  -gif           Select GIF output format%s\n",
+	  (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+#endif
+#ifdef BMP_SUPPORTED
+  fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
+	  (DEFAULT_FMT == FMT_OS2 ? " (default)" : ""));
+#endif
+#ifdef PPM_SUPPORTED
+  fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
+	  (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
+#endif
+#ifdef RLE_SUPPORTED
+  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
+	  (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
+#endif
+#ifdef TARGA_SUPPORTED
+  fprintf(stderr, "  -targa         Select Targa output format%s\n",
+	  (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
+#endif
+  fprintf(stderr, "Switches for advanced users:\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  fprintf(stderr, "  -dct int       Use integer DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_ISLOW ? " (default)" : ""));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  fprintf(stderr, "  -dct fast      Use fast integer DCT (less accurate)%s\n",
+	  (JDCT_DEFAULT == JDCT_IFAST ? " (default)" : ""));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  fprintf(stderr, "  -dct float     Use floating-point DCT method%s\n",
+	  (JDCT_DEFAULT == JDCT_FLOAT ? " (default)" : ""));
+#endif
+  fprintf(stderr, "  -dither fs     Use F-S dithering (default)\n");
+  fprintf(stderr, "  -dither none   Don't use dithering in quantization\n");
+  fprintf(stderr, "  -dither ordered  Use ordered dither (medium speed, quality)\n");
+#ifdef QUANT_2PASS_SUPPORTED
+  fprintf(stderr, "  -map FILE      Map to colors used in named image file\n");
+#endif
+  fprintf(stderr, "  -nosmooth      Don't use high-quality upsampling\n");
+#ifdef QUANT_1PASS_SUPPORTED
+  fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
+#endif
+#ifndef FREE_MEM_ESTIMATE
+  fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+#endif
+  fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
+  exit(EXIT_FAILURE);		/* usage() never returns to its caller */
+}
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, const char * labelstr, unsigned int simd)
+/* Print labelstr followed by the names of the SIMD sets flagged in simd */
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr, simd & JSIMD_MMX ? " MMX" : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
+LOCAL(int)
+parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
+		int last_file_arg_seen, boolean for_real)
+/* Parse optional switches, resetting requested_fmt and outfilename first.
+ * Returns argv[] index of first file-name argument (== argc if none).
+ * Any file names with indexes <= last_file_arg_seen are ignored;
+ * they have presumably been processed in a previous iteration.
+ * (Pass 0 for last_file_arg_seen on the first or only iteration.)
+ * for_real is FALSE on the first (dummy) pass; the -map color-map file
+ * is only opened and read when for_real is TRUE.
+ */
+{
+  int argn;
+  char * arg;
+
+  /* Set up default JPEG parameters. */
+  requested_fmt = DEFAULT_FMT;	/* set default output file format */
+  outfilename = NULL;
+  cinfo->err->trace_level = 0;
+  if (default_maxmem > 0)	/* override library's default value */
+    cinfo->mem->max_memory_to_use = default_maxmem;
+
+  /* Scan command line options, adjust parameters */
+
+  for (argn = 1; argn < argc; argn++) {
+    arg = argv[argn];
+    if (*arg != '-') {
+      /* Not a switch, must be a file name argument */
+      if (argn <= last_file_arg_seen) {
+	outfilename = NULL;	/* -outfile applies to just one input file */
+	continue;		/* ignore this name if previously processed */
+      }
+      break;			/* else done parsing switches */
+    }
+    arg++;			/* advance past switch marker character */
+
+    if (keymatch(arg, "bmp", 1)) {
+      /* BMP output format. */
+      requested_fmt = FMT_BMP;
+
+    } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
+	       keymatch(arg, "quantize", 1) || keymatch(arg, "quantise", 1)) {
+      /* Do color quantization. */
+      int val;
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d", &val) != 1)
+	usage();
+      cinfo->desired_number_of_colors = val;
+      cinfo->quantize_colors = TRUE;
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
+    } else if (keymatch(arg, "dct", 2)) {
+      /* Select IDCT algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "int", 1)) {
+	cinfo->dct_method = JDCT_ISLOW;
+      } else if (keymatch(argv[argn], "fast", 2)) {
+	cinfo->dct_method = JDCT_IFAST;
+      } else if (keymatch(argv[argn], "float", 2)) {
+	cinfo->dct_method = JDCT_FLOAT;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "dither", 2)) {
+      /* Select dithering algorithm. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (keymatch(argv[argn], "fs", 2)) {
+	cinfo->dither_mode = JDITHER_FS;
+      } else if (keymatch(argv[argn], "none", 2)) {
+	cinfo->dither_mode = JDITHER_NONE;
+      } else if (keymatch(argv[argn], "ordered", 2)) {
+	cinfo->dither_mode = JDITHER_ORDERED;
+      } else
+	usage();
+
+    } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
+      /* Enable debug printouts; each occurrence raises trace_level by one. */
+      /* On first -d, print version identification */
+      static boolean printed_version = FALSE;
+
+      if (! printed_version) {
+	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
+		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT));
+#endif
+#ifdef IDCT_SCALING_SUPPORTED
+	print_simd_info(stderr, "Reduced-size DCT      (-scale M/N) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT+1));
+#endif
+	print_simd_info(stderr, "High-quality upsampling (default)  :",
+			jpeg_simd_upsampler(cinfo, TRUE));
+	print_simd_info(stderr, "Low-quality upsampling (-nosmooth) :",
+			jpeg_simd_upsampler(cinfo, FALSE));
+	print_simd_info(stderr, "Colorspace conversion (YCbCr->RGB) :",
+			jpeg_simd_color_deconverter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+	printed_version = TRUE;
+      }
+      cinfo->err->trace_level++;
+
+    } else if (keymatch(arg, "fast", 1)) {
+      /* Select recommended processing options for quick-and-dirty output. */
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->dither_mode = JDITHER_ORDERED;
+      if (! cinfo->quantize_colors) /* don't override an earlier -colors */
+	cinfo->desired_number_of_colors = 216;
+      cinfo->dct_method = JDCT_FASTEST;
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "gif", 1)) {
+      /* GIF output format. */
+      requested_fmt = FMT_GIF;
+
+    } else if (keymatch(arg, "grayscale", 2) || keymatch(arg, "greyscale",2)) {
+      /* Force monochrome output. */
+      cinfo->out_color_space = JCS_GRAYSCALE;
+
+    } else if (keymatch(arg, "map", 3)) {
+      /* Quantize to a color map taken from an input file. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (for_real) {		/* too expensive to do twice! */
+#ifdef QUANT_2PASS_SUPPORTED	/* otherwise can't quantize to supplied map */
+	FILE * mapfile;
+
+	if ((mapfile = fopen(argv[argn], READ_BINARY)) == NULL) {
+	  fprintf(stderr, "%s: can't open %s\n", progname, argv[argn]);
+	  exit(EXIT_FAILURE);
+	}
+	read_color_map(cinfo, mapfile);
+	fclose(mapfile);
+	cinfo->quantize_colors = TRUE;
+#else
+	ERREXIT(cinfo, JERR_NOT_COMPILED);
+#endif
+      }
+
+    } else if (keymatch(arg, "maxmemory", 3)) {
+      /* Maximum memory in units of 1000 bytes (1000000 with 'm' suffix). */
+      long lval;
+      char ch = 'x';		/* stays 'x' when no suffix character is read */
+
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%ld%c", &lval, &ch) < 1)
+	usage();
+      if (ch == 'm' || ch == 'M')
+	lval *= 1000L;
+      cinfo->mem->max_memory_to_use = lval * 1000L;
+
+    } else if (keymatch(arg, "nosmooth", 3)) {
+      /* Suppress fancy upsampling */
+      cinfo->do_fancy_upsampling = FALSE;
+
+    } else if (keymatch(arg, "onepass", 3)) {
+      /* Use fast one-pass quantization. */
+      cinfo->two_pass_quantize = FALSE;
+
+    } else if (keymatch(arg, "os2", 3)) {
+      /* BMP output format (OS/2 flavor). */
+      requested_fmt = FMT_OS2;
+
+    } else if (keymatch(arg, "outfile", 4)) {
+      /* Set output file name. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      outfilename = argv[argn];	/* save it away for later use */
+
+    } else if (keymatch(arg, "pnm", 1) || keymatch(arg, "ppm", 1)) {
+      /* PPM/PGM output format. */
+      requested_fmt = FMT_PPM;
+
+    } else if (keymatch(arg, "rle", 1)) {
+      /* RLE output format. */
+      requested_fmt = FMT_RLE;
+
+    } else if (keymatch(arg, "scale", 1)) {
+      /* Scale the output image by a fraction M/N. */
+      if (++argn >= argc)	/* advance to next argument */
+	usage();
+      if (sscanf(argv[argn], "%d/%d",
+		 &cinfo->scale_num, &cinfo->scale_denom) != 2)
+	usage();
+
+    } else if (keymatch(arg, "targa", 1)) {
+      /* Targa output format. */
+      requested_fmt = FMT_TARGA;
+
+    } else {
+      usage();			/* bogus switch */
+    }
+  }
+
+  return argn;			/* return index of next arg (file name) */
+}
+
+
+/*
+ * Marker processor for COM and interesting APPn markers.
+ * This replaces the library's built-in processor, which just skips the marker.
+ * We want to print out the marker as text, to the extent possible.
+ * Note this code relies on a non-suspending data source.
+ */
+
+LOCAL(unsigned int)
+jpeg_getc (j_decompress_ptr cinfo)
+/* Read next byte from the data source, refilling its buffer when empty */
+{
+  struct jpeg_source_mgr * datasrc = cinfo->src;
+
+  if (datasrc->bytes_in_buffer == 0) {
+    if (! (*datasrc->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);	/* refill returned FALSE */
+  }
+  datasrc->bytes_in_buffer--;
+  return GETJOCTET(*datasrc->next_input_byte++);
+}
+
+
+METHODDEF(boolean)
+print_text_marker (j_decompress_ptr cinfo)
+{
+  boolean traceit = (cinfo->err->trace_level >= 1);	/* print only if tracing */
+  INT32 length;			/* marker payload bytes still to be read */
+  unsigned int ch;
+  unsigned int lastch = 0;	/* previous byte, so CR/LF prints one newline */
+
+  length = jpeg_getc(cinfo) << 8;	/* high byte of the length word first */
+  length += jpeg_getc(cinfo);
+  length -= 2;			/* discount the length word itself */
+
+  if (traceit) {
+    if (cinfo->unread_marker == JPEG_COM)
+      fprintf(stderr, "Comment, length %ld:\n", (long) length);
+    else			/* assume it is an APPn otherwise */
+      fprintf(stderr, "APP%d, length %ld:\n",
+	      cinfo->unread_marker - JPEG_APP0, (long) length);
+  }
+
+  while (--length >= 0) {
+    ch = jpeg_getc(cinfo);	/* bytes are consumed even when not tracing */
+    if (traceit) {
+      /* Emit the character in a readable form.
+       * Nonprintables are converted to \nnn form,
+       * while \ is converted to \\.
+       * Newlines in CR, CR/LF, or LF form will be printed as one newline.
+       */
+      if (ch == '\r') {
+	fprintf(stderr, "\n");
+      } else if (ch == '\n') {
+	if (lastch != '\r')
+	  fprintf(stderr, "\n");
+      } else if (ch == '\\') {
+	fprintf(stderr, "\\\\");
+      } else if (isprint(ch)) {
+	putc(ch, stderr);
+      } else {
+	fprintf(stderr, "\\%03o", ch);
+      }
+      lastch = ch;
+    }
+  }
+
+  if (traceit)
+    fprintf(stderr, "\n");
+
+  return TRUE;			/* this processor always returns TRUE */
+}
+
+
+/*
+ * Check for overwrite of an existing file; clear it with user
+ */
+
+#ifndef NO_OVERWRITE_CHECK
+
+LOCAL(boolean)
+is_write_ok (char * outfname)
+/* Return TRUE if outfname is absent or the user agrees to overwrite it.
+ * EOF on stdin counts as "no", so a closed stdin cannot hang the prompt. */
+{
+  FILE * ofile;
+  int ch, c;
+
+  ofile = fopen(outfname, READ_BINARY);
+  if (ofile == NULL)
+    return TRUE;		/* not present */
+  fclose(ofile);		/* oops, it is present */
+
+  for (;;) {
+    fprintf(stderr, "%s already exists, overwrite it? [y/n] ", outfname);
+    fflush(stderr);
+    c = ch = getc(stdin);	/* ch = the answer, c = line-flush cursor */
+    while (c != '\n' && c != EOF)	/* flush rest of line */
+      c = getc(stdin);
+    if (c == EOF && ch != 'Y' && ch != 'y')
+      return FALSE;		/* input exhausted: never overwrite unasked */
+    switch (ch) {
+    case 'Y': case 'y':
+      return TRUE;
+    case 'N': case 'n':
+      return FALSE;
+    default:
+      break;			/* otherwise, ask again */
+    }
+  }
+}
+
+#endif
+
+
+/*
+ * Process a single input file name, and return its index in argv[].
+ * File names at or to left of old_file_index have been processed already.
+ */
+
+LOCAL(int)
+process_one_file (int argc, char **argv, int old_file_index)
+/* Decompress the next unprocessed input file and return its argv[] index */
+{
+  struct jpeg_decompress_struct cinfo;
+  struct jpeg_error_mgr jerr;
+  char *infilename;
+  char workfilename[PATH_MAX];	/* holds the derived output file name */
+  const char *default_extension = NULL;
+#ifdef PROGRESS_REPORT
+  struct cdjpeg_progress_mgr progress;
+#endif
+  int file_index;
+  djpeg_dest_ptr dest_mgr = NULL;
+  FILE * input_file = NULL;
+  FILE * output_file = NULL;
+  JDIMENSION num_scanlines;
+
+  /* Initialize the JPEG decompression object with default error handling. */
+  cinfo.err = jpeg_std_error(&jerr);
+  jpeg_create_decompress(&cinfo);
+  /* Add some application-specific error messages (from cderror.h) */
+  jerr.addon_message_table = cdjpeg_message_table;
+  jerr.first_addon_message = JMSG_FIRSTADDONCODE;
+  jerr.last_addon_message = JMSG_LASTADDONCODE;
+
+  /* Insert custom marker processor for COM and APP12.
+   * APP12 is used by some digital camera makers for textual info,
+   * so we provide the ability to display it as text.
+   * If you like, additional APPn marker types can be selected for display,
+   * but don't try to override APP0 or APP14 this way (see libjpeg.doc).
+   */
+  jpeg_set_marker_processor(&cinfo, JPEG_COM, print_text_marker);
+  jpeg_set_marker_processor(&cinfo, JPEG_APP0+12, print_text_marker);
+
+  /* Now safe to enable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) &cinfo);
+#endif
+
+  /* Scan command line to find next file name.
+   * It is convenient to use just one switch-parsing routine, but the switch
+   * values read here are ignored; we will rescan the switches after opening
+   * the input file.
+   * (Exception: tracing level set here controls verbosity for COM markers
+   * found during jpeg_read_header...)
+   */
+
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, FALSE);
+  if (file_index >= argc) {
+    fprintf(stderr, "%s: missing input file name\n", progname);
+    usage();
+  }
+
+  /* Open the input file. */
+  infilename = argv[file_index];
+  if ((input_file = fopen(infilename, READ_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't open %s\n", progname, infilename);
+    goto fail;
+  }
+
+#ifdef PROGRESS_REPORT
+  start_progress_monitor((j_common_ptr) &cinfo, &progress);
+#endif
+
+  /* Specify data source for decompression */
+  jpeg_stdio_src(&cinfo, input_file);
+
+  /* Read file header, set default decompression parameters */
+  (void) jpeg_read_header(&cinfo, TRUE);
+
+  /* Adjust default decompression parameters by re-parsing the options */
+  file_index = parse_switches(&cinfo, argc, argv, old_file_index, TRUE);
+
+  /* Initialize the output module now to let it override any crucial
+   * option settings (for instance, GIF wants to force color quantization).
+   */
+  switch (requested_fmt) {
+#ifdef BMP_SUPPORTED
+  case FMT_BMP:
+    dest_mgr = jinit_write_bmp(&cinfo, FALSE);
+    default_extension = ".bmp";
+    break;
+  case FMT_OS2:
+    dest_mgr = jinit_write_bmp(&cinfo, TRUE);
+    default_extension = ".bmp";
+    break;
+#endif
+#ifdef GIF_SUPPORTED
+  case FMT_GIF:
+    dest_mgr = jinit_write_gif(&cinfo);
+    default_extension = ".gif";
+    break;
+#endif
+#ifdef PPM_SUPPORTED
+  case FMT_PPM:
+    dest_mgr = jinit_write_ppm(&cinfo);
+    default_extension = ".ppm";
+    break;
+#endif
+#ifdef RLE_SUPPORTED
+  case FMT_RLE:
+    dest_mgr = jinit_write_rle(&cinfo);
+    default_extension = ".rle";
+    break;
+#endif
+#ifdef TARGA_SUPPORTED
+  case FMT_TARGA:
+    dest_mgr = jinit_write_targa(&cinfo);
+    default_extension = ".tga";
+    break;
+#endif
+  default:
+    ERREXIT(&cinfo, JERR_UNSUPPORTED_FORMAT);
+    break;
+  }
+
+  /* If user didn't supply -outfile switch, select output file name. */
+  if (outfilename == NULL) {
+    int baselen = (int) strlen(infilename);	/* chars of infilename to keep */
+    int i;
+
+    /* Lop off an existing extension: the last '.' in the final path
+     * component.  A path separator ends the search. */
+    for (i = baselen-1; i >= 0; i--) {
+      char c = infilename[i];
+      if (c == ':' || c == '/' || c == '\\')
+	break;
+      if (c == '.') {
+	baselen = i;
+	break;
+      }
+    }
+    /* workfilename must hold base name + extension + terminating NUL */
+    if (baselen + strlen(default_extension) >= sizeof(workfilename)) {
+      fprintf(stderr, "%s: file name too long: %s\n", progname, infilename);
+      goto fail;
+    }
+    sprintf(workfilename, "%.*s%s", baselen, infilename, default_extension);
+    outfilename = workfilename;
+  }
+
+  fprintf(stderr, "Decompressing %s => %s\n", infilename, outfilename);
+#ifndef NO_OVERWRITE_CHECK
+  if (! is_write_ok(outfilename))
+    goto fail;
+#endif
+
+  /* Open the output file. */
+  if ((output_file = fopen(outfilename, WRITE_BINARY)) == NULL) {
+    fprintf(stderr, "%s: can't create %s\n", progname, outfilename);
+    goto fail;
+  }
+  dest_mgr->output_file = output_file;
+
+  /* Start decompressor */
+  (void) jpeg_start_decompress(&cinfo);
+
+  /* Write output file header */
+  (*dest_mgr->start_output) (&cinfo, dest_mgr);
+
+  /* Process data */
+  while (cinfo.output_scanline < cinfo.output_height) {
+    num_scanlines = jpeg_read_scanlines(&cinfo, dest_mgr->buffer,
+					dest_mgr->buffer_height);
+    (*dest_mgr->put_pixel_rows) (&cinfo, dest_mgr, num_scanlines);
+  }
+
+#ifdef PROGRESS_REPORT
+  /* Hack: count final pass as done in case finish_output does an extra pass.
+   * The library won't have updated completed_passes.
+   */
+  progress.pub.completed_passes = progress.pub.total_passes;
+#endif
+
+  /* Finish decompression and release memory.  The order matters: the output
+   * module has allocated memory of lifespan JPOOL_IMAGE, so it needs to
+   * finish before that memory is released. */
+  (*dest_mgr->finish_output) (&cinfo, dest_mgr);
+  (void) jpeg_finish_decompress(&cinfo);
+
+  /* Clean up and exit */
+fail:
+  jpeg_destroy_decompress(&cinfo);
+
+  if (input_file != NULL) fclose(input_file);
+  if (output_file != NULL) fclose(output_file);
+
+#ifdef PROGRESS_REPORT
+  end_progress_monitor((j_common_ptr) &cinfo);
+#endif
+
+  /* Disable signal catcher. */
+#ifdef NEED_SIGNAL_CATCHER
+  enable_signal_catcher((j_common_ptr) NULL);
+#endif
+
+  return file_index;
+}
+
+
+/*
+ * The main program: decompress every input file named on the command line.
+ */
+
+int
+main (int argc, char **argv)
+{
+  int file_index;		/* argv[] index of the last file processed */
+
+  /* On Mac, fetch a command line. */
+#ifdef USE_CCOMMAND
+  argc = ccommand(&argv);
+#endif
+
+#ifdef MSDOS
+  progname = "djpeg";		/* DOS tends to be too verbose about argv[0] */
+#else
+  progname = argv[0];
+  if (progname == NULL || progname[0] == 0)
+    progname = "djpeg";		/* in case C library doesn't provide it */
+#endif
+
+  /* The default maxmem must be computed only once at program startup,
+   * since releasing memory with free() won't give it back to the OS.
+   */
+#ifdef FREE_MEM_ESTIMATE
+  default_maxmem = FREE_MEM_ESTIMATE;
+#else
+  default_maxmem = 0;		/* parse_switches then keeps the library default */
+#endif
+
+  /* Scan command line, parse switches and locate input file names */
+
+  if (argc < 2)
+    usage();			/* nothing on the command line?? */
+
+  file_index = 0;		/* 0 = no file name processed yet */
+
+  while (file_index < argc-1)	/* until the last argument is consumed */
+    file_index = process_one_file(argc, argv, file_index);
+
+  /* All done.  The exit status is EXIT_SUCCESS whenever we get this far. */
+  exit(EXIT_SUCCESS);
+  return 0;			/* suppress no-return-value warnings */
+}
diff --git a/altui/usage.alt b/altui/usage.alt
new file mode 100644
index 0000000..277332b
--- /dev/null
+++ b/altui/usage.alt
@@ -0,0 +1,62 @@
+(Most of the standard usage.doc file also applies to this alternate version,
+but replace its "GENERAL USAGE" section with the text below.  Edit the text
+as necessary if you don't support wildcards or overwrite checking.  Be sure
+to fix the djpeg switch descriptions if you are not defaulting to PPM output.
+Also, if you've provided an accurate memory-estimation procedure, you can
+probably eliminate the HINTS related to the -maxmemory switch.)
+
+
+GENERAL USAGE
+
+We provide two programs, cjpeg to compress an image file into JPEG format,
+and djpeg to decompress a JPEG file back into a conventional image format.
+
+The basic command line is:
+	cjpeg [switches] list of image files
+or
+	djpeg [switches] list of jpeg files
+
+Each file named is compressed or decompressed.  The input file(s) are not
+modified; the output data is written to files which have the same names
+except for extension.  cjpeg always uses ".jpg" for the output file name's
+extension; djpeg uses one of ".bmp", ".gif", ".ppm", ".rle", or ".tga",
+depending on what output format is selected by the switches.
+
+For example, to convert xxx.bmp to xxx.jpg and yyy.ppm to yyy.jpg, say:
+	cjpeg xxx.bmp yyy.ppm
+
+On most systems you can use standard wildcards to specify the list of input
+files; for example, on DOS "djpeg *.jpg" decompresses all the JPEG files in
+the current directory.
+
+If an intended output file already exists, you'll be asked whether or not to
+overwrite it.  If you say no, the program skips that input file and goes on
+to the next one.
+
+You can intermix switches and file names; for example
+	djpeg -gif file1.jpg -targa file2.jpg
+decompresses file1.jpg into GIF format (file1.gif) and file2.jpg into Targa
+format (file2.tga).  Only switches to the left of a given file name affect
+processing of that file; when there are conflicting switches, the rightmost
+one takes precedence.
+
+You can override the program's choice of output file name by using the
+-outfile switch, as in
+	cjpeg -outfile output.jpg input.ppm
+-outfile only affects the first input file name to its right.
+
+The currently supported image file formats are: PPM (PBMPLUS color format),
+PGM (PBMPLUS gray-scale format), BMP, GIF, Targa, and RLE (Utah Raster
+Toolkit format).  (RLE is supported only if the URT library is available,
+which it isn't on most non-Unix systems.)  cjpeg recognizes the input image
+format automatically, with the exception of some Targa-format files.  You
+have to tell djpeg which format to generate.
+
+JPEG files are in the de facto standard JFIF file format.  There are other,
+less widely used JPEG-based file formats, but we don't support them.
+
+All switch names may be abbreviated; for example, -grayscale may be written
+-gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
+one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
+British spellings are also accepted (e.g., -greyscale), though for brevity
+these are not mentioned below.
diff --git a/cjpeg.c b/cjpeg.c
index f2a929f..10f5f5b 100644
--- a/cjpeg.c
+++ b/cjpeg.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : August 23, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains a command-line user interface for the JPEG compressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
  *
@@ -195,6 +202,22 @@
 }
 
 
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+LOCAL(void)
+print_simd_info (FILE * file, const char * labelstr, unsigned int simd)
+/* Print labelstr followed by the names of the SIMD sets flagged in simd */
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr, simd & JSIMD_MMX ? " MMX" : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
 LOCAL(int)
 parse_switches (j_compress_ptr cinfo, int argc, char **argv,
 		int last_file_arg_seen, boolean for_real)
@@ -258,6 +281,19 @@
       /* Force baseline-compatible output (8-bit quantizer values). */
       force_baseline = TRUE;
 
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
     } else if (keymatch(arg, "dct", 2)) {
       /* Select DCT algorithm. */
       if (++argn >= argc)	/* advance to next argument */
@@ -279,6 +315,32 @@
       if (! printed_version) {
 	fprintf(stderr, "Independent JPEG Group's CJPEG, version %s\n%s\n",
 		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_forward_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_forward_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_forward_dct(cinfo, JDCT_FLOAT));
+#endif
+	print_simd_info(stderr, "Downsampling (-sample 2x2 or 2x1)  :",
+			jpeg_simd_downsampler(cinfo));
+	print_simd_info(stderr, "Colorspace conversion (RGB->YCbCr) :",
+			jpeg_simd_color_converter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 	printed_version = TRUE;
       }
       cinfo->err->trace_level++;
diff --git a/ckconfig.c b/ckconfig.c
index 34baf79..ba380dc 100644
--- a/ckconfig.c
+++ b/ckconfig.c
@@ -4,6 +4,13 @@
  * Copyright (C) 1991-1994, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
+ *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 28, 2005
+ * ---------------------------------------------------------------------
  */
 
 /*
@@ -362,12 +369,24 @@
 #else
   fprintf(outfile, "#undef INCOMPLETE_TYPES_BROKEN\n");
 #endif
+#ifdef _WIN32
+  fprintf(outfile, "\n/* Define \"boolean\" as unsigned char, not int, per Windows custom */\n");
+  fprintf(outfile, "#define TYPEDEF_UCHAR_BOOLEAN\n");
+#endif
   fprintf(outfile, "\n#ifdef JPEG_INTERNALS\n\n");
   if (is_shifting_signed(-0x7F7E80B1L))
     fprintf(outfile, "#undef RIGHT_SHIFT_IS_UNSIGNED\n");
   else
     fprintf(outfile, "#define RIGHT_SHIFT_IS_UNSIGNED\n");
   fprintf(outfile, "\n#endif /* JPEG_INTERNALS */\n");
+
+  fprintf(outfile, "\n#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)\n");
+  fprintf(outfile, "#undef JSIMD_MMX_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_3DNOW_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_SSE_NOT_SUPPORTED\n");
+  fprintf(outfile, "#undef JSIMD_SSE2_NOT_SUPPORTED\n");
+  fprintf(outfile, "#endif\n");
+
   fprintf(outfile, "\n#ifdef JPEG_CJPEG_DJPEG\n\n");
   fprintf(outfile, "#define BMP_SUPPORTED		/* BMP image file format */\n");
   fprintf(outfile, "#define GIF_SUPPORTED		/* GIF image file format */\n");
@@ -375,6 +394,9 @@
   fprintf(outfile, "#undef RLE_SUPPORTED		/* Utah RLE image file format */\n");
   fprintf(outfile, "#define TARGA_SUPPORTED		/* Targa image file format */\n\n");
   fprintf(outfile, "#undef TWO_FILE_COMMANDLINE	/* You may need this on non-Unix systems */\n");
+#ifdef _WIN32
+  fprintf(outfile, "#define USE_SETMODE		/* Needed to make one-file style work */\n");
+#endif
   fprintf(outfile, "#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */\n");
   fprintf(outfile, "#undef DONT_USE_B_MODE\n");
   fprintf(outfile, "/* #define PROGRESS_REPORT */	/* optional */\n");
diff --git a/config.guess b/config.guess
old mode 100755
new mode 100644
index 413ed41..fb25fa4
--- a/config.guess
+++ b/config.guess
@@ -1,7 +1,10 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
-#
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+
+timestamp='2006-01-30'
+
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -14,154 +17,326 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
 # the same distribution terms that you use for the rest of that program.
 
-# Written by Per Bothner <bothner@cygnus.com>.
-# The master version of this file is at the FSF in /home/gd/gnu/lib.
+
+# Originally written by Per Bothner <per@bothner.com>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
 #
 # This script attempts to guess a canonical system name similar to
 # config.sub.  If it succeeds, it prints the system name on stdout, and
 # exits with 0.  Otherwise, it exits with 1.
 #
 # The plan is that this can be called by configure scripts if you
-# don't specify an explicit system type (host/target name).
-#
-# Only a few systems have been added to this list; please add others
-# (but try to keep the structure clean).
-#
+# don't specify an explicit build system type.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
 
 # This is needed to find uname on a Pyramid OSx when run in the BSD universe.
-# (ghazi@noc.rutgers.edu 8/24/94.)
+# (ghazi@noc.rutgers.edu 1994-08-24)
 if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
 	PATH=$PATH:/.attbin ; export PATH
 fi
 
 UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
 UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
-UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 
-trap 'rm -f dummy.c dummy.o dummy; exit 1' 1 2 15
-
 # Note: order is significant - the case branches are not exclusive.
 
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep __ELF__ >/dev/null
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+	        os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit ;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+	exit ;;
+    *:ekkoBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+	exit ;;
+    *:SolidBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+	exit ;;
+    macppc:MirBSD:*:*)
+	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    *:MirBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
     alpha:OSF1:*:*)
-	if test $UNAME_RELEASE = "V4.0"; then
+	case $UNAME_RELEASE in
+	*4.0)
 		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
-	fi
+		;;
+	*5.*)
+	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Pn.n version is a patched version.
 	# A Vn.n version is a released version.
 	# A Tn.n version is a released field test version.
 	# A Xn.n version is an unreleased experimental baselevel.
 	# 1.2 uses "1.2" for uname -r.
-	cat <<EOF >dummy.s
-	.globl main
-	.ent main
-main:
-	.frame \$30,0,\$26,0
-	.prologue 0
-	.long 0x47e03d80 # implver $0
-	lda \$2,259
-	.long 0x47e20c21 # amask $2,$1
-	srl \$1,8,\$2
-	sll \$2,2,\$2
-	sll \$0,3,\$0
-	addl \$1,\$0,\$0
-	addl \$2,\$0,\$0
-	ret \$31,(\$26),1
-	.end main
-EOF
-	${CC-cc} dummy.s -o dummy 2>/dev/null
-	if test "$?" = 0 ; then
-		./dummy
-		case "$?" in
-			7)
-				UNAME_MACHINE="alpha"
-				;;
-			15)
-				UNAME_MACHINE="alphaev5"
-				;;
-			14)
-				UNAME_MACHINE="alphaev56"
-				;;
-			10)
-				UNAME_MACHINE="alphapca56"
-				;;
-			16)
-				UNAME_MACHINE="alphaev6"
-				;;
-		esac
-	fi
-	rm -f dummy.s dummy
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr [[A-Z]] [[a-z]]`
-	exit 0 ;;
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	exit ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit ;;
     21064:Windows_NT:50:3)
 	echo alpha-dec-winnt3.5
-	exit 0 ;;
+	exit ;;
     Amiga*:UNIX_System_V:4.0:*)
-	echo m68k-cbm-sysv4
-	exit 0;;
-    amiga:NetBSD:*:*)
-      echo m68k-cbm-netbsd${UNAME_RELEASE}
-      exit 0 ;;
-    amiga:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    arc64:OpenBSD:*:*)
-	echo mips64el-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    arc:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    hkmips:OpenBSD:*:*)
-	echo mips-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    pmax:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sgi:OpenBSD:*:*)
-	echo mips-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    wgrisc:OpenBSD:*:*)
-	echo mipsel-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
+	echo m68k-unknown-sysv4
+	exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit ;;
+    *:z/VM:*:*)
+	echo s390-ibm-zvmoe
+	exit ;;
+    *:OS400:*:*)
+        echo powerpc-ibm-os400
+	exit ;;
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
-	exit 0;;
-    arm32:NetBSD:*:*)
-	echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
-    SR2?01:HI-UX/MPP:*:*)
+	exit ;;
+    arm:riscos:*:*|arm:RISCOS:*:*)
+	echo arm-unknown-riscos
+	exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
 	echo hppa1.1-hitachi-hiuxmpp
-	exit 0;;
-    Pyramid*:OSx*:*:*|MIS*:OSx*:*:*)
+	exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
 	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
 	if test "`(/bin/universe) 2>/dev/null`" = att ; then
 		echo pyramid-pyramid-sysv3
 	else
 		echo pyramid-pyramid-bsd
 	fi
-	exit 0 ;;
-    NILE:*:*:dcosx)
+	exit ;;
+    NILE*:*:*:dcosx)
 	echo pyramid-pyramid-svr4
-	exit 0 ;;
+	exit ;;
+    DRS?6000:unix:4.0:6*)
+	echo sparc-icl-nx6
+	exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7; exit ;;
+	esac ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
     sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
 	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
+	exit ;;
     i86pc:SunOS:5.*:*)
 	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
+	exit ;;
     sun4*:SunOS:6*:*)
 	# According to config.sub, this is the proper way to canonicalize
 	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
 	# it's likely to be more like Solaris than SunOS4.
 	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
+	exit ;;
     sun4*:SunOS:*:*)
 	case "`/usr/bin/arch -k`" in
 	    Series*|S4*)
@@ -170,12 +345,12 @@
 	esac
 	# Japanese Language versions have a version number like `4.1.3-JL'.
 	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
-	exit 0 ;;
+	exit ;;
     sun3*:SunOS:*:*)
 	echo m68k-sun-sunos${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     sun*:*:4.2BSD:*)
-	UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
 	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
 	case "`/bin/arch`" in
 	    sun3)
@@ -185,52 +360,63 @@
 		echo sparc-sun-sunos${UNAME_RELEASE}
 		;;
 	esac
-	exit 0 ;;
+	exit ;;
     aushp:SunOS:*:*)
 	echo sparc-auspex-sunos${UNAME_RELEASE}
-	exit 0 ;;
-    atari*:NetBSD:*:*)
-	echo m68k-atari-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    atari*:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sun3*:NetBSD:*:*)
-	echo m68k-sun-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    sun3*:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mac68k:NetBSD:*:*)
-	echo m68k-apple-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mac68k:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mvme68k:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    mvme88k:OpenBSD:*:*)
-	echo m88k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+        exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+        echo m68k-milan-mint${UNAME_RELEASE}
+        exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+        echo m68k-hades-mint${UNAME_RELEASE}
+        exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+        echo m68k-unknown-mint${UNAME_RELEASE}
+        exit ;;
+    m68k:machten:*:*)
+	echo m68k-apple-machten${UNAME_RELEASE}
+	exit ;;
     powerpc:machten:*:*)
 	echo powerpc-apple-machten${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     RISC*:Mach:*:*)
 	echo mips-dec-mach_bsd4.3
-	exit 0 ;;
+	exit ;;
     RISC*:ULTRIX:*:*)
 	echo mips-dec-ultrix${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     VAX*:ULTRIX*:*:*)
 	echo vax-dec-ultrix${UNAME_RELEASE}
-	exit 0 ;;
-    2020:CLIX:*:*)
+	exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
 	echo clipper-intergraph-clix${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     mips:*:*:UMIPS | mips:*:*:RISCos)
-	sed 's/^	//' << EOF >dummy.c
-	int main (argc, argv) int argc; char **argv; {
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
 	#if defined (host_mips) && defined (MIPSEB)
 	#if defined (SYSTYPE_SYSV)
 	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
@@ -245,62 +431,83 @@
 	  exit (-1);
 	}
 EOF
-	${CC-cc} dummy.c -o dummy \
-	  && ./dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
-	  && rm dummy.c dummy && exit 0
-	rm -f dummy.c dummy
+	$CC_FOR_BUILD -o $dummy $dummy.c &&
+	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`$dummy $dummyarg` &&
+	    { echo "$SYSTEM_NAME"; exit; }
 	echo mips-mips-riscos${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit ;;
     Night_Hawk:Power_UNIX:*:*)
 	echo powerpc-harris-powerunix
-	exit 0 ;;
+	exit ;;
     m88k:CX/UX:7*:*)
 	echo m88k-harris-cxux7
-	exit 0 ;;
+	exit ;;
     m88k:*:4*:R4*)
 	echo m88k-motorola-sysv4
-	exit 0 ;;
+	exit ;;
     m88k:*:3*:R3*)
 	echo m88k-motorola-sysv3
-	exit 0 ;;
+	exit ;;
     AViiON:dgux:*:*)
         # DG/UX returns AViiON for all architectures
         UNAME_PROCESSOR=`/usr/bin/uname -p`
-        if [ $UNAME_PROCESSOR = mc88100 -o $UNAME_PROCESSOR = mc88110 ] ; then
-	if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx \
-	     -o ${TARGET_BINARY_INTERFACE}x = x ] ; then
+	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+	then
+	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+	       [ ${TARGET_BINARY_INTERFACE}x = x ]
+	    then
 		echo m88k-dg-dgux${UNAME_RELEASE}
-	else
+	    else
 		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
 	fi
-        else echo i586-dg-dgux${UNAME_RELEASE}
-        fi
- 	exit 0 ;;
+ 	exit ;;
     M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
 	echo m88k-dolphin-sysv3
-	exit 0 ;;
+	exit ;;
     M88*:*:R3*:*)
 	# Delta 88k system running SVR3
 	echo m88k-motorola-sysv3
-	exit 0 ;;
+	exit ;;
     XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
 	echo m88k-tektronix-sysv3
-	exit 0 ;;
+	exit ;;
     Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
 	echo m68k-tektronix-bsd
-	exit 0 ;;
+	exit ;;
     *:IRIX*:*:*)
 	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
-	exit 0 ;;
+	exit ;;
     ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
-	echo romp-ibm-aix      # uname -m gives an 8 hex-code CPU id
-	exit 0 ;;              # Note that: echo "'`uname -s`'" gives 'AIX '
-    i?86:AIX:*:*)
+	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
 	echo i386-ibm-aix
-	exit 0 ;;
+	exit ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit ;;
     *:AIX:2:3)
 	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
-		sed 's/^		//' << EOF >dummy.c
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
 		#include <sys/systemcfg.h>
 
 		main()
@@ -311,17 +518,21 @@
 			exit(0);
 			}
 EOF
-		${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
-		rm -f dummy.c dummy
-		echo rs6000-ibm-aix3.2.5
+		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+		then
+			echo "$SYSTEM_NAME"
+		else
+			echo rs6000-ibm-aix3.2.5
+		fi
 	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
 		echo rs6000-ibm-aix3.2.4
 	else
 		echo rs6000-ibm-aix3.2
 	fi
-	exit 0 ;;
-    *:AIX:*:4)
-	if /usr/sbin/lsattr -EHl proc0 | grep POWER >/dev/null 2>&1; then
+	exit ;;
+    *:AIX:*:[45])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
 		IBM_ARCH=rs6000
 	else
 		IBM_ARCH=powerpc
@@ -329,43 +540,120 @@
 	if [ -x /usr/bin/oslevel ] ; then
 		IBM_REV=`/usr/bin/oslevel`
 	else
-		IBM_REV=4.${UNAME_RELEASE}
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
 	fi
 	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
-	exit 0 ;;
+	exit ;;
     *:AIX:*:*)
 	echo rs6000-ibm-aix
-	exit 0 ;;
+	exit ;;
     ibmrt:4.4BSD:*|romp-ibm:BSD:*)
 	echo romp-ibm-bsd4.4
-	exit 0 ;;
-    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC NetBSD and
+	exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
 	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
-	exit 0 ;;                           # report: romp-ibm BSD 4.3
+	exit ;;                             # report: romp-ibm BSD 4.3
     *:BOSX:*:*)
 	echo rs6000-bull-bosx
-	exit 0 ;;
+	exit ;;
     DPX/2?00:B.O.S.:*:*)
 	echo m68k-bull-sysv3
-	exit 0 ;;
+	exit ;;
     9000/[34]??:4.3bsd:1.*:*)
 	echo m68k-hp-bsd
-	exit 0 ;;
+	exit ;;
     hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
 	echo m68k-hp-bsd4.4
-	exit 0 ;;
-    9000/[3478]??:HP-UX:*:*)
+	exit ;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
 	case "${UNAME_MACHINE}" in
 	    9000/31? )            HP_ARCH=m68000 ;;
 	    9000/[34]?? )         HP_ARCH=m68k ;;
-	    9000/7?? | 9000/8?[1679] ) HP_ARCH=hppa1.1 ;;
-	    9000/8?? )            HP_ARCH=hppa1.0 ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                    case "${sc_cpu_version}" in
+                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                      532)                      # CPU_PA_RISC2_0
+                        case "${sc_kernel_bits}" in
+                          32) HP_ARCH="hppa2.0n" ;;
+                          64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                        esac ;;
+                    esac
+		fi
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^              //' << EOF >$dummy.c
+
+              #define _HPUX_SOURCE
+              #include <stdlib.h>
+              #include <unistd.h>
+
+              int main ()
+              {
+              #if defined(_SC_KERNEL_BITS)
+                  long bits = sysconf(_SC_KERNEL_BITS);
+              #endif
+                  long cpu  = sysconf (_SC_CPU_VERSION);
+
+                  switch (cpu)
+              	{
+              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+              	case CPU_PA_RISC2_0:
+              #if defined(_SC_KERNEL_BITS)
+              	    switch (bits)
+              		{
+              		case 64: puts ("hppa2.0w"); break;
+              		case 32: puts ("hppa2.0n"); break;
+              		default: puts ("hppa2.0"); break;
+              		} break;
+              #else  /* !defined(_SC_KERNEL_BITS) */
+              	    puts ("hppa2.0"); break;
+              #endif
+              	default: puts ("hppa1.0"); break;
+              	}
+                  exit (0);
+              }
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
 	esac
-	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	if [ ${HP_ARCH} = "hppa2.0w" ]
+	then
+	    eval $set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep __LP64__ >/dev/null
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
 	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
-	exit 0 ;;
+	exit ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit ;;
     3050*:HI-UX:*:*)
-	sed 's/^	//' << EOF >dummy.c
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
 	#include <unistd.h>
 	int
 	main ()
@@ -390,324 +678,467 @@
 	  exit (0);
 	}
 EOF
-	${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0
-	rm -f dummy.c dummy
+	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+		{ echo "$SYSTEM_NAME"; exit; }
 	echo unknown-hitachi-hiuxwe2
-	exit 0 ;;
+	exit ;;
     9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
 	echo hppa1.1-hp-bsd
-	exit 0 ;;
+	exit ;;
     9000/8??:4.3bsd:*:*)
 	echo hppa1.0-hp-bsd
-	exit 0 ;;
+	exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit ;;
     hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
 	echo hppa1.1-hp-osf
-	exit 0 ;;
+	exit ;;
     hp8??:OSF1:*:*)
 	echo hppa1.0-hp-osf
-	exit 0 ;;
-    i?86:OSF1:*:*)
+	exit ;;
+    i*86:OSF1:*:*)
 	if [ -x /usr/sbin/sysversion ] ; then
 	    echo ${UNAME_MACHINE}-unknown-osf1mk
 	else
 	    echo ${UNAME_MACHINE}-unknown-osf1
 	fi
-	exit 0 ;;
+	exit ;;
     parisc*:Lites*:*:*)
 	echo hppa1.1-hp-lites
-	exit 0 ;;
+	exit ;;
     C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
 	echo c1-convex-bsd
-        exit 0 ;;
+        exit ;;
     C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
 	if getsysinfo -f scalar_acc
 	then echo c32-convex-bsd
 	else echo c2-convex-bsd
 	fi
-        exit 0 ;;
+        exit ;;
     C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
 	echo c34-convex-bsd
-        exit 0 ;;
+        exit ;;
     C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
 	echo c38-convex-bsd
-        exit 0 ;;
+        exit ;;
     C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
 	echo c4-convex-bsd
-        exit 0 ;;
-    CRAY*X-MP:*:*:*)
-	echo xmp-cray-unicos
-        exit 0 ;;
+        exit ;;
     CRAY*Y-MP:*:*:*)
-	echo ymp-cray-unicos${UNAME_RELEASE}
-	exit 0 ;;
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
     CRAY*[A-Z]90:*:*:*)
 	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
 	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
-	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/
-	exit 0 ;;
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
     CRAY*TS:*:*:*)
-	echo t90-cray-unicos${UNAME_RELEASE}
-	exit 0 ;;
-    CRAY-2:*:*:*)
-	echo cray2-cray-unicos
-        exit 0 ;;
-    F300:UNIX_System_V:*:*)
-        FUJITSU_SYS=`uname -p | tr [A-Z] [a-z] | sed -e 's/\///'`
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    *:UNICOS/mp:*:*)
+	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
         FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-        echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-        exit 0 ;;
-    F301:UNIX_System_V:*:*)
-       echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'`
-       exit 0 ;;
-    hp3[0-9][05]:NetBSD:*:*)
-	echo m68k-hp-netbsd${UNAME_RELEASE}
-	exit 0 ;;
-    hp300:OpenBSD:*:*)
-	echo m68k-unknown-openbsd${UNAME_RELEASE}
-	exit 0 ;;
-    i?86:BSD/386:*:* | *:BSD/OS:*:*)
+        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+        exit ;;
+    5000:UNIX_System_V:4.*:*)
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
 	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
     *:FreeBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
-	exit 0 ;;
-    *:NetBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
-    *:OpenBSD:*:*)
-	echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
-	exit 0 ;;
+	case ${UNAME_MACHINE} in
+	    pc98)
+		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    *)
+		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	esac
+	exit ;;
     i*:CYGWIN*:*)
-	echo i386-pc-cygwin32
-	exit 0 ;;
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit ;;
     i*:MINGW*:*)
-	echo i386-pc-mingw32
-	exit 0 ;;
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit ;;
+    i*:windows32*:*)
+    	# uname -m includes "-pc" on this system.
+    	echo ${UNAME_MACHINE}-mingw32
+	exit ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit ;;
+    x86:Interix*:[345]*)
+	echo i586-pc-interix${UNAME_RELEASE}|sed -e 's/\..*//'
+	exit ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we set
+	# UNAME_MACHINE based on the output of uname instead of i386?
+	echo i586-pc-interix
+	exit ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	echo x86_64-unknown-cygwin
+	exit ;;
     p*:CYGWIN*:*)
-	echo powerpcle-unknown-cygwin32
-	exit 0 ;;
+	echo powerpcle-unknown-cygwin
+	exit ;;
     prep*:SunOS:5.*:*)
 	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
-	exit 0 ;;
+	exit ;;
     *:GNU:*:*)
+	# the GNU system
 	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
-	exit 0 ;;
-    *:Linux:*:*)
+	exit ;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	exit ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit ;;
+    arm*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    cris:Linux:*:*)
+	echo cris-axis-linux-gnu
+	exit ;;
+    crisv32:Linux:*:*)
+	echo crisv32-axis-linux-gnu
+	exit ;;
+    frv:Linux:*:*)
+    	echo frv-unknown-linux-gnu
+	exit ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m32r*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    mips:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mipsel
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips64
+	#undef mips64el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mips64el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips64
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    or32:Linux:*:*)
+	echo or32-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
+	exit ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
+	esac
+	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit ;;
+    sh64*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    vax:Linux:*:*)
+	echo ${UNAME_MACHINE}-dec-linux-gnu
+	exit ;;
+    x86_64:Linux:*:*)
+	echo x86_64-unknown-linux-gnu
+	exit ;;
+    i*86:Linux:*:*)
 	# The BFD linker knows what the default object file format is, so
-	# first see if it will tell us.
-	ld_help_string=`ld --help 2>&1`
-	ld_supported_emulations=`echo $ld_help_string \
-			 | sed -ne '/supported emulations:/!d
+	# first see if it will tell us. cd to the root directory to prevent
+	# problems with other programs or directories called `ld' in the path.
+	# Set LC_ALL=C to ensure ld outputs messages in English.
+	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
+			 | sed -ne '/supported targets:/!d
 				    s/[ 	][ 	]*/ /g
-				    s/.*supported emulations: *//
+				    s/.*supported targets: *//
 				    s/ .*//
 				    p'`
-        case "$ld_supported_emulations" in
-	  i?86linux)  echo "${UNAME_MACHINE}-pc-linux-gnuaout"      ; exit 0 ;;
-	  i?86coff)   echo "${UNAME_MACHINE}-pc-linux-gnucoff"      ; exit 0 ;;
-	  sparclinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
-	  m68klinux)  echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;;
-	  elf32ppc)   echo "powerpc-unknown-linux-gnu"              ; exit 0 ;;
+        case "$ld_supported_targets" in
+	  elf32-i386)
+		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
+		;;
+	  a.out-i386-linux)
+		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
+		exit ;;
+	  coff-i386)
+		echo "${UNAME_MACHINE}-pc-linux-gnucoff"
+		exit ;;
+	  "")
+		# Either a pre-BFD a.out linker (linux-gnuoldld) or
+		# one that does not give us useful --help.
+		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
+		exit ;;
 	esac
-
-	if test "${UNAME_MACHINE}" = "alpha" ; then
-		sed 's/^	//'  <<EOF >dummy.s
-		.globl main
-		.ent main
-	main:
-		.frame \$30,0,\$26,0
-		.prologue 0
-		.long 0x47e03d80 # implver $0
-		lda \$2,259
-		.long 0x47e20c21 # amask $2,$1
-		srl \$1,8,\$2
-		sll \$2,2,\$2
-		sll \$0,3,\$0
-		addl \$1,\$0,\$0
-		addl \$2,\$0,\$0
-		ret \$31,(\$26),1
-		.end main
+	# Determine whether the default compiler is a.out or elf
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#ifdef __ELF__
+	# ifdef __GLIBC__
+	#  if __GLIBC__ >= 2
+	LIBC=gnu
+	#  else
+	LIBC=gnulibc1
+	#  endif
+	# else
+	LIBC=gnulibc1
+	# endif
+	#else
+	#if defined(__INTEL_COMPILER) || defined(__PGI)
+	LIBC=gnu
+	#else
+	LIBC=gnuaout
+	#endif
+	#endif
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
 EOF
-		LIBC=""
-		${CC-cc} dummy.s -o dummy 2>/dev/null
-		if test "$?" = 0 ; then
-			./dummy
-			case "$?" in
-			7)
-				UNAME_MACHINE="alpha"
-				;;
-			15)
-				UNAME_MACHINE="alphaev5"
-				;;
-			14)
-				UNAME_MACHINE="alphaev56"
-				;;
-			10)
-				UNAME_MACHINE="alphapca56"
-				;;
-			16)
-				UNAME_MACHINE="alphaev6"
-				;;
-			esac	
-
-			objdump --private-headers dummy | \
-			  grep ld.so.1 > /dev/null
-			if test "$?" = 0 ; then
-				LIBC="libc1"
-			fi
-		fi	
-		rm -f dummy.s dummy
-		echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0
-	elif test "${UNAME_MACHINE}" = "mips" ; then
-	  cat >dummy.c <<EOF
-main(argc, argv)
-     int argc;
-     char *argv[];
-{
-#ifdef __MIPSEB__
-  printf ("%s-unknown-linux-gnu\n", argv[1]);
-#endif
-#ifdef __MIPSEL__
-  printf ("%sel-unknown-linux-gnu\n", argv[1]);
-#endif
-  return 0;
-}
-EOF
-	  ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
-	  rm -f dummy.c dummy
-	else
-	  # Either a pre-BFD a.out linker (linux-gnuoldld)
-	  # or one that does not give us useful --help.
-	  # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout.
-	  # If ld does not provide *any* "supported emulations:"
-	  # that means it is gnuoldld.
-	  echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:"
-	  test $? != 0 && echo "${UNAME_MACHINE}-pc-linux-gnuoldld" && exit 0
-
-	  case "${UNAME_MACHINE}" in
-	  i?86)
-	    VENDOR=pc;
-	    ;;
-	  *)
-	    VENDOR=unknown;
-	    ;;
-	  esac
-	  # Determine whether the default compiler is a.out or elf
-	  cat >dummy.c <<EOF
-#include <features.h>
-main(argc, argv)
-     int argc;
-     char *argv[];
-{
-#ifdef __ELF__
-# ifdef __GLIBC__
-#  if __GLIBC__ >= 2
-    printf ("%s-${VENDOR}-linux-gnu\n", argv[1]);
-#  else
-    printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-#  endif
-# else
-   printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
-# endif
-#else
-  printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]);
-#endif
-  return 0;
-}
-EOF
-	  ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0
-	  rm -f dummy.c dummy
-	fi ;;
-# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.  earlier versions
-# are messed up and put the nodename in both sysname and nodename.
-    i?86:DYNIX/ptx:4*:*)
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^LIBC/{
+		s: ::g
+		p
+	    }'`"
+	test x"${LIBC}" != x && {
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+		exit
+	}
+	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
 	echo i386-sequent-sysv4
-	exit 0 ;;
-    i?86:UNIX_SV:4.2MP:2.*)
+	exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
         # Unixware is an offshoot of SVR4, but it has its own version
         # number series starting with 2...
         # I am not positive that other SVR4 systems won't match this,
 	# I just have to hope.  -- rms.
         # Use sysv4.2uw... so that sysv4* matches it.
 	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
-	exit 0 ;;
-    i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*)
+	exit ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit ;;
+    i*86:syllable:*:*)
+	echo ${UNAME_MACHINE}-pc-syllable
+	exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
 	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
-		echo ${UNAME_MACHINE}-univel-sysv${UNAME_RELEASE}
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
 	else
-		echo ${UNAME_MACHINE}-pc-sysv${UNAME_RELEASE}
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
 	fi
-	exit 0 ;;
-    i?86:*:3.2:*)
+	exit ;;
+    i*86:*:5:[678]*)
+    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit ;;
+    i*86:*:3.2:*)
 	if test -f /usr/options/cb.name; then
 		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
 		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
 	elif /bin/uname -X 2>/dev/null >/dev/null ; then
-		UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')`
-		(/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486
-		(/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
 			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
 		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
 	else
 		echo ${UNAME_MACHINE}-pc-sysv32
 	fi
-	exit 0 ;;
+	exit ;;
     pc:*:*:*)
+	# Left here for compatibility:
         # uname -m prints for DJGPP always 'pc', but it prints nothing about
         # the processor, so we play safe by assuming i386.
 	echo i386-pc-msdosdjgpp
-        exit 0 ;;
+        exit ;;
     Intel:Mach:3*:*)
 	echo i386-pc-mach3
-	exit 0 ;;
+	exit ;;
     paragon:*:*:*)
 	echo i860-intel-osf1
-	exit 0 ;;
+	exit ;;
     i860:*:4.*:*) # i860-SVR4
 	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
 	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
 	else # Add other i860-SVR4 vendors below as they are discovered.
 	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
 	fi
-	exit 0 ;;
+	exit ;;
     mini*:CTIX:SYS*5:*)
 	# "miniframe"
 	echo m68010-convergent-sysv
-	exit 0 ;;
-    M68*:*:R3V[567]*:*)
-	test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
-    3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0)
+	exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit ;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
 	OS_REL=''
 	test -r /etc/.relid \
 	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
 	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-	  && echo i486-ncr-sysv4.3${OS_REL} && exit 0
+	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
 	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
-	  && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
+	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
     3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
         /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-          && echo i486-ncr-sysv4 && exit 0 ;;
-    m68*:LynxOS:2.*:*)
+          && { echo i486-ncr-sysv4; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
 	echo m68k-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     mc68030:UNIX_System_V:4.*:*)
 	echo m68k-atari-sysv4
-	exit 0 ;;
-    i?86:LynxOS:2.*:*)
-	echo i386-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
     TSUNAMI:LynxOS:2.*:*)
 	echo sparc-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
-    rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*)
+	exit ;;
+    rs6000:LynxOS:2.*:*)
 	echo rs6000-unknown-lynxos${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
     SM[BE]S:UNIX_SV:*:*)
 	echo mips-dde-sysv${UNAME_RELEASE}
-	exit 0 ;;
+	exit ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
     RM*:SINIX-*:*:*)
 	echo mips-sni-sysv4
-	exit 0 ;;
+	exit ;;
     *:SINIX-*:*:*)
 	if uname -p 2>/dev/null >/dev/null ; then
 		UNAME_MACHINE=`(uname -p) 2>/dev/null`
@@ -715,39 +1146,156 @@
 	else
 		echo ns32k-sni-sysv
 	fi
-	exit 0 ;;
-    PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-                           # says <Richard.M.Bartel@ccMail.Census.GOV>
+	exit ;;
+    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                      # says <Richard.M.Bartel@ccMail.Census.GOV>
         echo i586-unisys-sysv4
-        exit 0 ;;
+        exit ;;
     *:UNIX_System_V:4*:FTX*)
 	# From Gerald Hewes <hewes@openmarket.com>.
 	# How about differentiating between stratus architectures? -djm
 	echo hppa1.1-stratus-sysv4
-	exit 0 ;;
+	exit ;;
     *:*:*:FTX*)
 	# From seanf@swdc.stratus.com.
 	echo i860-stratus-sysv4
-	exit 0 ;;
+	exit ;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo ${UNAME_MACHINE}-stratus-vos
+	exit ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit ;;
     mc68*:A/UX:*:*)
 	echo m68k-apple-aux${UNAME_RELEASE}
-	exit 0 ;;
-    news*:NEWS-OS:*:6*)
+	exit ;;
+    news*:NEWS-OS:6*:*)
 	echo mips-sony-newsos6
-	exit 0 ;;
-    R3000:*System_V*:*:* | R4000:UNIX_SYSV:*:*)
+	exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
 	if [ -d /usr/nec ]; then
 	        echo mips-nec-sysv${UNAME_RELEASE}
 	else
 	        echo mips-unknown-sysv${UNAME_RELEASE}
 	fi
-        exit 0 ;;
+        exit ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+	case $UNAME_PROCESSOR in
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = "x86"; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit ;;
+    NSE-?:NONSTOP_KERNEL:*:*)
+	echo nse-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "$cputype" = "386"; then
+	    UNAME_MACHINE=i386
+	else
+	    UNAME_MACHINE="$cputype"
+	fi
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit ;;
+    SEI:*:*:SEIUX)
+        echo mips-sei-seiux${UNAME_RELEASE}
+	exit ;;
+    *:DragonFly:*:*)
+	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	exit ;;
+    *:*VMS:*:*)
+    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case "${UNAME_MACHINE}" in
+	    A*) echo alpha-dec-vms ; exit ;;
+	    I*) echo ia64-dec-vms ; exit ;;
+	    V*) echo vax-dec-vms ; exit ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	echo i386-pc-xenix
+	exit ;;
+    i*86:skyos:*:*)
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+	exit ;;
+    i*86:rdos:*:*)
+	echo ${UNAME_MACHINE}-pc-rdos
+	exit ;;
 esac
 
 #echo '(No uname command or uname output not recognized.)' 1>&2
 #echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
 
-cat >dummy.c <<EOF
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
 #ifdef _SEQUENT_
 # include <sys/types.h>
 # include <sys/utsname.h>
@@ -772,7 +1320,7 @@
 #endif
 
 #if defined (__arm) && defined (__acorn) && defined (__unix)
-  printf ("arm-acorn-riscix"); exit (0);
+  printf ("arm-acorn-riscix\n"); exit (0);
 #endif
 
 #if defined (hp300) && !defined (hpux)
@@ -785,7 +1333,10 @@
 #endif
   int version;
   version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
-  printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
   exit (0);
 #endif
 
@@ -830,11 +1381,24 @@
 #endif
 
 #if defined (vax)
-#if !defined (ultrix)
-  printf ("vax-dec-bsd\n"); exit (0);
-#else
-  printf ("vax-dec-ultrix\n"); exit (0);
-#endif
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
 #endif
 
 #if defined (alliant) && defined (i860)
@@ -845,12 +1409,12 @@
 }
 EOF
 
-${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy && rm dummy.c dummy && exit 0
-rm -f dummy.c dummy
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }
 
 # Apollos put the system type in the environment.
 
-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
 
 # Convex versions that predate uname can use getsysinfo(1)
 
@@ -859,25 +1423,69 @@
     case `getsysinfo -f cpu_type` in
     c1*)
 	echo c1-convex-bsd
-	exit 0 ;;
+	exit ;;
     c2*)
 	if getsysinfo -f scalar_acc
 	then echo c32-convex-bsd
 	else echo c2-convex-bsd
 	fi
-	exit 0 ;;
+	exit ;;
     c34*)
 	echo c34-convex-bsd
-	exit 0 ;;
+	exit ;;
     c38*)
 	echo c38-convex-bsd
-	exit 0 ;;
+	exit ;;
     c4*)
 	echo c4-convex-bsd
-	exit 0 ;;
+	exit ;;
     esac
 fi
 
-#echo '(Unable to guess system type)' 1>&2
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+  http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.guess
+and
+  http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.sub
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
 
 exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/config.sub b/config.sub
old mode 100755
new mode 100644
index 213a6d4..a4e8a94
--- a/config.sub
+++ b/config.sub
@@ -1,6 +1,10 @@
 #! /bin/sh
-# Configuration validation subroutine script, version 1.1.
-#   Copyright (C) 1991, 92, 93, 94, 95, 96, 1997 Free Software Foundation, Inc.
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+
+timestamp='2006-01-02'
+
 # This file is (in principle) common to ALL GNU software.
 # The presence of a machine in this file suggests that SOME GNU software
 # can handle that machine.  It does not imply ALL GNU software can.
@@ -17,14 +21,18 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330,
-# Boston, MA 02111-1307, USA.
-
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
 # the same distribution terms that you use for the rest of that program.
 
+
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
 # If it is invalid, we print an error message on stderr and exit with code 1.
@@ -45,30 +53,75 @@
 #	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
 # It is wrong to echo any other type of specification.
 
-if [ x$1 = x ]
-then
-	echo Configuration name missing. 1>&2
-	echo "Usage: $0 CPU-MFR-OPSYS" 1>&2
-	echo "or     $0 ALIAS" 1>&2
-	echo where ALIAS is a recognized configuration type. 1>&2
-	exit 1
-fi
+me=`echo "$0" | sed -e 's,.*/,,'`
 
-# First pass through any local machine types.
-case $1 in
-	*local*)
-		echo $1
-		exit 0
-		;;
-	*)
-	;;
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line: handle options, stop at the first non-option argument.
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types.
+       echo $1
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+case $# in
+ 0) echo "$me: missing argument$help" >&2
+    exit 1;;
+ 1) ;;
+ *) echo "$me: too many arguments$help" >&2
+    exit 1;;
 esac
 
 # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
 # Here we must recognize all the valid KERNEL-OS combinations.
 maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
-  linux-gnu*)
+  nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \
+  uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \
+  storm-chaos* | os2-emx* | rtmk-nova*)
     os=-$maybe_os
     basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
     ;;
@@ -94,15 +147,37 @@
 	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
 	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
 	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple)
+	-apple | -axis | -knuth | -cray)
 		os=
 		basic_machine=$1
 		;;
+	-sim | -cisco | -oki | -wec | -winbond)
+		os=
+		basic_machine=$1
+		;;
+	-scout)
+		;;
+	-wrs)
+		os=-vxworks
+		basic_machine=$1
+		;;
+	-chorusos*)
+		os=-chorusos
+		basic_machine=$1
+		;;
+ 	-chorusrdb)
+ 		os=-chorusrdb
+		basic_machine=$1
+ 		;;
 	-hiux*)
 		os=-hiuxwe2
 		;;
+	-sco6)
+		os=-sco5v6
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
 	-sco5)
-		os=sco3.2v5
+		os=-sco3.2v5
 		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
 		;;
 	-sco4)
@@ -117,10 +192,17 @@
 		# Don't forget version if it is 3.2v4 or newer.
 		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
 		;;
+	-sco5v6*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
 	-sco*)
 		os=-sco3.2v2
 		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
 		;;
+	-udk*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
 	-isc)
 		os=-isc2.2
 		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
@@ -143,26 +225,84 @@
 	-psos*)
 		os=-psos
 		;;
+	-mint | -mint[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
 esac
 
 # Decode aliases for certain CPU-COMPANY combinations.
 case $basic_machine in
 	# Recognize the basic CPU types without company name.
 	# Some are omitted here because they have special meanings below.
-	tahoe | i860 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \
-		| arme[lb] | pyramid | mn10200 | mn10300 \
-		| tron | a29k | 580 | i960 | h8300 | hppa | hppa1.0 | hppa1.1 \
-		| alpha | alphaev5 | alphaev56 | we32k | ns16k | clipper \
-		| i370 | sh | powerpc | powerpcle | 1750a | dsp16xx | pdp11 \
-		| mips64 | mipsel | mips64el | mips64orion | mips64orionel \
-		| mipstx39 | mipstx39el \
-		| sparc | sparclet | sparclite | sparc64 | v850)
+	1750a | 580 \
+	| a29k \
+	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
+	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
+	| am33_2.0 \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr \
+	| bfin \
+	| c4x | clipper \
+	| d10v | d30v | dlx | dsp16xx \
+	| fr30 | frv \
+	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+	| i370 | i860 | i960 | ia64 \
+	| ip2k | iq2000 \
+	| m32r | m32rle | m68000 | m68k | m88k | maxq | mb | microblaze | mcore \
+	| mips | mipsbe | mipseb | mipsel | mipsle \
+	| mips16 \
+	| mips64 | mips64el \
+	| mips64vr | mips64vrel \
+	| mips64orion | mips64orionel \
+	| mips64vr4100 | mips64vr4100el \
+	| mips64vr4300 | mips64vr4300el \
+	| mips64vr5000 | mips64vr5000el \
+	| mips64vr5900 | mips64vr5900el \
+	| mipsisa32 | mipsisa32el \
+	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa64 | mipsisa64el \
+	| mipsisa64r2 | mipsisa64r2el \
+	| mipsisa64sb1 | mipsisa64sb1el \
+	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipstx39 | mipstx39el \
+	| mn10200 | mn10300 \
+	| mt \
+	| msp430 \
+	| ns16k | ns32k \
+	| or32 \
+	| pdp10 | pdp11 | pj | pjl \
+	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
+	| pyramid \
+	| sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | shbe | shle | sh[1234]le | sh3ele \
+	| sh64 | sh64le \
+	| sparc | sparc64 | sparc64b | sparc86x | sparclet | sparclite \
+	| sparcv8 | sparcv9 | sparcv9b \
+	| strongarm \
+	| tahoe | thumb | tic4x | tic80 | tron \
+	| v850 | v850e \
+	| we32k \
+	| x86 | xscale | xscalee[bl] | xstormy16 | xtensa \
+	| z8k)
 		basic_machine=$basic_machine-unknown
 		;;
+	m32c)
+		basic_machine=$basic_machine-unknown
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12)
+		# Motorola 68HC11/12.
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+		;;
+	ms1)
+		basic_machine=mt-unknown
+		;;
+
 	# We use `pc' rather than `unknown'
 	# because (1) that's what they normally are, and
 	# (2) the word "unknown" tends to confuse beginning users.
-	i[3456]86)
+	i*86 | x86_64)
 	  basic_machine=$basic_machine-pc
 	  ;;
 	# Object if more than one company name word.
@@ -171,27 +311,91 @@
 		exit 1
 		;;
 	# Recognize the basic CPU types with company name.
-	vax-* | tahoe-* | i[3456]86-* | i860-* | m32r-* | m68k-* | m68000-* \
-	      | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \
-	      | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
-	      | power-* | none-* | 580-* | cray2-* | h8300-* | i960-* \
-	      | xmp-* | ymp-* | hppa-* | hppa1.0-* | hppa1.1-* \
-	      | alpha-* | alphaev5-* | alphaev56-* | we32k-* | cydra-* \
-	      | ns16k-* | pn-* | np1-* | xps100-* | clipper-* | orion-* \
-	      | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
-	      | sparc64-* | mips64-* | mipsel-* \
-	      | mips64el-* | mips64orion-* | mips64orionel-*  \
-	      | mipstx39-* | mipstx39el-* \
-	      | f301-*)
+	580-* \
+	| a29k-* \
+	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
+	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
+	| avr-* \
+	| bfin-* | bs2000-* \
+	| c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \
+	| clipper-* | craynv-* | cydra-* \
+	| d10v-* | d30v-* | dlx-* \
+	| elxsi-* \
+	| f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \
+	| h8300-* | h8500-* \
+	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+	| i*86-* | i860-* | i960-* | ia64-* \
+	| ip2k-* | iq2000-* \
+	| m32r-* | m32rle-* \
+	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* \
+	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
+	| mips16-* \
+	| mips64-* | mips64el-* \
+	| mips64vr-* | mips64vrel-* \
+	| mips64orion-* | mips64orionel-* \
+	| mips64vr4100-* | mips64vr4100el-* \
+	| mips64vr4300-* | mips64vr4300el-* \
+	| mips64vr5000-* | mips64vr5000el-* \
+	| mips64vr5900-* | mips64vr5900el-* \
+	| mipsisa32-* | mipsisa32el-* \
+	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa64-* | mipsisa64el-* \
+	| mipsisa64r2-* | mipsisa64r2el-* \
+	| mipsisa64sb1-* | mipsisa64sb1el-* \
+	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipstx39-* | mipstx39el-* \
+	| mmix-* \
+	| mt-* \
+	| msp430-* \
+	| none-* | np1-* | ns16k-* | ns32k-* \
+	| orion-* \
+	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
+	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
+	| pyramid-* \
+	| romp-* | rs6000-* \
+	| sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | shbe-* \
+	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
+	| sparc-* | sparc64-* | sparc64b-* | sparc86x-* | sparclet-* \
+	| sparclite-* \
+	| sparcv8-* | sparcv9-* | sparcv9b-* | strongarm-* | sv1-* | sx?-* \
+	| tahoe-* | thumb-* \
+	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+	| tron-* \
+	| v850-* | v850e-* | vax-* \
+	| we32k-* \
+	| x86-* | x86_64-* | xps100-* | xscale-* | xscalee[bl]-* \
+	| xstormy16-* | xtensa-* \
+	| ymp-* \
+	| z8k-*)
+		;;
+	m32c-*)
 		;;
 	# Recognize the various machine names and aliases which stand
 	# for a CPU type and a company and sometimes even an OS.
+	386bsd)
+		basic_machine=i386-unknown
+		os=-bsd
+		;;
 	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
 		basic_machine=m68000-att
 		;;
 	3b*)
 		basic_machine=we32k-att
 		;;
+	a29khif)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+    	abacus)
+		basic_machine=abacus-unknown
+		;;
+	adobe68k)
+		basic_machine=m68010-adobe
+		os=-scout
+		;;
 	alliant | fx80)
 		basic_machine=fx80-alliant
 		;;
@@ -202,25 +406,35 @@
 		basic_machine=a29k-none
 		os=-bsd
 		;;
+	amd64)
+		basic_machine=x86_64-pc
+		;;
+	amd64-*)
+		basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
 	amdahl)
 		basic_machine=580-amdahl
 		os=-sysv
 		;;
 	amiga | amiga-*)
-		basic_machine=m68k-cbm
+		basic_machine=m68k-unknown
 		;;
 	amigaos | amigados)
-		basic_machine=m68k-cbm
+		basic_machine=m68k-unknown
 		os=-amigaos
 		;;
 	amigaunix | amix)
-		basic_machine=m68k-cbm
+		basic_machine=m68k-unknown
 		os=-sysv4
 		;;
 	apollo68)
 		basic_machine=m68k-apollo
 		os=-sysv
 		;;
+	apollo68bsd)
+		basic_machine=m68k-apollo
+		os=-bsd
+		;;
 	aux)
 		basic_machine=m68k-apple
 		os=-aux
@@ -229,6 +443,10 @@
 		basic_machine=ns32k-sequent
 		os=-dynix
 		;;
+	c90)
+		basic_machine=c90-cray
+		os=-unicos
+		;;
 	convex-c1)
 		basic_machine=c1-convex
 		os=-bsd
@@ -249,27 +467,45 @@
 		basic_machine=c38-convex
 		os=-bsd
 		;;
-	cray | ymp)
-		basic_machine=ymp-cray
+	cray | j90)
+		basic_machine=j90-cray
 		os=-unicos
 		;;
-	cray2)
-		basic_machine=cray2-cray
-		os=-unicos
+	craynv)
+		basic_machine=craynv-cray
+		os=-unicosmp
 		;;
-	[ctj]90-cray)
-		basic_machine=c90-cray
-		os=-unicos
+	cr16c)
+		basic_machine=cr16c-unknown
+		os=-elf
 		;;
 	crds | unos)
 		basic_machine=m68k-crds
 		;;
+	crisv32 | crisv32-* | etraxfs*)
+		basic_machine=crisv32-axis
+		;;
+	cris | cris-* | etrax*)
+		basic_machine=cris-axis
+		;;
+	crx)
+		basic_machine=crx-unknown
+		os=-elf
+		;;
 	da30 | da30-*)
 		basic_machine=m68k-da30
 		;;
 	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
 		basic_machine=mips-dec
 		;;
+	decsystem10* | dec10*)
+		basic_machine=pdp10-dec
+		os=-tops10
+		;;
+	decsystem20* | dec20*)
+		basic_machine=pdp10-dec
+		os=-tops20
+		;;
 	delta | 3300 | motorola-3300 | motorola-delta \
 	      | 3300-motorola | delta-motorola)
 		basic_machine=m68k-motorola
@@ -278,6 +514,10 @@
 		basic_machine=m88k-motorola
 		os=-sysv3
 		;;
+	djgpp)
+		basic_machine=i586-pc
+		os=-msdosdjgpp
+		;;
 	dpx20 | dpx20-*)
 		basic_machine=rs6000-bull
 		os=-bosx
@@ -297,6 +537,10 @@
 	encore | umax | mmax)
 		basic_machine=ns32k-encore
 		;;
+	es1800 | OSE68k | ose68k | ose | OSE)
+		basic_machine=m68k-ericsson
+		os=-ose
+		;;
 	fx2800)
 		basic_machine=i860-alliant
 		;;
@@ -307,6 +551,10 @@
 		basic_machine=tron-gmicro
 		os=-sysv
 		;;
+	go32)
+		basic_machine=i386-pc
+		os=-go32
+		;;
 	h3050r* | hiux*)
 		basic_machine=hppa1.1-hitachi
 		os=-hiuxwe2
@@ -315,6 +563,14 @@
 		basic_machine=h8300-hitachi
 		os=-hms
 		;;
+	h8300xray)
+		basic_machine=h8300-hitachi
+		os=-xray
+		;;
+	h8500hms)
+		basic_machine=h8500-hitachi
+		os=-hms
+		;;
 	harris)
 		basic_machine=m88k-harris
 		os=-sysv3
@@ -330,13 +586,30 @@
 		basic_machine=m68k-hp
 		os=-hpux
 		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
 	hp9k2[0-9][0-9] | hp9k31[0-9])
 		basic_machine=m68000-hp
 		;;
 	hp9k3[2-9][0-9])
 		basic_machine=m68k-hp
 		;;
-	hp9k7[0-9][0-9] | hp7[0-9][0-9] | hp9k8[0-9]7 | hp8[0-9]7)
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
 		basic_machine=hppa1.1-hp
 		;;
 	hp9k8[0-9][0-9] | hp8[0-9][0-9])
@@ -345,27 +618,42 @@
 	hppa-next)
 		os=-nextstep3
 		;;
+	hppaosf)
+		basic_machine=hppa1.1-hp
+		os=-osf
+		;;
+	hppro)
+		basic_machine=hppa1.1-hp
+		os=-proelf
+		;;
 	i370-ibm* | ibm*)
 		basic_machine=i370-ibm
-		os=-mvs
 		;;
 # I'm not sure what "Sysv32" means.  Should this be sysv3.2?
-	i[3456]86v32)
+	i*86v32)
 		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
 		os=-sysv32
 		;;
-	i[3456]86v4*)
+	i*86v4*)
 		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
 		os=-sysv4
 		;;
-	i[3456]86v)
+	i*86v)
 		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
 		os=-sysv
 		;;
-	i[3456]86sol2)
+	i*86sol2)
 		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
 		os=-solaris2
 		;;
+	i386mach)
+		basic_machine=i386-mach
+		os=-mach
+		;;
+	i386-vsta | vsta)
+		basic_machine=i386-unknown
+		os=-vsta
+		;;
 	iris | iris4d)
 		basic_machine=mips-sgi
 		case $os in
@@ -391,16 +679,16 @@
 		basic_machine=ns32k-utek
 		os=-sysv
 		;;
+	mingw32)
+		basic_machine=i386-pc
+		os=-mingw32
+		;;
 	miniframe)
 		basic_machine=m68000-convergent
 		;;
-	mipsel*-linux*)
-		basic_machine=mipsel-unknown
-		os=-linux-gnu
-		;;
-	mips*-linux*)
-		basic_machine=mips-unknown
-		os=-linux-gnu
+	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
 		;;
 	mips3*-*)
 		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
@@ -408,10 +696,37 @@
 	mips3*)
 		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
 		;;
+	monitor)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	morphos)
+		basic_machine=powerpc-unknown
+		os=-morphos
+		;;
+	msdos)
+		basic_machine=i386-pc
+		os=-msdos
+		;;
+	ms1-*)
+		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
+		;;
+	mvs)
+		basic_machine=i370-ibm
+		os=-mvs
+		;;
 	ncr3000)
 		basic_machine=i486-ncr
 		os=-sysv4
 		;;
+	netbsd386)
+		basic_machine=i386-unknown
+		os=-netbsd
+		;;
+	netwinder)
+		basic_machine=armv4l-rebel
+		os=-linux
+		;;
 	news | news700 | news800 | news900)
 		basic_machine=m68k-sony
 		os=-newsos
@@ -424,6 +739,10 @@
 		basic_machine=mips-sony
 		os=-newsos
 		;;
+	necv70)
+		basic_machine=v70-nec
+		os=-sysv
+		;;
 	next | m*-next )
 		basic_machine=m68k-next
 		case $os in
@@ -449,9 +768,39 @@
 		basic_machine=i960-intel
 		os=-nindy
 		;;
+	mon960)
+		basic_machine=i960-intel
+		os=-mon960
+		;;
+	nonstopux)
+		basic_machine=mips-compaq
+		os=-nonstopux
+		;;
 	np1)
 		basic_machine=np1-gould
 		;;
+	nsr-tandem)
+		basic_machine=nsr-tandem
+		;;
+	op50n-* | op60c-*)
+		basic_machine=hppa1.1-oki
+		os=-proelf
+		;;
+	openrisc | openrisc-*)
+		basic_machine=or32-unknown
+		;;
+	os400)
+		basic_machine=powerpc-ibm
+		os=-os400
+		;;
+	OSE68000 | ose68000)
+		basic_machine=m68000-ericsson
+		os=-ose
+		;;
+	os68k)
+		basic_machine=m68k-none
+		os=-os68k
+		;;
 	pa-hitachi)
 		basic_machine=hppa1.1-hitachi
 		os=-hiuxwe2
@@ -466,53 +815,105 @@
 	pbb)
 		basic_machine=m68k-tti
 		;;
-        pc532 | pc532-*)
+	pc532 | pc532-*)
 		basic_machine=ns32k-pc532
 		;;
-	pentium | p5)
-		basic_machine=i586-intel
+	pc98)
+		basic_machine=i386-pc
 		;;
-	pentiumpro | p6)
-		basic_machine=i686-intel
+	pc98-*)
+		basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
-	pentium-* | p5-*)
+	pentium | p5 | k5 | k6 | nexgen | viac3)
+		basic_machine=i586-pc
+		;;
+	pentiumpro | p6 | 6x86 | athlon | athlon_*)
+		basic_machine=i686-pc
+		;;
+	pentiumii | pentium2 | pentiumiii | pentium3)
+		basic_machine=i686-pc
+		;;
+	pentium4)
+		basic_machine=i786-pc
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
 		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
-	pentiumpro-* | p6-*)
+	pentiumpro-* | p6-* | 6x86-* | athlon-*)
 		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
-	k5)
-		# We don't have specific support for AMD's K5 yet, so just call it a Pentium
-		basic_machine=i586-amd
+	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
-	nexen)
-		# We don't have specific support for Nexgen yet, so just call it a Pentium
-		basic_machine=i586-nexgen
+	pentium4-*)
+		basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
 	pn)
 		basic_machine=pn-gould
 		;;
-	power)	basic_machine=rs6000-ibm
+	power)	basic_machine=power-ibm
 		;;
 	ppc)	basic_machine=powerpc-unknown
-	        ;;
+		;;
 	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
 	ppcle | powerpclittle | ppc-le | powerpc-little)
 		basic_machine=powerpcle-unknown
-	        ;;
+		;;
 	ppcle-* | powerpclittle-*)
 		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
 		;;
+	ppc64)	basic_machine=powerpc64-unknown
+		;;
+	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+		basic_machine=powerpc64le-unknown
+		;;
+	ppc64le-* | powerpc64little-*)
+		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
 	ps2)
 		basic_machine=i386-ibm
 		;;
+	pw32)
+		basic_machine=i586-unknown
+		os=-pw32
+		;;
+	rdos)
+		basic_machine=i386-pc
+		os=-rdos
+		;;
+	rom68k)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
 	rm[46]00)
 		basic_machine=mips-siemens
 		;;
 	rtpc | rtpc-*)
 		basic_machine=romp-ibm
 		;;
+	s390 | s390-*)
+		basic_machine=s390-ibm
+		;;
+	s390x | s390x-*)
+		basic_machine=s390x-ibm
+		;;
+	sa29200)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	sb1)
+		basic_machine=mipsisa64sb1-unknown
+		;;
+	sb1el)
+		basic_machine=mipsisa64sb1el-unknown
+		;;
+	sei)
+		basic_machine=mips-sei
+		os=-seiux
+		;;
 	sequent)
 		basic_machine=i386-sequent
 		;;
@@ -520,6 +921,13 @@
 		basic_machine=sh-hitachi
 		os=-hms
 		;;
+	sh64)
+		basic_machine=sh64-unknown
+		;;
+	sparclite-wrs | simso-wrs)
+		basic_machine=sparclite-wrs
+		os=-vxworks
+		;;
 	sps7)
 		basic_machine=m68k-bull
 		os=-sysv2
@@ -527,6 +935,13 @@
 	spur)
 		basic_machine=spur-unknown
 		;;
+	st2000)
+		basic_machine=m68k-tandem
+		;;
+	stratus)
+		basic_machine=i860-stratus
+		os=-sysv4
+		;;
 	sun2)
 		basic_machine=m68000-sun
 		;;
@@ -567,19 +982,51 @@
 	sun386 | sun386i | roadrunner)
 		basic_machine=i386-sun
 		;;
+	sv1)
+		basic_machine=sv1-cray
+		os=-unicos
+		;;
 	symmetry)
 		basic_machine=i386-sequent
 		os=-dynix
 		;;
+	t3e)
+		basic_machine=alphaev5-cray
+		os=-unicos
+		;;
+	t90)
+		basic_machine=t90-cray
+		os=-unicos
+		;;
+	tic54x | c54x*)
+		basic_machine=tic54x-unknown
+		os=-coff
+		;;
+	tic55x | c55x*)
+		basic_machine=tic55x-unknown
+		os=-coff
+		;;
+	tic6x | c6x*)
+		basic_machine=tic6x-unknown
+		os=-coff
+		;;
 	tx39)
 		basic_machine=mipstx39-unknown
 		;;
 	tx39el)
 		basic_machine=mipstx39el-unknown
 		;;
+	toad1)
+		basic_machine=pdp10-xkl
+		os=-tops20
+		;;
 	tower | tower-32)
 		basic_machine=m68k-ncr
 		;;
+	tpf)
+		basic_machine=s390x-ibm
+		os=-tpf
+		;;
 	udi29k)
 		basic_machine=a29k-amd
 		os=-udi
@@ -588,6 +1035,10 @@
 		basic_machine=a29k-nyu
 		os=-sym1
 		;;
+	v810 | necv810)
+		basic_machine=v810-nec
+		os=-none
+		;;
 	vaxv)
 		basic_machine=vax-dec
 		os=-sysv
@@ -597,8 +1048,8 @@
 		os=-vms
 		;;
 	vpp*|vx|vx-*)
-               basic_machine=f301-fujitsu
-               ;;
+		basic_machine=f301-fujitsu
+		;;
 	vxworks960)
 		basic_machine=i960-wrs
 		os=-vxworks
@@ -611,12 +1062,28 @@
 		basic_machine=a29k-wrs
 		os=-vxworks
 		;;
-	xmp)
-		basic_machine=xmp-cray
+	w65*)
+		basic_machine=w65-wdc
+		os=-none
+		;;
+	w89k-*)
+		basic_machine=hppa1.1-winbond
+		os=-proelf
+		;;
+	xbox)
+		basic_machine=i686-pc
+		os=-mingw32
+		;;
+	xps | xps100)
+		basic_machine=xps100-honeywell
+		;;
+	ymp)
+		basic_machine=ymp-cray
 		os=-unicos
 		;;
-        xps | xps100)
-		basic_machine=xps100-honeywell
+	z8k-*-coff)
+		basic_machine=z8k-unknown
+		os=-sim
 		;;
 	none)
 		basic_machine=none-none
@@ -625,32 +1092,44 @@
 
 # Here we handle the default manufacturer of certain CPU types.  It is in
 # some cases the only manufacturer, in others, it is the most popular.
-	mips)
-		if [ x$os = x-linux-gnu ]; then
-			basic_machine=mips-unknown
-		else
-			basic_machine=mips-mips
-		fi
+	w89k)
+		basic_machine=hppa1.1-winbond
+		;;
+	op50n)
+		basic_machine=hppa1.1-oki
+		;;
+	op60c)
+		basic_machine=hppa1.1-oki
 		;;
 	romp)
 		basic_machine=romp-ibm
 		;;
+	mmix)
+		basic_machine=mmix-knuth
+		;;
 	rs6000)
 		basic_machine=rs6000-ibm
 		;;
 	vax)
 		basic_machine=vax-dec
 		;;
+	pdp10)
+		# there are many clones, so DEC is not a safe bet
+		basic_machine=pdp10-unknown
+		;;
 	pdp11)
 		basic_machine=pdp11-dec
 		;;
 	we32k)
 		basic_machine=we32k-att
 		;;
-	sparc)
+	sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele)
+		basic_machine=sh-unknown
+		;;
+	sparc | sparcv8 | sparcv9 | sparcv9b)
 		basic_machine=sparc-sun
 		;;
-        cydra)
+	cydra)
 		basic_machine=cydra-cydrome
 		;;
 	orion)
@@ -659,6 +1138,15 @@
 	orion105)
 		basic_machine=clipper-highlevel
 		;;
+	mac | mpw | mac-mpw)
+		basic_machine=m68k-apple
+		;;
+	pmac | pmac-mpw)
+		basic_machine=powerpc-apple
+		;;
+	*-unknown)
+		# Make sure to match an already-canonicalized machine name.
+		;;
 	*)
 		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
 		exit 1
@@ -711,14 +1199,49 @@
 	      | -aos* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
 	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
-	      | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
-	      | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* \
+	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
+	      | -openbsd* | -solidbsd* \
+	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
+	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
 	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-	      | -cygwin32* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
-	      | -mingw32* | -linux-gnu* | -uxpv*)
+	      | -chorusos* | -chorusrdb* \
+	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
+	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
+	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
+	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
+	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
+	      | -skyos* | -haiku* | -rdos*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
+	-qnx*)
+		case $basic_machine in
+		    x86-* | i*86-*)
+			;;
+		    *)
+			os=-nto$os
+			;;
+		esac
+		;;
+	-nto-qnx*)
+		;;
+	-nto*)
+		os=`echo $os | sed -e 's|nto|nto-qnx|'`
+		;;
+	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
+	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+		;;
+	-mac*)
+		os=`echo $os | sed -e 's|mac|macos|'`
+		;;
+	-linux-dietlibc)
+		os=-linux-dietlibc
+		;;
 	-linux*)
 		os=`echo $os | sed -e 's|linux|linux-gnu|'`
 		;;
@@ -728,6 +1251,15 @@
 	-sunos6*)
 		os=`echo $os | sed -e 's|sunos6|solaris3|'`
 		;;
+	-opened*)
+		os=-openedition
+		;;
+        -os400*)
+		os=-os400
+		;;
+	-wince*)
+		os=-wince
+		;;
 	-osfrose*)
 		os=-osfrose
 		;;
@@ -743,11 +1275,26 @@
 	-acis*)
 		os=-aos
 		;;
+	-atheos*)
+		os=-atheos
+		;;
+	-syllable*)
+		os=-syllable
+		;;
+	-386bsd)
+		os=-bsd
+		;;
 	-ctix* | -uts*)
 		os=-sysv
 		;;
+	-nova*)
+		os=-rtmk-nova
+		;;
 	-ns2 )
-	        os=-nextstep2
+		os=-nextstep2
+		;;
+	-nsk*)
+		os=-nsk
 		;;
 	# Preserve the version number of sinix5.
 	-sinix5.*)
@@ -756,6 +1303,9 @@
 	-sinix*)
 		os=-sysv4
 		;;
+        -tpf*)
+		os=-tpf
+		;;
 	-triton*)
 		os=-sysv3
 		;;
@@ -774,9 +1324,27 @@
 	# This must come after -sysvr4.
 	-sysv*)
 		;;
+	-ose*)
+		os=-ose
+		;;
+	-es1800*)
+		os=-ose
+		;;
 	-xenix)
 		os=-xenix
 		;;
+	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+		os=-mint
+		;;
+	-aros*)
+		os=-aros
+		;;
+	-kaos*)
+		os=-kaos
+		;;
+	-zvmoe)
+		os=-zvmoe
+		;;
 	-none)
 		;;
 	*)
@@ -802,10 +1370,20 @@
 	*-acorn)
 		os=-riscix1.2
 		;;
+	arm*-rebel)
+		os=-linux
+		;;
 	arm*-semi)
 		os=-aout
 		;;
-        pdp11-*)
+    c4x-* | tic4x-*)
+        os=-coff
+        ;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=-tops20
+		;;
+	pdp11-*)
 		os=-none
 		;;
 	*-dec | vax-*)
@@ -823,15 +1401,45 @@
 		# default.
 		# os=-sunos4
 		;;
+	m68*-cisco)
+		os=-aout
+		;;
+	mips*-cisco)
+		os=-elf
+		;;
+	mips*-*)
+		os=-elf
+		;;
+	or32-*)
+		os=-coff
+		;;
 	*-tti)	# must be before sparc entry or we get the wrong os.
 		os=-sysv3
 		;;
 	sparc-* | *-sun)
 		os=-sunos4.1.1
 		;;
+	*-be)
+		os=-beos
+		;;
+	*-haiku)
+		os=-haiku
+		;;
 	*-ibm)
 		os=-aix
 		;;
+    	*-knuth)
+		os=-mmixware
+		;;
+	*-wec)
+		os=-proelf
+		;;
+	*-winbond)
+		os=-proelf
+		;;
+	*-oki)
+		os=-proelf
+		;;
 	*-hp)
 		os=-hpux
 		;;
@@ -874,27 +1482,39 @@
 	*-next)
 		os=-nextstep3
 		;;
-        *-gould)
+	*-gould)
 		os=-sysv
 		;;
-        *-highlevel)
+	*-highlevel)
 		os=-bsd
 		;;
 	*-encore)
 		os=-bsd
 		;;
-        *-sgi)
+	*-sgi)
 		os=-irix
 		;;
-        *-siemens)
+	*-siemens)
 		os=-sysv4
 		;;
 	*-masscomp)
 		os=-rtu
 		;;
-	f301-fujitsu)
+	f30[01]-fujitsu | f700-fujitsu)
 		os=-uxpv
 		;;
+	*-rom68k)
+		os=-coff
+		;;
+	*-*bug)
+		os=-coff
+		;;
+	*-apple)
+		os=-macos
+		;;
+	*-atari*)
+		os=-mint
+		;;
 	*)
 		os=-none
 		;;
@@ -916,9 +1536,15 @@
 			-aix*)
 				vendor=ibm
 				;;
+			-beos*)
+				vendor=be
+				;;
 			-hpux*)
 				vendor=hp
 				;;
+			-mpeix*)
+				vendor=hp
+				;;
 			-hiux*)
 				vendor=hitachi
 				;;
@@ -934,21 +1560,47 @@
 			-genix*)
 				vendor=ns
 				;;
-			-mvs*)
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-os400*)
 				vendor=ibm
 				;;
 			-ptx*)
 				vendor=sequent
 				;;
-			-vxsim* | -vxworks*)
+			-tpf*)
+				vendor=ibm
+				;;
+			-vxsim* | -vxworks* | -windiss*)
 				vendor=wrs
 				;;
 			-aux*)
 				vendor=apple
 				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+			-vos*)
+				vendor=stratus
+				;;
 		esac
 		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
 		;;
 esac
 
 echo $basic_machine$os
+exit
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/config.ver b/config.ver
new file mode 100644
index 0000000..5b6b6c8
--- /dev/null
+++ b/config.ver
@@ -0,0 +1,44 @@
+# Shared-library version settings: computes JPEG_LIB_VERSION from the values below.
+JPEG_VER_MAJOR=62
+JPEG_VER_MINOR=1
+JPEG_REVISION=0
+# Per-OS overrides of the default major/minor numbers (keyed on $host_os).
+case $host_os in
+  cygwin*)
+    # The shared library built from this source code is *not* binary
+    # compatible with Cygwin's official binary release (cygjpeg-62.dll).
+    # This is because the official binary has been built with
+    # the lossless JPEG patch, which is available as ljpeg-6b.tar.gz.
+    # Therefore we decided to give the shared library a major version
+    # number other than 62.
+    #
+    JPEG_VER_MAJOR=162
+    JPEG_VER_MINOR=0
+    ;;
+  freebsd*)
+    # This follows the official binary release in the ports collection.
+    JPEG_VER_MAJOR=9
+    ;;
+esac
+
+# Convert the absolute version numbers to libtool CURRENT/REVISION/AGE, per $version_type.
+case $version_type in
+  freebsd-aout|freebsd-elf|sunos)
+    JPEG_LT_CURRENT=$JPEG_VER_MAJOR
+    JPEG_LT_REVISION=$JPEG_VER_MINOR
+    JPEG_LT_AGE=0
+    ;;
+  irix|nonstopux)
+    JPEG_LT_CURRENT=`expr $JPEG_VER_MAJOR + $JPEG_VER_MINOR - 1`
+    JPEG_LT_AGE=$JPEG_VER_MINOR
+    JPEG_LT_REVISION=$JPEG_VER_MINOR
+    ;;
+  *)
+    JPEG_LT_CURRENT=`expr $JPEG_VER_MAJOR + $JPEG_VER_MINOR`
+    JPEG_LT_AGE=$JPEG_VER_MINOR
+    JPEG_LT_REVISION=$JPEG_REVISION
+    ;;
+esac
+# CURRENT:REVISION:AGE triple (presumably for libtool -version-info; confirm in the Makefile).
+JPEG_LIB_VERSION=$JPEG_LT_CURRENT:$JPEG_LT_REVISION:$JPEG_LT_AGE
+
diff --git a/configure b/configure
index 35c9db5..9c368e5 100755
--- a/configure
+++ b/configure
@@ -1,7 +1,7 @@
 #! /bin/sh
 
 # Guess values for system-dependent variables and create Makefiles.
-# Generated automatically using autoconf version 2.12 
+# Generated automatically using autoconf version 2.13 
 # Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc.
 #
 # This configure script is free software; the Free Software Foundation
@@ -12,13 +12,190 @@
 ac_default_prefix=/usr/local
 # Any additions from configure.in:
 ac_help="$ac_help
-  --enable-shared         build shared library using GNU libtool"
+  --enable-shared[=PKGS]  build shared libraries [default=no]"
 ac_help="$ac_help
-  --enable-static         build static library using GNU libtool"
+  --enable-static[=PKGS]  build static libraries [default=no]"
+ac_help="$ac_help
+  --enable-fast-install[=PKGS]  optimize for fast installation [default=yes]"
+ac_help="$ac_help
+  --with-gnu-ld           assume the C compiler uses GNU ld [default=no]"
+
+# Find the correct PATH separator.  Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+  UNAME=${UNAME-`uname 2>/dev/null`}
+  case X$UNAME in
+    *-DOS) lt_cv_sys_path_separator=';' ;;
+    *)     lt_cv_sys_path_separator=':' ;;
+  esac
+  PATH_SEPARATOR=$lt_cv_sys_path_separator
+fi
+
+
+# Check that we are running under the correct shell.
+SHELL=${CONFIG_SHELL-/bin/sh}
+
+case X$ECHO in
+X*--fallback-echo)
+  # Remove one level of quotation (which was required for Make).
+  ECHO=`echo "$ECHO" | sed 's,\\\\\$\\$0,'$0','`
+  ;;
+esac
+
+echo=${ECHO-echo}
+if test "X$1" = X--no-reexec; then
+  # Discard the --no-reexec flag, and continue.
+  shift
+elif test "X$1" = X--fallback-echo; then
+  # Avoid inline document here, it may be left over
+  :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+  # Yippee, $echo works!
+  :
+else
+  # Restart under the correct shell.
+  exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+  # used as fallback echo
+  shift
+  cat <<EOF
+
+EOF
+  exit 0
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+if test -z "$ECHO"; then
+if test "X${echo_test_string+set}" != Xset; then
+# find a string as large as possible, as long as the shell can cope with it
+  for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do
+    # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+    if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+       echo_test_string="`eval $cmd`" &&
+       (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null
+    then
+      break
+    fi
+  done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+   echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+   test "X$echo_testing_string" = "X$echo_test_string"; then
+  :
+else
+  # The Solaris, AIX, and Digital Unix default echo programs unquote
+  # backslashes.  This makes it impossible to quote backslashes using
+  #   echo "$something" | sed 's/\\/\\\\/g'
+  #
+  # So, first we look for a working echo in the user's PATH.
+
+  IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for dir in $PATH /usr/ucb; do
+    if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+       test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      echo="$dir/echo"
+      break
+    fi
+  done
+  IFS="$save_ifs"
+
+  if test "X$echo" = Xecho; then
+    # We didn't find a better echo, so look for alternatives.
+    if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+       echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+       test "X$echo_testing_string" = "X$echo_test_string"; then
+      # This shell has a builtin print -r that does the trick.
+      echo='print -r'
+    elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+	 test "X$CONFIG_SHELL" != X/bin/ksh; then
+      # If we have ksh, try running configure again with it.
+      ORIGINAL_CONFIG_SHELL=${CONFIG_SHELL-/bin/sh}
+      export ORIGINAL_CONFIG_SHELL
+      CONFIG_SHELL=/bin/ksh
+      export CONFIG_SHELL
+      exec $CONFIG_SHELL "$0" --no-reexec ${1+"$@"}
+    else
+      # Try using printf.
+      echo='printf %s\n'
+      if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+	 echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+	 test "X$echo_testing_string" = "X$echo_test_string"; then
+	# Cool, printf works
+	:
+      elif echo_testing_string=`($ORIGINAL_CONFIG_SHELL "$0" --fallback-echo '\t') 2>/dev/null` &&
+	   test "X$echo_testing_string" = 'X\t' &&
+	   echo_testing_string=`($ORIGINAL_CONFIG_SHELL "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+	   test "X$echo_testing_string" = "X$echo_test_string"; then
+	CONFIG_SHELL=$ORIGINAL_CONFIG_SHELL
+	export CONFIG_SHELL
+	SHELL="$CONFIG_SHELL"
+	export SHELL
+	echo="$CONFIG_SHELL $0 --fallback-echo"
+      elif echo_testing_string=`($CONFIG_SHELL "$0" --fallback-echo '\t') 2>/dev/null` &&
+	   test "X$echo_testing_string" = 'X\t' &&
+	   echo_testing_string=`($CONFIG_SHELL "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+	   test "X$echo_testing_string" = "X$echo_test_string"; then
+	echo="$CONFIG_SHELL $0 --fallback-echo"
+      else
+	# maybe with a smaller string...
+	prev=:
+
+	for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do
+	  if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null
+	  then
+	    break
+	  fi
+	  prev="$cmd"
+	done
+
+	if test "$prev" != 'sed 50q "$0"'; then
+	  echo_test_string=`eval $prev`
+	  export echo_test_string
+	  exec ${ORIGINAL_CONFIG_SHELL-${CONFIG_SHELL-/bin/sh}} "$0" ${1+"$@"}
+	else
+	  # Oops.  We lost completely, so just stick with echo.
+	  echo=echo
+	fi
+      fi
+    fi
+  fi
+fi
+fi
+
+# Copy echo and quote the copy suitably for passing to libtool from
+# the Makefile, instead of quoting the original, which is used later.
+ECHO=$echo
+if test "X$ECHO" = "X$CONFIG_SHELL $0 --fallback-echo"; then
+   ECHO="$CONFIG_SHELL \\\$\$0 --fallback-echo"
+fi
+
+
+ac_help="$ac_help
+  --disable-libtool-lock  avoid locking (might break parallel builds)"
+ac_help="$ac_help
+  --with-pic              try to use only PIC/non-PIC objects [default=use both]"
 ac_help="$ac_help
   --enable-maxmem[=N]     enable use of temp files, set max mem usage to N MB"
 ac_help="$ac_help
 "
+ac_help="$ac_help
+  --disable-mmx           do not use MMX instruction set"
+ac_help="$ac_help
+  --disable-3dnow         do not use 3DNow! instruction set"
+ac_help="$ac_help
+  --disable-sse           do not use SSE instruction set"
+ac_help="$ac_help
+  --disable-sse2          do not use SSE2 instruction set"
+ac_help="$ac_help
+  --enable-uchar-boolean  define type \"boolean\" as unsigned char (for Windows)"
 
 # Initialize some variables set by options.
 # The variables have the same names as the options, with
@@ -57,6 +234,7 @@
 # Initialize some other variables.
 subdirs=
 MFLAGS= MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
 # Maximum number of lines to put in a shell here document.
 ac_max_here_lines=12
 
@@ -340,7 +518,7 @@
     verbose=yes ;;
 
   -version | --version | --versio | --versi | --vers)
-    echo "configure generated by autoconf version 2.12"
+    echo "configure generated by autoconf version 2.13"
     exit 0 ;;
 
   -with-* | --with-*)
@@ -386,17 +564,6 @@
   -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; }
     ;;
 
-  *=*)
-    varname=`echo "$ac_option"|sed -e 's/=.*//'`
-    # Reject names that aren't valid shell variable names.
-    if test -n "`echo $varname| sed 's/[a-zA-Z0-9_]//g'`"; then
-      { echo "configure: error: $varname: invalid shell variable name" 1>&2; exit 1; }
-    fi
-    val="`echo "$ac_option"|sed 's/[^=]*=//'`"
-    test -n "$verbose" && echo "	setting shell variable $varname to $val"
-    eval "$varname='$val'"
-    eval "export $varname" ;;
-
   *)
     if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then
       echo "configure: warning: $ac_option: invalid host type" 1>&2
@@ -509,14 +676,23 @@
   fi
 done
 
+if test -r "$cache_file"; then  # a readable cache file exists: source it
+  echo "loading cache $cache_file"
+  case $cache_file in [\\/]* | ?:[\\/]* ) . $cache_file ;; *) . ./$cache_file ;; esac  # "." looks up slash-less names via PATH, so prefix ./
+else
+  echo "creating cache $cache_file"
+  > $cache_file  # create the cache file empty
+fi
 
 ac_ext=c
 # CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
 cross_compiling=$ac_cv_prog_cc_cross
 
+ac_exeext=
+ac_objext=o
 if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
   # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
   if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
@@ -534,15 +710,16 @@
 # Extract the first word of "gcc", so it can be a program name with args.
 set dummy gcc; ac_word=$2
 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:538: checking for $ac_word" >&5
+echo "configure:714: checking for $ac_word" >&5
 if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-  for ac_dir in $PATH; do
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
     test -z "$ac_dir" && ac_dir=.
     if test -f $ac_dir/$ac_word; then
       ac_cv_prog_CC="gcc"
@@ -563,16 +740,17 @@
   # Extract the first word of "cc", so it can be a program name with args.
 set dummy cc; ac_word=$2
 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:567: checking for $ac_word" >&5
+echo "configure:744: checking for $ac_word" >&5
 if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   if test -n "$CC"; then
   ac_cv_prog_CC="$CC" # Let the user override the test.
 else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
   ac_prog_rejected=no
-  for ac_dir in $PATH; do
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
     test -z "$ac_dir" && ac_dir=.
     if test -f $ac_dir/$ac_word; then
       if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then
@@ -607,25 +785,61 @@
   echo "$ac_t""no" 1>&6
 fi
 
+  if test -z "$CC"; then  # CC is still empty: on win32 also look for "cl"
+    case "`uname -s`" in
+    *win32* | *WIN32*)  # uname -s output contains win32 / WIN32
+      # Extract the first word of "cl", so it can be a program name with args.
+set dummy cl; ac_word=$2  # $2 is "cl" after the set
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:795: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then  # ac_cv_prog_CC already has a value
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$CC"; then
+  ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"  # split on ':' only; restored after the loop
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do  # walk each PATH element
+    test -z "$ac_dir" && ac_dir=.  # an empty element is treated as .
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_CC="cl"  # record the bare name, not the full path
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+CC="$ac_cv_prog_CC"  # empty when nothing was found
+if test -n "$CC"; then
+  echo "$ac_t""$CC" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+ ;;
+    esac
+  fi
   test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; }
 fi
 
 echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:615: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
+echo "configure:827: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
 
 ac_ext=c
 # CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
 cross_compiling=$ac_cv_prog_cc_cross
 
-cat > conftest.$ac_ext <<EOF
-#line 625 "configure"
+cat > conftest.$ac_ext << EOF
+
+#line 838 "configure"
 #include "confdefs.h"
+
 main(){return(0);}
 EOF
-if { (eval echo configure:629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+if { (eval echo configure:843: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
   ac_cv_prog_cc_works=yes
   # If we can't run a trivial program, we are probably using a cross compiler.
   if (./conftest; exit) 2>/dev/null; then
@@ -639,18 +853,24 @@
   ac_cv_prog_cc_works=no
 fi
 rm -fr conftest*
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
 
 echo "$ac_t""$ac_cv_prog_cc_works" 1>&6
 if test $ac_cv_prog_cc_works = no; then
   { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
 fi
 echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:649: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:869: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
 echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
 cross_compiling=$ac_cv_prog_cc_cross
 
 echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:654: checking whether we are using GNU C" >&5
+echo "configure:874: checking whether we are using GNU C" >&5
 if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
@@ -659,7 +879,7 @@
   yes;
 #endif
 EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:663: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:883: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
   ac_cv_prog_gcc=yes
 else
   ac_cv_prog_gcc=no
@@ -670,14 +890,47 @@
 
 if test $ac_cv_prog_gcc = yes; then
   GCC=yes
-  test "${CFLAGS+set}" = set || CFLAGS="-O2"
 else
   GCC=
-  test "${CFLAGS+set}" = set || CFLAGS="-O"
+fi
+
+ac_test_CFLAGS="${CFLAGS+set}"  # "set" if CFLAGS was already set (even to empty), else empty
+ac_save_CFLAGS="$CFLAGS"  # kept so a pre-set value can be restored below
+CFLAGS=  # cleared here; the final value is assigned after the probe
+echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6
+echo "configure:902: checking whether ${CC-cc} accepts -g" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then  # result already known
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  echo 'void f(){}' > conftest.c  # minimal translation unit for the probe
+if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then  # any compiler output at all counts as "no"
+  ac_cv_prog_cc_g=yes
+else
+  ac_cv_prog_cc_g=no
+fi
+rm -f conftest*
+
+fi
+
+echo "$ac_t""$ac_cv_prog_cc_g" 1>&6
+if test "$ac_test_CFLAGS" = set; then  # a pre-set CFLAGS always wins
+  CFLAGS="$ac_save_CFLAGS"
+elif test $ac_cv_prog_cc_g = yes; then  # -g accepted: add -O2 only when GCC=yes
+  if test "$GCC" = yes; then
+    CFLAGS="-g -O2"
+  else
+    CFLAGS="-g"
+  fi
+else  # -g rejected: -O2 when GCC=yes, otherwise nothing
+  if test "$GCC" = yes; then
+    CFLAGS="-O2"
+  else
+    CFLAGS=
+  fi
 fi
 
 echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:681: checking how to run the C preprocessor" >&5
+echo "configure:934: checking how to run the C preprocessor" >&5
 # On Suns, sometimes $CPP names a directory.
 if test -n "$CPP" && test -d "$CPP"; then
   CPP=
@@ -692,14 +945,14 @@
   # On the NeXT, cc -E runs the code through the compiler's parser,
   # not just through cpp.
   cat > conftest.$ac_ext <<EOF
-#line 696 "configure"
+#line 949 "configure"
 #include "confdefs.h"
 #include <assert.h>
 Syntax Error
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:702: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:955: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   :
 else
@@ -709,14 +962,31 @@
   rm -rf conftest*
   CPP="${CC-cc} -E -traditional-cpp"
   cat > conftest.$ac_ext <<EOF
-#line 713 "configure"
+#line 966 "configure"
 #include "confdefs.h"
 #include <assert.h>
 Syntax Error
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:719: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:972: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+  :
+else
+  echo "$ac_err" >&5
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  CPP="${CC-cc} -nologo -E"
+  cat > conftest.$ac_ext <<EOF
+#line 983 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:989: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   :
 else
@@ -729,6 +999,8 @@
 rm -f conftest*
 fi
 rm -f conftest*
+fi
+rm -f conftest*
   ac_cv_prog_CPP="$CPP"
 fi
   CPP="$ac_cv_prog_CPP"
@@ -738,12 +1010,12 @@
 echo "$ac_t""$CPP" 1>&6
 
 echo $ac_n "checking for function prototypes""... $ac_c" 1>&6
-echo "configure:742: checking for function prototypes" >&5
+echo "configure:1014: checking for function prototypes" >&5
 if eval "test \"`echo '$''{'ijg_cv_have_prototypes'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 747 "configure"
+#line 1019 "configure"
 #include "confdefs.h"
 
 int testfunction (int arg1, int * arg2); /* check prototypes */
@@ -761,7 +1033,7 @@
  
 ; return 0; }
 EOF
-if { (eval echo configure:765: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1037: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ijg_cv_have_prototypes=yes
 else
@@ -788,18 +1060,18 @@
 fi
 ac_safe=`echo "stddef.h" | sed 'y%./+-%__p_%'`
 echo $ac_n "checking for stddef.h""... $ac_c" 1>&6
-echo "configure:792: checking for stddef.h" >&5
+echo "configure:1064: checking for stddef.h" >&5
 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 797 "configure"
+#line 1069 "configure"
 #include "confdefs.h"
 #include <stddef.h>
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:802: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:1074: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   rm -rf conftest*
   eval "ac_cv_header_$ac_safe=yes"
@@ -824,18 +1096,18 @@
 
 ac_safe=`echo "stdlib.h" | sed 'y%./+-%__p_%'`
 echo $ac_n "checking for stdlib.h""... $ac_c" 1>&6
-echo "configure:828: checking for stdlib.h" >&5
+echo "configure:1100: checking for stdlib.h" >&5
 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 833 "configure"
+#line 1105 "configure"
 #include "confdefs.h"
 #include <stdlib.h>
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:838: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:1110: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   rm -rf conftest*
   eval "ac_cv_header_$ac_safe=yes"
@@ -860,18 +1132,18 @@
 
 ac_safe=`echo "string.h" | sed 'y%./+-%__p_%'`
 echo $ac_n "checking for string.h""... $ac_c" 1>&6
-echo "configure:864: checking for string.h" >&5
+echo "configure:1136: checking for string.h" >&5
 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 869 "configure"
+#line 1141 "configure"
 #include "confdefs.h"
 #include <string.h>
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:874: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:1146: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   rm -rf conftest*
   eval "ac_cv_header_$ac_safe=yes"
@@ -896,9 +1168,9 @@
 fi
 
 echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:900: checking for size_t" >&5
+echo "configure:1172: checking for size_t" >&5
 cat > conftest.$ac_ext <<EOF
-#line 902 "configure"
+#line 1174 "configure"
 #include "confdefs.h"
 
 #ifdef HAVE_STDDEF_H
@@ -919,7 +1191,7 @@
  my_size_t foovar; 
 ; return 0; }
 EOF
-if { (eval echo configure:923: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1195: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ijg_size_t_ok=yes
 else
@@ -933,18 +1205,18 @@
 if test "$ijg_size_t_ok" != yes; then
 ac_safe=`echo "sys/types.h" | sed 'y%./+-%__p_%'`
 echo $ac_n "checking for sys/types.h""... $ac_c" 1>&6
-echo "configure:937: checking for sys/types.h" >&5
+echo "configure:1209: checking for sys/types.h" >&5
 if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 942 "configure"
+#line 1214 "configure"
 #include "confdefs.h"
 #include <sys/types.h>
 EOF
 ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:947: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
-ac_err=`grep -v '^ *+' conftest.out`
+{ (eval echo configure:1219: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
 if test -z "$ac_err"; then
   rm -rf conftest*
   eval "ac_cv_header_$ac_safe=yes"
@@ -964,7 +1236,7 @@
 EOF
 
 cat > conftest.$ac_ext <<EOF
-#line 968 "configure"
+#line 1240 "configure"
 #include "confdefs.h"
 #include <sys/types.h>
 EOF
@@ -990,16 +1262,16 @@
 fi
 fi
 echo $ac_n "checking for type unsigned char""... $ac_c" 1>&6
-echo "configure:994: checking for type unsigned char" >&5
+echo "configure:1266: checking for type unsigned char" >&5
 cat > conftest.$ac_ext <<EOF
-#line 996 "configure"
+#line 1268 "configure"
 #include "confdefs.h"
 
 int main() {
  unsigned char un_char; 
 ; return 0; }
 EOF
-if { (eval echo configure:1003: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1275: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   echo "$ac_t""yes" 1>&6
 cat >> confdefs.h <<\EOF
@@ -1014,16 +1286,16 @@
 fi
 rm -f conftest*
 echo $ac_n "checking for type unsigned short""... $ac_c" 1>&6
-echo "configure:1018: checking for type unsigned short" >&5
+echo "configure:1290: checking for type unsigned short" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1020 "configure"
+#line 1292 "configure"
 #include "confdefs.h"
 
 int main() {
  unsigned short un_short; 
 ; return 0; }
 EOF
-if { (eval echo configure:1027: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1299: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   echo "$ac_t""yes" 1>&6
 cat >> confdefs.h <<\EOF
@@ -1038,9 +1310,9 @@
 fi
 rm -f conftest*
 echo $ac_n "checking for type void""... $ac_c" 1>&6
-echo "configure:1042: checking for type void" >&5
+echo "configure:1314: checking for type void" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1044 "configure"
+#line 1316 "configure"
 #include "confdefs.h"
 
 /* Caution: a C++ compiler will insist on valid prototypes */
@@ -1068,7 +1340,7 @@
  
 ; return 0; }
 EOF
-if { (eval echo configure:1072: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1344: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   echo "$ac_t""yes" 1>&6
 else
@@ -1084,12 +1356,12 @@
 rm -f conftest*
 
 echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:1088: checking for working const" >&5
+echo "configure:1360: checking for working const" >&5
 if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 1093 "configure"
+#line 1365 "configure"
 #include "confdefs.h"
 
 int main() {
@@ -1138,7 +1410,7 @@
 
 ; return 0; }
 EOF
-if { (eval echo configure:1142: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1414: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ac_cv_c_const=yes
 else
@@ -1159,10 +1431,10 @@
 fi
 
 echo $ac_n "checking for inline""... $ac_c" 1>&6
-echo "configure:1163: checking for inline" >&5
+echo "configure:1435: checking for inline" >&5
 ijg_cv_inline=""
 cat > conftest.$ac_ext <<EOF
-#line 1166 "configure"
+#line 1438 "configure"
 #include "confdefs.h"
 
 int main() {
@@ -1170,7 +1442,7 @@
 int bar() { return foo();
 ; return 0; }
 EOF
-if { (eval echo configure:1174: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1446: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ijg_cv_inline="__inline__"
 else
@@ -1178,7 +1450,7 @@
   cat conftest.$ac_ext >&5
   rm -rf conftest*
   cat > conftest.$ac_ext <<EOF
-#line 1182 "configure"
+#line 1454 "configure"
 #include "confdefs.h"
 
 int main() {
@@ -1186,7 +1458,7 @@
 int bar() { return foo();
 ; return 0; }
 EOF
-if { (eval echo configure:1190: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1462: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ijg_cv_inline="__inline"
 else
@@ -1194,7 +1466,7 @@
   cat conftest.$ac_ext >&5
   rm -rf conftest*
   cat > conftest.$ac_ext <<EOF
-#line 1198 "configure"
+#line 1470 "configure"
 #include "confdefs.h"
 
 int main() {
@@ -1202,7 +1474,7 @@
 int bar() { return foo();
 ; return 0; }
 EOF
-if { (eval echo configure:1206: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1478: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   ijg_cv_inline="inline"
 else
@@ -1220,16 +1492,16 @@
 EOF
 
 echo $ac_n "checking for broken incomplete types""... $ac_c" 1>&6
-echo "configure:1224: checking for broken incomplete types" >&5
+echo "configure:1496: checking for broken incomplete types" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1226 "configure"
+#line 1498 "configure"
 #include "confdefs.h"
  typedef struct undefined_structure * undef_struct_ptr; 
 int main() {
 
 ; return 0; }
 EOF
-if { (eval echo configure:1233: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1505: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
   rm -rf conftest*
   echo "$ac_t""ok" 1>&6
 else
@@ -1244,9 +1516,9 @@
 fi
 rm -f conftest*
 echo $ac_n "checking for short external names""... $ac_c" 1>&6
-echo "configure:1248: checking for short external names" >&5
+echo "configure:1520: checking for short external names" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1250 "configure"
+#line 1522 "configure"
 #include "confdefs.h"
 
 int possibly_duplicate_function () { return 0; }
@@ -1256,7 +1528,7 @@
  
 ; return 0; }
 EOF
-if { (eval echo configure:1260: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+if { (eval echo configure:1532: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
   rm -rf conftest*
   echo "$ac_t""ok" 1>&6
 else
@@ -1271,14 +1543,14 @@
 fi
 rm -f conftest*
 echo $ac_n "checking to see if char is signed""... $ac_c" 1>&6
-echo "configure:1275: checking to see if char is signed" >&5
+echo "configure:1547: checking to see if char is signed" >&5
 if test "$cross_compiling" = yes; then
   echo Assuming that char is signed on target machine.
 echo If it is unsigned, this will be a little bit inefficient.
 
 else
   cat > conftest.$ac_ext <<EOF
-#line 1282 "configure"
+#line 1554 "configure"
 #include "confdefs.h"
 
 #ifdef HAVE_PROTOTYPES
@@ -1302,7 +1574,7 @@
   exit(is_char_signed((int) signed_char_check));
 }
 EOF
-if { (eval echo configure:1306: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:1578: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
 then
   echo "$ac_t""no" 1>&6
 cat >> confdefs.h <<\EOF
@@ -1319,12 +1591,12 @@
 fi
 
 echo $ac_n "checking to see if right shift is signed""... $ac_c" 1>&6
-echo "configure:1323: checking to see if right shift is signed" >&5
+echo "configure:1595: checking to see if right shift is signed" >&5
 if test "$cross_compiling" = yes; then
   echo "$ac_t""Assuming that right shift is signed on target machine." 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 1328 "configure"
+#line 1600 "configure"
 #include "confdefs.h"
 
 #ifdef HAVE_PROTOTYPES
@@ -1354,7 +1626,7 @@
   exit(is_shifting_signed(-0x7F7E80B1L));
 }
 EOF
-if { (eval echo configure:1358: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:1630: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
 then
   echo "$ac_t""no" 1>&6
 cat >> confdefs.h <<\EOF
@@ -1371,12 +1643,12 @@
 fi
 
 echo $ac_n "checking to see if fopen accepts b spec""... $ac_c" 1>&6
-echo "configure:1375: checking to see if fopen accepts b spec" >&5
+echo "configure:1647: checking to see if fopen accepts b spec" >&5
 if test "$cross_compiling" = yes; then
   echo "$ac_t""Assuming that it does." 1>&6
 else
   cat > conftest.$ac_ext <<EOF
-#line 1380 "configure"
+#line 1652 "configure"
 #include "confdefs.h"
 
 #include <stdio.h>
@@ -1386,7 +1658,7 @@
   exit(1);
 }
 EOF
-if { (eval echo configure:1390: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:1662: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
 then
   echo "$ac_t""yes" 1>&6
 else
@@ -1428,28 +1700,30 @@
 # SunOS /usr/etc/install
 # IRIX /sbin/install
 # AIX /bin/install
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
 # AFS /usr/afsws/bin/install, which mishandles nonexistent args
 # SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
 # ./install, which can be erroneously created by make from ./install.sh.
 echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6
-echo "configure:1436: checking for a BSD compatible install" >&5
+echo "configure:1709: checking for a BSD compatible install" >&5
 if test -z "$INSTALL"; then
 if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
-    IFS="${IFS= 	}"; ac_save_IFS="$IFS"; IFS="${IFS}:"
+    IFS="${IFS= 	}"; ac_save_IFS="$IFS"; IFS=":"
   for ac_dir in $PATH; do
     # Account for people who put trailing slashes in PATH elements.
     case "$ac_dir/" in
     /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
     *)
       # OSF1 and SCO ODT 3.0 have their own names for install.
-      for ac_prog in ginstall installbsd scoinst install; do
+      # Don't use installbsd from OSF since it installs stuff as root
+      # by default.
+      for ac_prog in ginstall scoinst install; do
         if test -f $ac_dir/$ac_prog; then
 	  if test $ac_prog = install &&
             grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
 	    # AIX install.  It has an incompatible calling convention.
-	    # OSF/1 installbsd also uses dspmsg, but is usable.
 	    :
 	  else
 	    ac_cv_path_install="$ac_dir/$ac_prog -c"
@@ -1479,20 +1753,23 @@
 # It thinks the first close brace ends the variable substitution.
 test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
 
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}'
+
 test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
 
 # Extract the first word of "ranlib", so it can be a program name with args.
 set dummy ranlib; ac_word=$2
 echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:1488: checking for $ac_word" >&5
+echo "configure:1764: checking for $ac_word" >&5
 if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
   echo $ac_n "(cached) $ac_c" 1>&6
 else
   if test -n "$RANLIB"; then
   ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
 else
-  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-  for ac_dir in $PATH; do
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
     test -z "$ac_dir" && ac_dir=.
     if test -f $ac_dir/$ac_word; then
       ac_cv_prog_RANLIB="ranlib"
@@ -1511,30 +1788,186 @@
 fi
 
 
+
+# Make sure we can run config.sub.
+if ${CONFIG_SHELL-/bin/sh} $ac_config_sub sun4 >/dev/null 2>&1; then :  # trial run with alias sun4; only the exit status matters
+else { echo "configure: error: can not run $ac_config_sub" 1>&2; exit 1; }
+fi
+
+echo $ac_n "checking host system type""... $ac_c" 1>&6
+echo "configure:1799: checking host system type" >&5
+
+host_alias=$host  # start from $host; replaced below only when it is NONE
+case "$host_alias" in
+NONE)
+  case $nonopt in
+  NONE)
+    if host_alias=`${CONFIG_SHELL-/bin/sh} $ac_config_guess`; then :  # host and nonopt are both NONE: use $ac_config_guess output
+    else { echo "configure: error: can not guess host type; you must specify one" 1>&2; exit 1; }
+    fi ;;
+  *) host_alias=$nonopt ;;  # otherwise take $nonopt as the alias
+  esac ;;
+esac
+
+host=`${CONFIG_SHELL-/bin/sh} $ac_config_sub $host_alias`  # host becomes whatever $ac_config_sub prints for the alias
+host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`  # first dash-separated field
+host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`  # second dash-separated field
+host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`  # everything after the second dash
+echo "$ac_t""$host" 1>&6
+
+echo $ac_n "checking for Cygwin environment""... $ac_c" 1>&6
+echo "configure:1820: checking for Cygwin environment" >&5
+if eval "test \"`echo '$''{'ac_cv_cygwin'+set}'`\" = set"; then  # result already known
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 1825 "configure"
+#include "confdefs.h"
+
+int main() {
+
+#ifndef __CYGWIN__
+#define __CYGWIN__ __CYGWIN32__
+#endif
+return __CYGWIN__;
+; return 0; }
+EOF
+if { (eval echo configure:1836: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then  # compiles only if __CYGWIN__ or __CYGWIN32__ resolves
+  rm -rf conftest*
+  ac_cv_cygwin=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_cygwin=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_cygwin" 1>&6
+CYGWIN=  # empty unless the probe said yes
+test "$ac_cv_cygwin" = yes && CYGWIN=yes
+echo $ac_n "checking for mingw32 environment""... $ac_c" 1>&6
+echo "configure:1853: checking for mingw32 environment" >&5
+if eval "test \"`echo '$''{'ac_cv_mingw32'+set}'`\" = set"; then  # result already known
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 1858 "configure"
+#include "confdefs.h"
+
+int main() {
+return __MINGW32__;
+; return 0; }
+EOF
+if { (eval echo configure:1865: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then  # compiles only if __MINGW32__ resolves
+  rm -rf conftest*
+  ac_cv_mingw32=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  ac_cv_mingw32=no
+fi
+rm -f conftest*
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_mingw32" 1>&6
+MINGW32=  # empty unless the probe said yes
+test "$ac_cv_mingw32" = yes && MINGW32=yes
+
+
+echo $ac_n "checking for executable suffix""... $ac_c" 1>&6
+echo "configure:1884: checking for executable suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_exeext'+set}'`\" = set"; then  # result already known
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test "$CYGWIN" = yes || test "$MINGW32" = yes; then  # no link test in these two environments
+  ac_cv_exeext=.exe
+else
+  rm -f conftest*
+  echo 'int main () { return 0; }' > conftest.$ac_ext
+  ac_cv_exeext=
+  if { (eval echo configure:1894: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; }; then  # link, then see which files appeared
+    for file in conftest.*; do
+      case $file in
+      *.c | *.o | *.obj) ;;  # source and object files are ignored
+      *) ac_cv_exeext=`echo $file | sed -e s/conftest//` ;;  # what remains after dropping "conftest" is the suffix
+      esac
+    done
+  else
+    { echo "configure: error: installation or configuration problem: compiler cannot create executables." 1>&2; exit 1; }
+  fi
+  rm -f conftest*
+  test x"${ac_cv_exeext}" = x && ac_cv_exeext=no  # no suffixed file found: record "no"
+fi
+fi
+
+EXEEXT=""
+test x"${ac_cv_exeext}" != xno && EXEEXT=${ac_cv_exeext}  # EXEEXT stays empty when the result is "no"
+echo "$ac_t""${ac_cv_exeext}" 1>&6
+ac_exeext=$EXEEXT  # ac_exeext is the suffix $ac_link appends to conftest
+
+
 # Decide whether to use libtool,
 # and if so whether to build shared, static, or both flavors of library.
-LTSHARED="no"
 # Check whether --enable-shared or --disable-shared was given.
 if test "${enable_shared+set}" = set; then
   enableval="$enable_shared"
-  LTSHARED="$enableval"
+  p=${PACKAGE-default}  # name matched against a list-valued argument; "default" when PACKAGE is unset
+case $enableval in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)  # any other value is treated as a list of package names
+  enable_shared=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"  # also split on ':' and ','; restored after the loop
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then  # the list names this package
+      enable_shared=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac
+else
+  enable_shared=no  # option not given at all: result is no
 fi
 
-LTSTATIC="no"
 # Check whether --enable-static or --disable-static was given.
 if test "${enable_static+set}" = set; then
   enableval="$enable_static"
-  LTSTATIC="$enableval"
+  p=${PACKAGE-default}  # name matched against a list-valued argument; "default" when PACKAGE is unset
+case $enableval in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)  # any other value is treated as a list of package names
+  enable_static=no
+  # Look at the argument we got.  We use all the common list separators.
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"  # also split on ':' and ','; restored after the loop
+  for pkg in $enableval; do
+    if test "X$pkg" = "X$p"; then  # the list names this package
+      enable_static=yes
+    fi
+  done
+  IFS="$ac_save_ifs"
+  ;;
+esac
+else
+  enable_static=no  # option not given at all: result is no
 fi
 
-if test "x$LTSHARED" != xno  -o  "x$LTSTATIC" != xno; then
+if test "x$enable_shared" != xno  -o  "x$enable_static" != xno; then
   USELIBTOOL="yes"
-  LIBTOOL="./libtool"
+# LIBTOOL="./libtool"
   O="lo"
   A="la"
   LN='$(LIBTOOL) --mode=link $(CC)'
   INSTALL_LIB='$(LIBTOOL) --mode=install ${INSTALL}'
   INSTALL_PROGRAM="\$(LIBTOOL) --mode=install $INSTALL_PROGRAM"
+  UNINSTALL='$(LIBTOOL) --mode=uninstall $(RM)'
 else
   USELIBTOOL="no"
   LIBTOOL=""
@@ -1542,6 +1975,7 @@
   A="a"
   LN='$(CC)'
   INSTALL_LIB="$INSTALL_DATA"
+  UNINSTALL='$(RM)'
 fi
 
 
@@ -1549,19 +1983,4266 @@
 
 
 
+
 # Configure libtool if needed.
 if test $USELIBTOOL = yes; then
-  disable_shared=
-  disable_static=
-  if test "x$LTSHARED" = xno; then
-    disable_shared="--disable-shared"
-  fi
-  if test "x$LTSTATIC" = xno; then
-    disable_static="--disable-static"
-  fi
-  $srcdir/ltconfig $disable_shared $disable_static $srcdir/ltmain.sh
+  
+  
+  # Find the correct PATH separator.  Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+  UNAME=${UNAME-`uname 2>/dev/null`}
+  case X$UNAME in
+    *-DOS) lt_cv_sys_path_separator=';' ;;
+    *)     lt_cv_sys_path_separator=':' ;;
+  esac
+  PATH_SEPARATOR=$lt_cv_sys_path_separator
 fi
 
+# Check whether --enable-fast-install or --disable-fast-install was given.
+# The value may be yes, no, or a list of package names (split on IFS blanks,
+# colons or commas) that is matched against $PACKAGE; when absent it is yes.
+if test "${enable_fast_install+set}" != set; then
+  enable_fast_install=yes
+else
+  enableval="$enable_fast_install"
+  p=${PACKAGE-default}
+  case $enableval in
+  yes | no)
+    enable_fast_install=$enableval
+    ;;
+  *)
+    enable_fast_install=no
+    IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+    for pkg in $enableval; do
+      test "X$pkg" = "X$p" && enable_fast_install=yes
+    done
+    IFS="$ac_save_ifs"
+    ;;
+  esac
+fi
+
+echo $ac_n "checking build system type""... $ac_c" 1>&6
+echo "configure:2027: checking build system type" >&5
+
+build_alias=$build
+case "$build_alias" in
+NONE)
+  case $nonopt in
+  NONE) build_alias=$host_alias ;;
+  *) build_alias=$nonopt ;;
+  esac ;;
+esac
+
+build=`${CONFIG_SHELL-/bin/sh} $ac_config_sub $build_alias`
+build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+echo "$ac_t""$build" 1>&6
+
+# --with-gnu-ld[=ARG]: not given -> no; explicit "no" -> no; anything else -> yes.
+if test "${with_gnu_ld+set}" != set; then
+  with_gnu_ld=no
+else
+  withval="$with_gnu_ld"
+  case $withval in no) ;; *) with_gnu_ld=yes ;; esac
+fi
+
+ac_prog=ld
+if test "$GCC" = yes; then
+  # Check if gcc -print-prog-name=ld gives a path.
+  echo $ac_n "checking for ld used by GCC""... $ac_c" 1>&6
+echo "configure:2056: checking for ld used by GCC" >&5
+  case $host in
+  *-*-mingw*)
+    # gcc leaves a trailing carriage return which upsets mingw
+    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+  *)
+    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+  esac
+  case $ac_prog in
+    # Accept absolute paths.
+    [\\/]* | [A-Za-z]:[\\/]*)
+      re_direlt='/[^/][^/]*/\.\./'
+      # Canonicalize the path of ld
+      ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+      while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+	ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+      done
+      test -z "$LD" && LD="$ac_prog"
+      ;;
+  "")
+    # If it fails, then pretend we aren't using GCC.
+    ac_prog=ld
+    ;;
+  *)
+    # If it is relative, then search for the first ld in PATH.
+    with_gnu_ld=unknown
+    ;;
+  esac
+elif test "$with_gnu_ld" = yes; then
+  echo $ac_n "checking for GNU ld""... $ac_c" 1>&6
+echo "configure:2086: checking for GNU ld" >&5
+else
+  echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
+echo "configure:2089: checking for non-GNU ld" >&5
+fi
+if eval "test \"`echo '$''{'lt_cv_path_LD'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -z "$LD"; then
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+      lt_cv_path_LD="$ac_dir/$ac_prog"
+      # Check to see if the program is GNU ld.  I'd rather use --version,
+      # but apparently some GNU ld's only accept -v.
+      # Break only if it was the GNU/non-GNU ld that we prefer.
+      if "$lt_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+	test "$with_gnu_ld" != no && break
+      else
+	test "$with_gnu_ld" != yes && break
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+else
+  lt_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$lt_cv_path_LD"
+if test -n "$LD"; then
+  echo "$ac_t""$LD" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+test -z "$LD" && { echo "configure: error: no acceptable ld found in \$PATH" 1>&2; exit 1; }
+echo $ac_n "checking if the linker ($LD) is GNU ld""... $ac_c" 1>&6
+echo "configure:2124: checking if the linker ($LD) is GNU ld" >&5
+if eval "test \"`echo '$''{'lt_cv_prog_gnu_ld'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  # I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+  lt_cv_prog_gnu_ld=yes
+else
+  lt_cv_prog_gnu_ld=no
+fi
+fi
+
+echo "$ac_t""$lt_cv_prog_gnu_ld" 1>&6
+with_gnu_ld=$lt_cv_prog_gnu_ld
+
+
+echo $ac_n "checking for $LD option to reload object files""... $ac_c" 1>&6
+echo "configure:2141: checking for $LD option to reload object files" >&5
+# Honour a cached answer; otherwise fall back to -r.
+case "${lt_cv_ld_reload_flag+set}" in
+set) echo $ac_n "(cached) $ac_c" 1>&6 ;;
+*)   lt_cv_ld_reload_flag='-r' ;;
+esac
+echo "$ac_t""$lt_cv_ld_reload_flag" 1>&6
+reload_flag=$lt_cv_ld_reload_flag
+# A non-empty flag is stored with a leading blank.
+if test -n "$reload_flag"; then reload_flag=" $reload_flag"; fi
+
+echo $ac_n "checking for BSD-compatible nm""... $ac_c" 1>&6
+echo "configure:2153: checking for BSD-compatible nm" >&5
+if eval "test \"`echo '$''{'lt_cv_path_NM'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$NM"; then
+  # Let the user override the test.
+  lt_cv_path_NM="$NM"
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
+  for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+    test -z "$ac_dir" && ac_dir=.
+    tmp_nm=$ac_dir/${ac_tool_prefix}nm
+    if test -f $tmp_nm || test -f $tmp_nm$ac_exeext ; then
+      # Check to see if the nm accepts a BSD-compat flag.
+      # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+      #   nm: unknown option "B" ignored
+      # Tru64's nm complains that /dev/null is an invalid object file
+      if ($tmp_nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep '(/dev/null|Invalid file or object type)' >/dev/null; then
+	lt_cv_path_NM="$tmp_nm -B"
+	break
+      elif ($tmp_nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+	lt_cv_path_NM="$tmp_nm -p"
+	break
+      else
+	lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
+	continue # so that we can try to find one that supports BSD flags
+      fi
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$lt_cv_path_NM" && lt_cv_path_NM=nm
+fi
+fi
+
+NM="$lt_cv_path_NM"
+echo "$ac_t""$NM" 1>&6
+
+echo $ac_n "checking for a sed that does not truncate output""... $ac_c" 1>&6
+echo "configure:2191: checking for a sed that does not truncate output" >&5
+if eval "test \"`echo '$''{'lt_cv_path_SED'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  # Loop through the user's path and test for sed and gsed.
+# Then use that list of sed's as ones to test for truncation.
+as_executable_p="test -f"
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+  for ac_prog in sed gsed; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      if $as_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
+        _sed_list="$_sed_list $as_dir/$ac_prog$ac_exec_ext"
+      fi
+    done
+  done
+done
+
+  # Create a temporary directory, and hook for its removal unless debugging.
+$debug ||
+{
+  trap 'exit_status=$?; rm -rf $tmp && exit $exit_status' 0
+  trap '{ (exit 1); exit 1; }' 1 2 13 15
+}
+
+# Create a (secure) tmp directory for tmp files.
+: ${TMPDIR=/tmp}
+{
+  tmp=`(umask 077 && mktemp -d -q "$TMPDIR/sedXXXXXX") 2>/dev/null` &&
+  test -n "$tmp" && test -d "$tmp"
+}  ||
+{
+  tmp=$TMPDIR/sed$$-$RANDOM
+  (umask 077 && mkdir $tmp)
+} ||
+{
+   echo "$me: cannot create a temporary directory in $TMPDIR" >&2
+   { (exit 1); exit 1; }
+}
+  _max=0
+  _count=0
+  # Keep the candidate that copes with the longest input line.  Also probe
+  # /usr/xpg4/bin/sed: Solaris ships it next to a /bin/sed that truncates.
+  for _sed in $_sed_list /usr/xpg4/bin/sed; do
+    test ! -f ${_sed} && continue   # a missing candidate must not end the scan
+    cat /dev/null > "$tmp/sed.in"
+    _count=0
+    echo ${ECHO_N-$ac_n} "0123456789${ECHO_C-$ac_c}" >"$tmp/sed.in"
+    # GNU sed is selected immediately, without running the length test.
+    if "${_sed}" --version 2>&1 < /dev/null | egrep '(GNU)' > /dev/null; then
+      lt_cv_path_SED=${_sed}
+      break
+    fi
+    while true; do
+      cat "$tmp/sed.in" "$tmp/sed.in" >"$tmp/sed.tmp"
+      mv "$tmp/sed.tmp" "$tmp/sed.in"
+      cp "$tmp/sed.in" "$tmp/sed.nl"
+      echo >>"$tmp/sed.nl"
+      ${_sed} -e 's/a$//' < "$tmp/sed.nl" >"$tmp/sed.out" || break
+      cmp -s "$tmp/sed.out" "$tmp/sed.nl" || break
+      # 40000 chars as input seems more than enough (input doubles each pass)
+      test $_count -gt 10 && break
+      _count=`expr $_count + 1`
+      if test $_count -gt $_max; then
+        _max=$_count
+        lt_cv_path_SED=$_sed
+      fi
+    done
+  done
+  rm -rf "$tmp"
+
+fi
+
+if test "X$SED" != "X"; then
+  lt_cv_path_SED=$SED
+else
+  SED=$lt_cv_path_SED
+fi
+echo "$ac_t""$SED" 1>&6
+
+echo $ac_n "checking whether ln -s works""... $ac_c" 1>&6
+echo "configure:2275: checking whether ln -s works" >&5
+# Probe for symlink support once; a cached verdict is reused as-is.
+if test "${ac_cv_prog_LN_S+set}" = set; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  rm -f conftestdata
+  # Start from plain ln and upgrade only if a symlink can be created.
+  ac_cv_prog_LN_S=ln
+  if ln -s X conftestdata 2>/dev/null; then
+    rm -f conftestdata
+    ac_cv_prog_LN_S="ln -s"
+  fi
+fi
+LN_S="$ac_cv_prog_LN_S"
+# The user sees yes/no rather than the command itself.
+case $ac_cv_prog_LN_S in
+"ln -s") echo "$ac_t""yes" 1>&6 ;;
+*)       echo "$ac_t""no" 1>&6 ;;
+esac
+
+echo $ac_n "checking how to recognise dependent libraries""... $ac_c" 1>&6
+echo "configure:2296: checking how to recognise dependent libraries" >&5
+if eval "test \"`echo '$''{'lt_cv_deplibs_check_method'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  lt_cv_file_magic_cmd='$MAGIC_CMD'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [[regex]]' -- check by looking for files in the library path
+# whose $file_magic_cmd output matches the given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case $host_os in
+aix4* | aix5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+beos*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+bsdi4*)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  ;;
+
+cygwin* | mingw* | pw32*)
+  lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+  lt_cv_file_magic_cmd='$OBJDUMP -f'
+  ;;
+
+darwin* | rhapsody*)
+  lt_cv_deplibs_check_method='file_magic Mach-O dynamically linked shared library'
+  lt_cv_file_magic_cmd='/usr/bin/file -L'
+  case "$host_os" in
+  rhapsody* | darwin1.[012])
+    lt_cv_file_magic_test_file=`echo /System/Library/Frameworks/System.framework/Versions/*/System | head -1`
+    ;;
+  *) # Darwin 1.3 on
+    lt_cv_file_magic_test_file='/usr/lib/libSystem.dylib'
+    ;;
+  esac
+  ;;
+
+freebsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    case $host_cpu in
+    i*86 )
+      # Not sure whether the presence of OpenBSD here was a mistake.
+      # Let's accept both of them until this is cleared up.
+      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD)/i[3-9]86 (compact )?demand paged shared library'
+      lt_cv_file_magic_cmd=/usr/bin/file
+      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+      ;;
+    esac
+  else
+    lt_cv_deplibs_check_method=pass_all
+  fi
+  ;;
+
+gnu*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+hpux10.20*|hpux11*)
+  lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|PA-RISC[0-9].[0-9]) shared library'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libc.sl
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+  irix5* | nonstopux*)
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+    ;;
+  *)
+    case $LD in
+    *-32|*"-32 ") libmagic=32-bit;;
+    *-n32|*"-n32 ") libmagic=N32;;
+    *-64|*"-64 ") libmagic=64-bit;;
+    *) libmagic=never-match;;
+    esac
+    # this will be overridden with pass_all, but let us keep it just in case
+    lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+    ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  case $host_cpu in
+  alpha* | hppa* | i*86 | mips | mipsel | powerpc* | sparc* | ia64* | s390* | x86_64*)
+    lt_cv_deplibs_check_method=pass_all ;;
+  *)
+    # glibc up to 2.1.1 does not perform some relocations on ARM
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+  esac
+  lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+  ;;
+
+netbsd*)
+  if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+    lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so\.[0-9]+\.[0-9]+$'
+  else
+    lt_cv_deplibs_check_method='match_pattern /lib[^/\.]+\.so$'
+  fi
+  ;;
+
+newos6*)
+  lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (executable|dynamic lib)'
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=/usr/lib/libnls.so
+  ;;
+
+openbsd*)
+  lt_cv_file_magic_cmd=/usr/bin/file
+  lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
+  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB shared object'
+  else
+    lt_cv_deplibs_check_method='file_magic OpenBSD.* shared library'
+  fi
+  ;;
+
+osf3* | osf4* | osf5*)
+  # this will be overridden with pass_all, but let us keep it just in case
+  lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+  lt_cv_file_magic_test_file=/shlib/libc.so
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sco3.2v5*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+solaris*)
+  lt_cv_deplibs_check_method=pass_all
+  lt_cv_file_magic_test_file=/lib/libc.so
+  ;;
+
+sysv5uw[78]* | sysv4*uw2*)
+  lt_cv_deplibs_check_method=pass_all
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  case $host_vendor in
+  motorola)
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+    ;;
+  ncr)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  sequent)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )'
+    ;;
+  sni)
+    lt_cv_file_magic_cmd='/bin/file'
+    lt_cv_deplibs_check_method="file_magic ELF [0-9][0-9]*-bit [LM]SB dynamic lib"
+    lt_cv_file_magic_test_file=/lib/libc.so
+    ;;
+  siemens)
+    lt_cv_deplibs_check_method=pass_all
+    ;;
+  esac
+  ;;
+esac
+
+fi
+
+echo "$ac_t""$lt_cv_deplibs_check_method" 1>&6
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+
+echo $ac_n "checking for object suffix""... $ac_c" 1>&6
+echo "configure:2482: checking for object suffix" >&5
+if eval "test \"`echo '$''{'ac_cv_objext'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  rm -f conftest*
+echo 'int i = 1;' > conftest.$ac_ext
+if { (eval echo configure:2488: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  for ac_file in conftest.*; do
+    case $ac_file in
+    *.c) ;;
+    *) ac_cv_objext=`echo $ac_file | sed -e s/conftest.//` ;;
+    esac
+  done
+else
+  { echo "configure: error: installation or configuration problem; compiler does not work" 1>&2; exit 1; }
+fi
+rm -f conftest*
+fi
+
+echo "$ac_t""$ac_cv_objext" 1>&6
+OBJEXT=$ac_cv_objext
+ac_objext=$ac_cv_objext
+
+if test $host != $build; then   # host differs from build: prefix tool names with the host alias
+  ac_tool_prefix=${host_alias}-
+else
+  ac_tool_prefix=               # same system: look tools up under their plain names
+fi
+
+
+
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+echo $ac_n "checking command to parse $NM output""... $ac_c" 1>&6
+echo "configure:2516: checking command to parse $NM output" >&5
+if eval "test \"`echo '$''{'lt_cv_sys_global_symbol_pipe'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+lt_cv_global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Transform an extracted symbol line into symbol name and symbol address
+lt_cv_global_symbol_to_c_name_address="sed -n -e 's/^: \([^ ]*\) $/  {\\\"\1\\\", (lt_ptr) 0},/p' -e 's/^$symcode \([^ ]*\) \([^ ]*\)$/  {\"\2\", (lt_ptr) \&\2},/p'"
+
+# Define system-specific variables.
+case $host_os in
+aix*)
+  symcode='[BCDT]'
+  ;;
+cygwin* | mingw* | pw32*)
+  symcode='[ABCDGISTW]'
+  ;;
+hpux*) # Its linker distinguishes data from code symbols
+  lt_cv_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
+  lt_cv_global_symbol_to_c_name_address="sed -n -e 's/^: \([^ ]*\) $/  {\\\"\1\\\", (lt_ptr) 0},/p' -e 's/^$symcode* \([^ ]*\) \([^ ]*\)$/  {\"\2\", (lt_ptr) \&\2},/p'"
+  ;;
+irix* | nonstopux*)
+  symcode='[BCDEGRST]'
+  ;;
+osf*)
+  symcode='[BCDEGQRST]'
+  ;;
+solaris* | sysv5*)
+  symcode='[BDT]'
+  ;;
+sysv4)
+  symcode='[DFNSTU]'
+  ;;
+esac
+
+# Handle CRLF in mingw tool chain
+opt_cr=
+case $host_os in
+mingw*)
+  opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+  ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+  symcode='[ABCDGISTW]'
+fi
+
+# Try without a prefix underscore, then with it.
+for ac_symprfx in "" "_"; do
+
+  # Write the raw and C identifiers.
+lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[ 	]\($symcode$symcode*\)[ 	][ 	]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+  # Check to see that the pipe works correctly.
+  pipe_works=no
+  rm -f conftest*
+  cat > conftest.$ac_ext <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+int main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+  if { (eval echo configure:2599: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+    # Now try to grab the symbols.
+    nlist=conftest.nm
+    if { (eval echo configure:2602: \"$NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \> $nlist\") 1>&5; (eval $NM conftest.$ac_objext \| $lt_cv_sys_global_symbol_pipe \> $nlist) 2>&5; } && test -s "$nlist"; then
+      # Try sorting and uniquifying the output.
+      if sort "$nlist" | uniq > "$nlist"T; then
+	mv -f "$nlist"T "$nlist"
+      else
+	rm -f "$nlist"T
+      fi
+
+      # Make sure that we snagged all the symbols we need.
+      if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+	if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+	  cat <<EOF > conftest.$ac_ext
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+	  # Now generate the symbol file.
+	  eval "$lt_cv_global_symbol_to_cdecl"' < "$nlist" >> conftest.$ac_ext'
+
+	  cat <<EOF >> conftest.$ac_ext
+#if defined (__STDC__) && __STDC__
+# define lt_ptr void *
+#else
+# define lt_ptr char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+  const char *name;
+  lt_ptr address;
+}
+lt_preloaded_symbols[] =
+{
+EOF
+	  sed "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (lt_ptr) \&\2},/" < "$nlist" >> conftest.$ac_ext
+	  cat <<\EOF >> conftest.$ac_ext
+  {0, (lt_ptr) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+	  # Now try linking the two files.
+	  mv conftest.$ac_objext conftstm.$ac_objext
+	  save_LIBS="$LIBS"
+	  save_CFLAGS="$CFLAGS"
+	  LIBS="conftstm.$ac_objext"
+	  CFLAGS="$CFLAGS$no_builtin_flag"
+	  if { (eval echo configure:2653: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest$ac_exeext; then
+	    pipe_works=yes
+	  fi
+	  LIBS="$save_LIBS"
+	  CFLAGS="$save_CFLAGS"
+	else
+	  echo "cannot find nm_test_func in $nlist" >&5
+	fi
+      else
+	echo "cannot find nm_test_var in $nlist" >&5
+      fi
+    else
+      echo "cannot run $lt_cv_sys_global_symbol_pipe" >&5
+    fi
+  else
+    echo "configure: failed program was:" >&5   # literal prefix, as in the header checks
+    cat conftest.$ac_ext >&5
+  fi
+  rm -f conftest* conftst*
+
+  # Do not use the global_symbol_pipe unless it works.
+  if test "$pipe_works" = yes; then
+    break
+  else
+    lt_cv_sys_global_symbol_pipe=
+  fi
+done
+
+fi
+
+# Publish the symbol pipe and its helper sed scripts; all stay empty if no pipe worked.
+global_symbol_pipe="$lt_cv_sys_global_symbol_pipe"
+if test -z "$lt_cv_sys_global_symbol_pipe"; then
+  global_symbol_to_cdecl=
+  global_symbol_to_c_name_address=
+else
+  global_symbol_to_cdecl="$lt_cv_global_symbol_to_cdecl"
+  global_symbol_to_c_name_address="$lt_cv_global_symbol_to_c_name_address"
+fi
+if test -z "$global_symbol_pipe$global_symbol_to_cdecl$global_symbol_to_c_name_address"; then
+  echo "$ac_t""failed" 1>&6
+else
+  echo "$ac_t""ok" 1>&6
+fi
+
+for ac_hdr in dlfcn.h
+do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "configure:2702: checking for $ac_hdr" >&5
+if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 2707 "configure"
+#include "confdefs.h"
+#include <$ac_hdr>
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:2712: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+  rm -rf conftest*
+  eval "ac_cv_header_$ac_safe=yes"
+else
+  echo "$ac_err" >&5
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+    ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
+  cat >> confdefs.h <<EOF
+#define $ac_tr_hdr 1
+EOF
+ 
+else
+  echo "$ac_t""no" 1>&6
+fi
+done
+
+
+
+
+
+
+# Only perform the check for file, if the check method requires it
+case $deplibs_check_method in
+file_magic*)
+  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
+    echo $ac_n "checking for ${ac_tool_prefix}file""... $ac_c" 1>&6
+echo "configure:2748: checking for ${ac_tool_prefix}file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="/usr/bin:$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/${ac_tool_prefix}file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/${ac_tool_prefix}file"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    egrep "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  echo "$ac_t""$MAGIC_CMD" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+if test -z "$lt_cv_path_MAGIC_CMD"; then
+  if test -n "$ac_tool_prefix"; then
+    echo $ac_n "checking for file""... $ac_c" 1>&6
+echo "configure:2810: checking for file" >&5
+if eval "test \"`echo '$''{'lt_cv_path_MAGIC_CMD'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  case $MAGIC_CMD in
+  /*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
+  ;;
+  ?:/*)
+  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a dos path.
+  ;;
+  *)
+  ac_save_MAGIC_CMD="$MAGIC_CMD"
+  IFS="${IFS=   }"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="/usr/bin:$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/file; then
+      lt_cv_path_MAGIC_CMD="$ac_dir/file"
+      if test -n "$file_magic_test_file"; then
+	case $deplibs_check_method in
+	"file_magic "*)
+	  file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+	    egrep "$file_magic_regex" > /dev/null; then
+	    :
+	  else
+	    cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such.  This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem.  Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+	  fi ;;
+	esac
+      fi
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  MAGIC_CMD="$ac_save_MAGIC_CMD"
+  ;;
+esac
+fi
+
+MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
+if test -n "$MAGIC_CMD"; then
+  echo "$ac_t""$MAGIC_CMD" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+  else
+    MAGIC_CMD=:
+  fi
+fi
+
+  fi
+  ;;
+esac
+
+# Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2881: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+  echo "$ac_t""$RANLIB" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_RANLIB"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2913: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$RANLIB"; then
+  ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_RANLIB="ranlib"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+  echo "$ac_t""$RANLIB" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  RANLIB=":"
+fi
+fi
+
+# Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2948: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+  echo "$ac_t""$STRIP" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_STRIP"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:2980: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_STRIP'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$STRIP"; then
+  ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_STRIP="strip"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_STRIP" && ac_cv_prog_STRIP=":"
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+  echo "$ac_t""$STRIP" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  STRIP=":"
+fi
+fi
+
+
+enable_dlopen=yes
+enable_win32_dll=yes
+
+# --enable-libtool-lock / --disable-libtool-lock: the setting becomes yes
+# unless the user explicitly passed "no".
+test "${enable_libtool_lock+set}" = set && enableval="$enable_libtool_lock"
+case "x$enable_libtool_lock" in
+xno) ;;
+*) enable_libtool_lock=yes ;;
+esac
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case $host in
+*-*-irix6*)
+  # Find out which ABI we are using.
+  echo '#line 3029 "configure"' > conftest.$ac_ext
+  if { (eval echo configure:3030: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+    case `/usr/bin/file conftest.$ac_objext` in
+    *32-bit*)
+      LD="${LD-ld} -32"
+      ;;
+    *N32*)
+      LD="${LD-ld} -n32"
+      ;;
+    *64-bit*)
+      LD="${LD-ld} -64"
+      ;;
+    esac
+  fi
+  rm -rf conftest*
+  ;;
+
+*-*-sco3.2v5*)
+  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+  SAVE_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -belf"
+  echo $ac_n "checking whether the C compiler needs -belf""... $ac_c" 1>&6
+echo "configure:3051: checking whether the C compiler needs -belf" >&5
+if eval "test \"`echo '$''{'lt_cv_cc_needs_belf'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  
+     ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+     cat > conftest.$ac_ext <<EOF
+#line 3064 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3071: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  lt_cv_cc_needs_belf=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  lt_cv_cc_needs_belf=no
+fi
+rm -f conftest*
+     ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+fi
+
+echo "$ac_t""$lt_cv_cc_needs_belf" 1>&6
+  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+    CFLAGS="$SAVE_CFLAGS"
+  fi
+  ;;
+
+*-*-cygwin* | *-*-mingw* | *-*-pw32*)
+  # Extract the first word of "${ac_tool_prefix}dlltool", so it can be a program name with args.
+set dummy ${ac_tool_prefix}dlltool; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3101: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_DLLTOOL'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$DLLTOOL"; then
+  ac_cv_prog_DLLTOOL="$DLLTOOL" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_DLLTOOL="${ac_tool_prefix}dlltool"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+DLLTOOL="$ac_cv_prog_DLLTOOL"
+if test -n "$DLLTOOL"; then
+  echo "$ac_t""$DLLTOOL" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_DLLTOOL"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "dlltool", so it can be a program name with args.
+set dummy dlltool; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3133: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_DLLTOOL'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$DLLTOOL"; then
+  ac_cv_prog_DLLTOOL="$DLLTOOL" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_DLLTOOL="dlltool"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_DLLTOOL" && ac_cv_prog_DLLTOOL="false"
+fi
+fi
+DLLTOOL="$ac_cv_prog_DLLTOOL"
+if test -n "$DLLTOOL"; then
+  echo "$ac_t""$DLLTOOL" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  DLLTOOL="false"
+fi
+fi
+
+  # Extract the first word of "${ac_tool_prefix}as", so it can be a program name with args.
+set dummy ${ac_tool_prefix}as; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3168: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_AS'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$AS"; then
+  ac_cv_prog_AS="$AS" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_AS="${ac_tool_prefix}as"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+AS="$ac_cv_prog_AS"
+if test -n "$AS"; then
+  echo "$ac_t""$AS" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_AS"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "as", so it can be a program name with args.
+set dummy as; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3200: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_AS'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$AS"; then
+  ac_cv_prog_AS="$AS" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_AS="as"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_AS" && ac_cv_prog_AS="false"
+fi
+fi
+AS="$ac_cv_prog_AS"
+if test -n "$AS"; then
+  echo "$ac_t""$AS" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  AS="false"
+fi
+fi
+
+  # Extract the first word of "${ac_tool_prefix}objdump", so it can be a program name with args.
+set dummy ${ac_tool_prefix}objdump; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3235: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_OBJDUMP'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$OBJDUMP"; then
+  ac_cv_prog_OBJDUMP="$OBJDUMP" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_OBJDUMP="${ac_tool_prefix}objdump"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+OBJDUMP="$ac_cv_prog_OBJDUMP"
+if test -n "$OBJDUMP"; then
+  echo "$ac_t""$OBJDUMP" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+
+if test -z "$ac_cv_prog_OBJDUMP"; then
+if test -n "$ac_tool_prefix"; then
+  # Extract the first word of "objdump", so it can be a program name with args.
+set dummy objdump; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:3267: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_OBJDUMP'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$OBJDUMP"; then
+  ac_cv_prog_OBJDUMP="$OBJDUMP" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_OBJDUMP="objdump"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+  test -z "$ac_cv_prog_OBJDUMP" && ac_cv_prog_OBJDUMP="false"
+fi
+fi
+OBJDUMP="$ac_cv_prog_OBJDUMP"
+if test -n "$OBJDUMP"; then
+  echo "$ac_t""$OBJDUMP" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+else
+  OBJDUMP="false"
+fi
+fi
+
+
+  # recent cygwin and mingw systems supply a stub DllMain which the user
+  # can override, but on older systems we have to supply one
+  echo $ac_n "checking if libtool should supply DllMain function""... $ac_c" 1>&6
+echo "configure:3303: checking if libtool should supply DllMain function" >&5
+if eval "test \"`echo '$''{'lt_cv_need_dllmain'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 3308 "configure"
+#include "confdefs.h"
+
+int main() {
+extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+      DllMain (0, 0, 0);
+; return 0; }
+EOF
+if { (eval echo configure:3316: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  lt_cv_need_dllmain=no
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  lt_cv_need_dllmain=yes
+fi
+rm -f conftest*
+fi
+
+echo "$ac_t""$lt_cv_need_dllmain" 1>&6
+
+  case $host/$CC in
+  *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+    # old mingw systems require "-dll" to link a DLL, while more recent ones
+    # require "-mdll"
+    SAVE_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS -mdll"
+    echo $ac_n "checking how to link DLLs""... $ac_c" 1>&6
+echo "configure:3337: checking how to link DLLs" >&5
+if eval "test \"`echo '$''{'lt_cv_cc_dll_switch'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 3342 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3349: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  lt_cv_cc_dll_switch=-mdll
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  lt_cv_cc_dll_switch=-dll
+fi
+rm -f conftest*
+fi
+
+echo "$ac_t""$lt_cv_cc_dll_switch" 1>&6
+    CFLAGS="$SAVE_CFLAGS" ;;
+  *-*-cygwin* | *-*-pw32*)
+    # cygwin systems need to pass --dll to the linker, and not link
+    # crt.o which will require a WinMain@16 definition.
+    lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+  esac
+  ;;
+  
+esac
+
+# Sed substitution that helps us do robust quoting.  It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# Constants:
+rm="rm -f"
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+libext=a
+ltmain="$ac_aux_dir/ltmain.sh"
+ofile="$default_ofile"
+with_gnu_ld="$lt_cv_prog_gnu_ld"
+need_locks="$enable_libtool_lock"
+
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AR_FLAGS" && AR_FLAGS=cru
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$MAGIC_CMD" && MAGIC_CMD=file
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$ac_objext" && ac_objext=o
+
+if test x"$host" != x"$build"; then
+  ac_tool_prefix=${host_alias}-
+else
+  ac_tool_prefix=
+fi
+
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
+case $host_os in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
+esac
+
+case $host_os in
+aix3*)
+  # AIX sometimes has problems with the GCC collect2 program.  For some
+  # reason, if we set the COLLECT_NAMES environment variable, the problems
+  # vanish in a puff of smoke.
+  if test "X${COLLECT_NAMES+set}" != Xset; then
+    COLLECT_NAMES=
+    export COLLECT_NAMES
+  fi
+  ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+if test -n "$RANLIB"; then
+  case $host_os in
+  openbsd*)
+    old_postinstall_cmds="\$RANLIB -t \$oldlib~$old_postinstall_cmds"
+    ;;
+  *)
+    old_postinstall_cmds="\$RANLIB \$oldlib~$old_postinstall_cmds"
+    ;;
+  esac
+  old_archive_cmds="$old_archive_cmds~\$RANLIB \$oldlib"
+fi
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="$2"
+
+echo $ac_n "checking for objdir""... $ac_c" 1>&6
+echo "configure:3463: checking for objdir" >&5
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+  objdir=.libs
+else
+  # MS-DOS does not allow filenames that begin with a dot.
+  objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+echo "$ac_t""$objdir" 1>&6
+
+
+# Check whether --with-pic or --without-pic was given.
+if test "${with_pic+set}" = set; then
+  withval="$with_pic"
+  pic_mode="$withval"
+else
+  pic_mode=default
+fi
+
+test -z "$pic_mode" && pic_mode=default
+
+# We assume here that the value for lt_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+echo $ac_n "checking for $compiler option to produce PIC""... $ac_c" 1>&6
+echo "configure:3490: checking for $compiler option to produce PIC" >&5
+if eval "test \"`echo '$''{'lt_cv_prog_cc_pic'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+   lt_cv_prog_cc_pic=
+  lt_cv_prog_cc_shlib=
+  lt_cv_prog_cc_wl=
+  lt_cv_prog_cc_static=
+  lt_cv_prog_cc_no_builtin=
+  lt_cv_prog_cc_can_build_shared=$can_build_shared
+
+  if test "$GCC" = yes; then
+    lt_cv_prog_cc_wl='-Wl,'
+    lt_cv_prog_cc_static='-static'
+
+    case $host_os in
+    aix*)
+      # Below is a dirty hack to force normal static linking with -ldl.
+      # The problem is that libdl is dynamically linked with both libc and
+      # libC (the AIX C++ library), which gcc obviously does not include in
+      # its list of libraries.  This causes undefined symbols with -static.
+      # This hack allows C programs to be linked with "-static -ldl", but
+      # we are not sure about C++ programs.
+      lt_cv_prog_cc_static="$lt_cv_prog_cc_static ${lt_cv_prog_cc_wl}-lC"
+      ;;
+    amigaos*)
+      # FIXME: we need at least 68020 code to build shared libraries, but
+      # adding the `-m68020' flag to GCC prevents building anything better,
+      # like `-m68040'.
+      lt_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+      ;;
+    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
+      # PIC is the default for these OSes.
+      ;;
+    darwin* | rhapsody*)
+      # PIC is the default on this platform
+      # Common symbols not allowed in MH_DYLIB files
+      lt_cv_prog_cc_pic='-fno-common'
+      ;;
+    cygwin* | mingw* | pw32* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+    sysv4*MP*)
+      if test -d /usr/nec; then
+	 lt_cv_prog_cc_pic=-Kconform_pic
+      fi
+      ;;
+    *)
+      lt_cv_prog_cc_pic='-fPIC'
+      ;;
+    esac
+  else
+    # PORTME Check for PIC flags for the system compiler.
+    case $host_os in
+    aix3* | aix4* | aix5*)
+      lt_cv_prog_cc_wl='-Wl,'
+      # All AIX code is PIC.
+      if test "$host_cpu" = ia64; then
+	# AIX 5 now supports IA64 processor
+	lt_cv_prog_cc_static='-Bstatic'
+      else
+	lt_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+      fi
+      ;;
+
+    hpux9* | hpux10* | hpux11*)
+      # Is there a better lt_cv_prog_cc_static that works with the bundled CC?
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static="${lt_cv_prog_cc_wl}-a ${lt_cv_prog_cc_wl}archive"
+      lt_cv_prog_cc_pic='+Z'
+      ;;
+
+    irix5* | irix6* | nonstopux*)
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static='-non_shared'
+      # PIC (with -KPIC) is the default.
+      ;;
+
+    cygwin* | mingw* | pw32* | os2*)
+      # This hack is so that the source file can tell whether it is being
+      # built for inclusion in a dll (and should export symbols for example).
+      lt_cv_prog_cc_pic='-DDLL_EXPORT'
+      ;;
+
+    newsos6)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    osf3* | osf4* | osf5*)
+      # All OSF/1 code is PIC.
+      lt_cv_prog_cc_wl='-Wl,'
+      lt_cv_prog_cc_static='-non_shared'
+      ;;
+
+    sco3.2v5*)
+      lt_cv_prog_cc_pic='-Kpic'
+      lt_cv_prog_cc_static='-dn'
+      lt_cv_prog_cc_shlib='-belf'
+      ;;
+
+    solaris*)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    sunos4*)
+      lt_cv_prog_cc_pic='-PIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Qoption ld '
+      ;;
+
+    sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+      lt_cv_prog_cc_pic='-KPIC'
+      lt_cv_prog_cc_static='-Bstatic'
+      lt_cv_prog_cc_wl='-Wl,'
+      ;;
+
+    uts4*)
+      lt_cv_prog_cc_pic='-pic'
+      lt_cv_prog_cc_static='-Bstatic'
+      ;;
+
+    sysv4*MP*)
+      if test -d /usr/nec ;then
+	lt_cv_prog_cc_pic='-Kconform_pic'
+	lt_cv_prog_cc_static='-Bstatic'
+      fi
+      ;;
+
+    *)
+      lt_cv_prog_cc_can_build_shared=no
+      ;;
+    esac
+  fi
+
+fi
+
+if test -z "$lt_cv_prog_cc_pic"; then
+  echo "$ac_t""none" 1>&6
+else
+  echo "$ac_t""$lt_cv_prog_cc_pic" 1>&6
+
+  # Check to make sure the pic_flag actually works.
+  echo $ac_n "checking if $compiler PIC flag $lt_cv_prog_cc_pic works""... $ac_c" 1>&6
+echo "configure:3638: checking if $compiler PIC flag $lt_cv_prog_cc_pic works" >&5
+  if eval "test \"`echo '$''{'lt_cv_prog_cc_pic_works'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+      save_CFLAGS="$CFLAGS"
+    CFLAGS="$CFLAGS $lt_cv_prog_cc_pic -DPIC"
+    cat > conftest.$ac_ext <<EOF
+#line 3645 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3652: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  rm -rf conftest*
+        case $host_os in
+      hpux9* | hpux10* | hpux11*)
+	# On HP-UX, both CC and GCC only warn that PIC is not supported... then
+	# they create non-PIC objects.  So, if there were any warnings, we
+	# assume that PIC is not supported.
+	if test -s conftest.err; then
+	  lt_cv_prog_cc_pic_works=no
+	else
+	  lt_cv_prog_cc_pic_works=yes
+	fi
+	;;
+      *)
+	lt_cv_prog_cc_pic_works=yes
+	;;
+      esac
+    
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+        lt_cv_prog_cc_pic_works=no
+    
+fi
+rm -f conftest*
+    CFLAGS="$save_CFLAGS"
+  
+fi
+
+
+  if test "X$lt_cv_prog_cc_pic_works" = Xno; then
+    lt_cv_prog_cc_pic=
+    lt_cv_prog_cc_can_build_shared=no
+  else
+    lt_cv_prog_cc_pic=" $lt_cv_prog_cc_pic"
+  fi
+
+  echo "$ac_t""$lt_cv_prog_cc_pic_works" 1>&6
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$lt_cv_prog_cc_shlib"; then
+  echo "configure: warning: \`$CC' requires \`$lt_cv_prog_cc_shlib' to build shared libraries" 1>&2
+  if echo "$old_CC $old_CFLAGS " | egrep -e "[ 	]$lt_cv_prog_cc_shlib[ 	]" >/dev/null; then :
+  else
+   echo "configure: warning: add \`$lt_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2
+    lt_cv_prog_cc_can_build_shared=no
+  fi
+fi
+
+echo $ac_n "checking if $compiler static flag $lt_cv_prog_cc_static works""... $ac_c" 1>&6
+echo "configure:3704: checking if $compiler static flag $lt_cv_prog_cc_static works" >&5
+if eval "test \"`echo '$''{'lt_cv_prog_cc_static_works'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+    lt_cv_prog_cc_static_works=no
+  save_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $lt_cv_prog_cc_static"
+  cat > conftest.$ac_ext <<EOF
+#line 3712 "configure"
+#include "confdefs.h"
+
+int main() {
+
+; return 0; }
+EOF
+if { (eval echo configure:3719: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  lt_cv_prog_cc_static_works=yes
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+fi
+rm -f conftest*
+  LDFLAGS="$save_LDFLAGS"
+
+fi
+
+
+# Belt *and* braces to stop my trousers falling down:
+test "X$lt_cv_prog_cc_static_works" = Xno && lt_cv_prog_cc_static=
+echo "$ac_t""$lt_cv_prog_cc_static_works" 1>&6
+
+pic_flag="$lt_cv_prog_cc_pic"
+special_shlib_compile_flags="$lt_cv_prog_cc_shlib"
+wl="$lt_cv_prog_cc_wl"
+link_static_flag="$lt_cv_prog_cc_static"
+no_builtin_flag="$lt_cv_prog_cc_no_builtin"
+can_build_shared="$lt_cv_prog_cc_can_build_shared"
+
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+echo $ac_n "checking if $compiler supports -c -o file.$ac_objext""... $ac_c" 1>&6
+echo "configure:3746: checking if $compiler supports -c -o file.$ac_objext" >&5
+if eval "test \"`echo '$''{'lt_cv_compiler_c_o'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  
+$rm -r conftest 2>/dev/null
+mkdir conftest
+cd conftest
+echo "int some_variable = 0;" > conftest.$ac_ext
+mkdir out
+# According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+# that will create temporary files in the current directory regardless of
+# the output directory.  Thus, making CWD read-only will cause this test
+# to fail, enabling locking or at least warning the user not to do parallel
+# builds.
+chmod -w .
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
+compiler_c_o=no
+if { (eval echo configure:3765: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
+  # The compiler can only warn and ignore the option if not recognized
+  # So say no if there are warnings
+  if test -s out/conftest.err; then
+    lt_cv_compiler_c_o=no
+  else
+    lt_cv_compiler_c_o=yes
+  fi
+else
+  # Append any errors to the config.log.
+  cat out/conftest.err 1>&5
+  lt_cv_compiler_c_o=no
+fi
+CFLAGS="$save_CFLAGS"
+chmod u+w .
+$rm conftest* out/*
+rmdir out
+cd ..
+rmdir conftest
+$rm -r conftest 2>/dev/null
+
+fi
+
+compiler_c_o=$lt_cv_compiler_c_o
+echo "$ac_t""$compiler_c_o" 1>&6
+
+if test x"$compiler_c_o" = x"yes"; then
+  # Check to see if we can write to a .lo
+  echo $ac_n "checking if $compiler supports -c -o file.lo""... $ac_c" 1>&6
+echo "configure:3794: checking if $compiler supports -c -o file.lo" >&5
+  if eval "test \"`echo '$''{'lt_cv_compiler_o_lo'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  
+  lt_cv_compiler_o_lo=no
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -c -o conftest.lo"
+  save_objext="$ac_objext"
+  ac_objext=lo
+  cat > conftest.$ac_ext <<EOF
+#line 3805 "configure"
+#include "confdefs.h"
+
+int main() {
+int some_variable = 0;
+; return 0; }
+EOF
+if { (eval echo configure:3812: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  rm -rf conftest*
+      # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+    if test -s conftest.err; then
+      lt_cv_compiler_o_lo=no
+    else
+      lt_cv_compiler_o_lo=yes
+    fi
+  
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+fi
+rm -f conftest*
+  ac_objext="$save_objext"
+  CFLAGS="$save_CFLAGS"
+  
+fi
+
+  compiler_o_lo=$lt_cv_compiler_o_lo
+  echo "$ac_t""$compiler_o_lo" 1>&6
+else
+  compiler_o_lo=no
+fi
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+  # do not overwrite the value of need_locks provided by the user
+  echo $ac_n "checking if we can lock with hard links""... $ac_c" 1>&6
+echo "configure:3843: checking if we can lock with hard links" >&5
+  hard_links=yes
+  $rm conftest*
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  touch conftest.a
+  ln conftest.a conftest.b 2>&5 || hard_links=no
+  ln conftest.a conftest.b 2>/dev/null && hard_links=no
+  echo "$ac_t""$hard_links" 1>&6
+  if test "$hard_links" = no; then
+    echo "configure: warning: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" 1>&2
+    need_locks=warn
+  fi
+else
+  need_locks=no
+fi
+
+if test "$GCC" = yes; then
+  # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+  echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions""... $ac_c" 1>&6
+echo "configure:3862: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+  echo "int some_variable = 0;" > conftest.$ac_ext
+  save_CFLAGS="$CFLAGS"
+  CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.$ac_ext"
+  compiler_rtti_exceptions=no
+  cat > conftest.$ac_ext <<EOF
+#line 3868 "configure"
+#include "confdefs.h"
+
+int main() {
+int some_variable = 0;
+; return 0; }
+EOF
+if { (eval echo configure:3875: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+  rm -rf conftest*
+      # The compiler can only warn and ignore the option if not recognized
+    # So say no if there are warnings
+    if test -s conftest.err; then
+      compiler_rtti_exceptions=no
+    else
+      compiler_rtti_exceptions=yes
+    fi
+  
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+fi
+rm -f conftest*
+  CFLAGS="$save_CFLAGS"
+  echo "$ac_t""$compiler_rtti_exceptions" 1>&6
+
+  if test "$compiler_rtti_exceptions" = "yes"; then
+    no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+  else
+    no_builtin_flag=' -fno-builtin'
+  fi
+fi
+
+# See if the linker supports building shared libraries.
+echo $ac_n "checking whether the linker ($LD) supports shared libraries""... $ac_c" 1>&6
+echo "configure:3902: checking whether the linker ($LD) supports shared libraries" >&5
+
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude.
+# It will be wrapped by ` (' and `)$', so it must not match the beginning or
+# end of a line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid C symbol name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced.  Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+case $host_os in
+cygwin* | mingw* | pw32*)
+  # FIXME: the MSVC++ port hasn't been tested in a loooong time
+  # When not using gcc, we currently assume that we are using
+  # Microsoft Visual C++.
+  if test "$GCC" != yes; then
+    with_gnu_ld=no
+  fi
+  ;;
+openbsd*)
+  with_gnu_ld=no
+  ;;
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+  # If archive_cmds runs LD, not CC, wlarc should be empty
+  wlarc='${wl}'
+
+  # See if GNU ld supports shared libraries.
+  case $host_os in
+  aix3* | aix4* | aix5*)
+    # On AIX, the GNU linker is very broken
+    # Note: Check the GNU linker on AIX 5-IA64 when/if it becomes available.
+    ld_shlibs=no
+    cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support.  If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+
+    # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+    # that the semantics of dynamic libraries on AmigaOS, at least up
+    # to version 4, is to share data among multiple programs linked
+    # with the same dynamic library.  Since this doesn't match the
+    # behavior of shared libraries on other platforms, we can't use
+    # them.
+    ld_shlibs=no
+    ;;
+
+  beos*)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      allow_undefined_flag=unsupported
+      # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+      # support --undefined.  This deserves some investigation.  FIXME
+      archive_cmds='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec='-L$libdir'
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+
+    extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+      sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //;s/^# *$//; p; }" -e d < $''0 > $output_objdir/impgen.c~
+      test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+      if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+      else $CC -o impgen impgen.c ; fi)~
+      $output_objdir/impgen $dir/$soroot > $output_objdir/$soname-def'
+
+    old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+    # cygwin and mingw dlls have different entry points and sets of symbols
+    # to exclude.
+    # FIXME: what about values for MSVC?
+    dll_entry=__cygwin_dll_entry@12
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+    case $host_os in
+    mingw*)
+      # mingw values
+      dll_entry=_DllMainCRTStartup@12
+      dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+      ;;
+    esac
+
+    # mingw and cygwin differ, and it's simplest to just exclude the union
+    # of the two symbol sets.
+    dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+    # recent cygwin and mingw systems supply a stub DllMain which the user
+    # can override, but on older systems we have to supply one (in ltdll.c)
+    if test "x$lt_cv_need_dllmain" = "xyes"; then
+      ltdll_obj='$output_objdir/$soname-ltdll.'"$ac_objext "
+      ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $''0 > $output_objdir/$soname-ltdll.c~
+	test -f $output_objdir/$soname-ltdll.$ac_objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+    else
+      ltdll_obj=
+      ltdll_cmds=
+    fi
+
+    # Extract the symbol export list from an `--export-all' def file,
+    # then regenerate the def file from the symbol export list, so that
+    # the compiled dll only exports the symbol export list.
+    # Be careful not to strip the DATA tag left by newer dlltools.
+    export_symbols_cmds="$ltdll_cmds"'
+      $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+      sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+    # If the export-symbols file already is a .def file (1st line
+    # is EXPORTS), use it as is.
+    # If DATA tags from a recent dlltool are present, honour them!
+    archive_expsym_cmds='if test "x`sed 1q $export_symbols`" = xEXPORTS; then
+	cp $export_symbols $output_objdir/$soname-def;
+      else
+	echo EXPORTS > $output_objdir/$soname-def;
+	_lt_hint=1;
+	cat $export_symbols | while read symbol; do
+	 set dummy \$symbol;
+	 case \$# in
+	   2) echo "   \$2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+	   4) echo "   \$2 \$3 \$4 ; " >> $output_objdir/$soname-def; _lt_hint=`expr \$_lt_hint - 1`;;
+	   *) echo "     \$2 @ \$_lt_hint \$3 ; " >> $output_objdir/$soname-def;;
+	 esac;
+	 _lt_hint=`expr 1 + \$_lt_hint`;
+	done;
+      fi~
+      '"$ltdll_cmds"'
+      $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+      $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+      $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp --output-lib $output_objdir/$libname.dll.a~
+      $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $output_objdir/$soname '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+      wlarc=
+    else
+      archive_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared -nodefaultlibs $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    fi
+    ;;
+
+  solaris* | sysv5*)
+    if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+      ld_shlibs=no
+      cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems.  Therefore, libtool
+*** is disabling shared libraries support.  We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer.  Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+    elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+
+  sunos4*)
+    archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    wlarc=
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+      archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+    else
+      ld_shlibs=no
+    fi
+    ;;
+  esac
+
+  if test "$ld_shlibs" = yes; then
+    runpath_var=LD_RUN_PATH
+    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+    export_dynamic_flag_spec='${wl}--export-dynamic'
+    case $host_os in
+    cygwin* | mingw* | pw32*)
+      # dlltool doesn't understand --whole-archive et al.
+      whole_archive_flag_spec=
+      ;;
+    *)
+      # ancient GNU ld didn't support --whole-archive et al.
+      if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+	whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+      else
+	whole_archive_flag_spec=
+      fi
+      ;;
+    esac
+  fi
+else
+  # PORTME fill in a description of your system's linker (not GNU ld)
+  case $host_os in
+  aix3*)
+    allow_undefined_flag=unsupported
+    always_export_symbols=yes
+    archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
+    # Note: this linker hardcodes the directories in LIBPATH if there
+    # are no directories specified by -L.
+    hardcode_minus_L=yes
+    if test "$GCC" = yes && test -z "$link_static_flag"; then
+      # Neither direct hardcoding nor static linking is supported with a
+      # broken collect2.
+      hardcode_direct=unsupported
+    fi
+    ;;
+
+  aix4* | aix5*)
+    if test "$host_cpu" = ia64; then
+      # On IA64, the linker does run time linking by default, so we don't
+      # have to do anything special.
+      aix_use_runtimelinking=no
+      exp_sym_flag='-Bexport'
+      no_entry_flag=""
+    else
+      aix_use_runtimelinking=no
+
+      # Test if we are trying to use run time linking or normal
+      # AIX style linking. If -brtl is somewhere in LDFLAGS, we
+      # need to do runtime linking.
+      case $host_os in aix4.[23]|aix4.[23].*|aix5*)
+	for ld_flag in $LDFLAGS; do
+	  case $ld_flag in
+	  *-brtl*)
+	    aix_use_runtimelinking=yes
+	    break
+	  ;;
+	  esac
+	done
+      esac
+
+      exp_sym_flag='-bexport'
+      no_entry_flag='-bnoentry'
+    fi
+
+    # When large executables or shared objects are built, AIX ld can
+    # have problems creating the table of contents.  If linking a library
+    # or program results in "error TOC overflow" add -mminimal-toc to
+    # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
+    # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
+
+    hardcode_direct=yes
+    archive_cmds=''
+    hardcode_libdir_separator=':'
+    if test "$GCC" = yes; then
+      case $host_os in aix4.[012]|aix4.[012].*)
+	collect2name=`${CC} -print-prog-name=collect2`
+	if test -f "$collect2name" && \
+	  strings "$collect2name" | grep resolve_lib_name >/dev/null
+	then
+	  # We have reworked collect2
+	  hardcode_direct=yes
+	else
+	  # We have old collect2
+	  hardcode_direct=unsupported
+	  # It fails to find uninstalled libraries when the uninstalled
+	  # path is not listed in the libpath.  Setting hardcode_minus_L
+	  # to unsupported forces relinking
+	  hardcode_minus_L=yes
+	  hardcode_libdir_flag_spec='-L$libdir'
+	  hardcode_libdir_separator=
+	fi
+      esac
+
+      shared_flag='-shared'
+    else
+      # not using gcc
+      if test "$host_cpu" = ia64; then
+	shared_flag='${wl}-G'
+      else
+	if test "$aix_use_runtimelinking" = yes; then
+	  shared_flag='${wl}-G'
+	else
+	  shared_flag='${wl}-bM:SRE'
+	fi
+      fi
+    fi
+
+    # It seems that -bexpall can do strange things, so it is better to
+    # generate a list of symbols to export.
+    always_export_symbols=yes
+    if test "$aix_use_runtimelinking" = yes; then
+      # Warning - without using the other runtime loading flags (-brtl),
+      # -berok will link without error, but may produce a broken library.
+      allow_undefined_flag='-berok'
+      hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:/usr/lib:/lib'
+      archive_expsym_cmds="\$CC"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then echo "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols $shared_flag"
+    else
+      if test "$host_cpu" = ia64; then
+	hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib'
+	allow_undefined_flag="-z nodefs"
+	archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname ${wl}-h$soname $libobjs $deplibs $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"
+      else
+	hardcode_libdir_flag_spec='${wl}-bnolibpath ${wl}-blibpath:$libdir:/usr/lib:/lib'
+	# Warning - without using the other run time loading flags,
+	# -berok will link without error, but may produce a broken library.
+	allow_undefined_flag='${wl}-berok'
+	# This is a bit strange, but is similar to how AIX traditionally builds
+	# its shared libraries.
+	archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${allow_undefined_flag} '"\${wl}$no_entry_flag \${wl}$exp_sym_flag:\$export_symbols"' ~$AR -crlo $objdir/$libname$release.a $objdir/$soname'
+      fi
+    fi
+    ;;
+
+  amigaos*)
+    archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    # see comment about different semantics on the GNU ld section
+    ld_shlibs=no
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    # When not using gcc, we currently assume that we are using
+    # Microsoft Visual C++.
+    # hardcode_libdir_flag_spec is actually meaningless, as there is
+    # no search path for DLLs.
+    hardcode_libdir_flag_spec=' '
+    allow_undefined_flag=unsupported
+    # Tell ltmain to make .lib files, not .a files.
+    libext=lib
+    # FIXME: Setting linknames here is a bad hack.
+    archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+    # The linker will automatically build a .lib file if we build a DLL.
+    old_archive_from_new_cmds='true'
+    # FIXME: Should let the user specify the lib program.
+    old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+    fix_srcfile_path='`cygpath -w "$srcfile"`'
+    ;;
+
+  darwin* | rhapsody*)
+    case "$host_os" in
+    rhapsody* | darwin1.[012])
+      allow_undefined_flag='-undefined suppress'
+      ;;
+    *) # Darwin 1.3 on
+      allow_undefined_flag='-flat_namespace -undefined suppress'
+      ;;
+    esac
+    # FIXME: Relying on posixy $() will cause problems for
+    #        cross-compilation, but unfortunately the echo tests do not
+    #        yet detect zsh echo's removal of \ escapes.  Also zsh mangles
+    #	     `"' quotes if we put them in here... so don't!
+    archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)'
+    # We need to add '_' to the symbols in $export_symbols first
+    #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    whole_archive_flag_spec='-all_load $convenience'
+    ;;
+
+  freebsd1*)
+    ld_shlibs=no
+    ;;
+
+  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+  # support.  Future versions do this automatically, but an explicit c++rt0.o
+  # does not break anything, and helps significantly (at the cost of a little
+  # extra space).
+  freebsd2.2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+  freebsd2*)
+    archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+  freebsd*)
+    archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  hpux9* | hpux10* | hpux11*)
+    case $host_os in
+    hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+    *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+    esac
+    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_direct=yes
+    hardcode_minus_L=yes # Not in the search PATH, but as the default
+			 # location of the library.
+    export_dynamic_flag_spec='${wl}-E'
+    ;;
+
+  irix5* | irix6* | nonstopux*)
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    link_all_deplibs=yes
+    ;;
+
+  netbsd*)
+    if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+      archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
+    else
+      archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
+    fi
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  newsos6)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    hardcode_shlibpath_var=no
+    ;;
+
+  openbsd*)
+    hardcode_direct=yes
+    hardcode_shlibpath_var=no
+    if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+      archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+      hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+      export_dynamic_flag_spec='${wl}-E'
+    else
+      case "$host_os" in
+      openbsd[01].* | openbsd2.[0-7] | openbsd2.[0-7].*)
+	archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+	hardcode_libdir_flag_spec='-R$libdir'
+        ;;
+      *)
+        archive_cmds='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
+        hardcode_libdir_flag_spec='${wl}-rpath,$libdir'
+        ;;
+      esac
+    fi
+    ;;
+
+  os2*)
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_minus_L=yes
+    allow_undefined_flag=unsupported
+    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+    old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+    ;;
+
+  osf3*)
+    if test "$GCC" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+    fi
+    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    hardcode_libdir_separator=:
+    ;;
+
+  osf4* | osf5*)	# as osf3* with the addition of -msym flag
+    if test "$GCC" = yes; then
+      allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+      archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+      hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+    else
+      allow_undefined_flag=' -expect_unresolved \*'
+      archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+      archive_expsym_cmds='for i in `cat $export_symbols`; do printf "-exported_symbol " >> $lib.exp; echo "\$i" >> $lib.exp; done; echo "-hidden">> $lib.exp~
+      $LD -shared${allow_undefined_flag} -input $lib.exp $linker_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${objdir}/so_locations -o $lib~$rm $lib.exp'
+
+      #Both c and cxx compiler support -rpath directly
+      hardcode_libdir_flag_spec='-rpath $libdir'
+    fi
+    hardcode_libdir_separator=:
+    ;;
+
+  sco3.2v5*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    runpath_var=LD_RUN_PATH
+    hardcode_runpath_var=yes
+    export_dynamic_flag_spec='${wl}-Bexport'
+    ;;
+
+  solaris*)
+    # gcc --version < 3.0 without binutils cannot create self contained
+    # shared libraries reliably, requiring libgcc.a to resolve some of
+    # the object symbols generated in some cases.  Libraries that use
+    # assert need libgcc.a to resolve __eprintf, for example.  Linking
+    # a copy of libgcc.a into every shared library to guarantee resolving
+    # such symbols causes other problems:  According to Tim Van Holder
+    # <tim.van.holder@pandora.be>, C++ libraries end up with a separate
+    # (to the application) exception stack for one thing.
+    no_undefined_flag=' -z defs'
+    if test "$GCC" = yes; then
+      case `$CC --version 2>/dev/null` in
+      [12].*)
+	cat <<EOF 1>&2
+
+*** Warning: Releases of GCC earlier than version 3.0 cannot reliably
+*** create self contained shared libraries on Solaris systems, without
+*** introducing a dependency on libgcc.a.  Therefore, libtool is disabling
+*** -no-undefined support, which will at least allow you to build shared
+*** libraries.  However, you may find that when you link such libraries
+*** into an application without using GCC, you have to manually add
+*** \`gcc --print-libgcc-file-name\` to the link command.  We urge you to
+*** upgrade to a newer version of GCC.  Another option is to rebuild your
+*** current GCC to use the GNU linker from GNU binutils 2.9.1 or newer.
+
+EOF
+        no_undefined_flag=
+	;;
+      esac
+    fi
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+		$LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec='-R$libdir'
+    hardcode_shlibpath_var=no
+    case $host_os in
+    solaris2.[0-5] | solaris2.[0-5].*) ;;
+    *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+      whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+    esac
+    link_all_deplibs=yes
+    ;;
+
+  sunos4*)
+    if test "x$host_vendor" = xsequent; then
+      # Use $CC to link under sequent, because it throws in some extra .o
+      # files that make .init and .fini sections work.
+      archive_cmds='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+    fi
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_direct=yes
+    hardcode_minus_L=yes
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4)
+    case $host_vendor in
+      sni)
+        archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+        hardcode_direct=yes # is this really true???
+        ;;
+      siemens)
+        ## LD is ld it makes a PLAMLIB
+        ## CC just makes a GrossModule.
+        archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+        reload_cmds='$CC -r -o $output$reload_objs'
+        hardcode_direct=no
+        ;;
+      motorola)
+        archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+        hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+        ;;
+    esac
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4.3*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_shlibpath_var=no
+    export_dynamic_flag_spec='-Bexport'
+    ;;
+
+  sysv5*)
+    no_undefined_flag=' -z text'
+    # $CC -shared without GNU ld will not create a library from C++
+    # object files and a static libstdc++, better avoid it by now
+    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+		$LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+    hardcode_libdir_flag_spec=
+    hardcode_shlibpath_var=no
+    runpath_var='LD_RUN_PATH'
+    ;;
+
+  uts4*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  dgux*)
+    archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_libdir_flag_spec='-L$libdir'
+    hardcode_shlibpath_var=no
+    ;;
+
+  sysv4*MP*)
+    if test -d /usr/nec; then
+      archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+      hardcode_shlibpath_var=no
+      runpath_var=LD_RUN_PATH
+      hardcode_runpath_var=yes
+      ld_shlibs=yes
+    fi
+    ;;
+
+  sysv4.2uw2*)
+    archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+    hardcode_direct=yes
+    hardcode_minus_L=no
+    hardcode_shlibpath_var=no
+    hardcode_runpath_var=yes
+    runpath_var=LD_RUN_PATH
+    ;;
+
+  sysv5uw7* | unixware7*)
+    no_undefined_flag='${wl}-z ${wl}text'
+    if test "$GCC" = yes; then
+      archive_cmds='$CC -shared ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    else
+      archive_cmds='$CC -G ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
+    fi
+    runpath_var='LD_RUN_PATH'
+    hardcode_shlibpath_var=no
+    ;;
+
+  *)
+    ld_shlibs=no
+    ;;
+  esac
+fi
+echo "$ac_t""$ld_shlibs" 1>&6
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes: set hardcode_action to relink, immediate or unsupported.
+echo $ac_n "checking how to hardcode library paths into programs""... $ac_c" 1>&6
+echo "configure:4601: checking how to hardcode library paths into programs" >&5
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+   test -n "$runpath_var"; then
+
+  # We can hardcode non-existent directories.
+  if test "$hardcode_direct" != no &&
+     # If the only mechanism to avoid hardcoding is shlibpath_var, we
+     # have to relink, otherwise we might link with an installed library
+     # when we should be linking with a yet-to-be-installed one
+     ## test "$hardcode_shlibpath_var" != no &&
+     test "$hardcode_minus_L" != no; then
+    # Linking always hardcodes the temporary library directory.
+    hardcode_action=relink
+  else
+    # We can link without hardcoding, and we can hardcode nonexistent dirs.
+    hardcode_action=immediate
+  fi
+else
+  # We cannot hardcode anything, or else we can only hardcode existing
+  # directories.
+  hardcode_action=unsupported
+fi
+echo "$ac_t""$hardcode_action" 1>&6
+
+striplib=      # set below to "$STRIP --strip-unneeded" only when $STRIP is GNU strip
+old_striplib=  # set below to "$STRIP --strip-debug" only when $STRIP is GNU strip
+echo $ac_n "checking whether stripping libraries is possible""... $ac_c" 1>&6
+echo "configure:4629: checking whether stripping libraries is possible" >&5
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+  echo "$ac_t""yes" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+echo $ac_n "checking dynamic linker characteristics""... $ac_c" 1>&6
+echo "configure:4643: checking dynamic linker characteristics" >&5
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+case $host_os in
+aix3*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix $libname.a'
+  shlibpath_var=LIBPATH
+
+  # AIX has no versioning support, so we append a major version to the name.
+  soname_spec='${libname}${release}.so$major'
+  ;;
+
+aix4* | aix5*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  hardcode_into_libs=yes
+  if test "$host_cpu" = ia64; then
+    # AIX 5 supports IA64
+    library_names_spec='${libname}${release}.so$major ${libname}${release}.so$versuffix $libname.so'
+    shlibpath_var=LD_LIBRARY_PATH
+  else
+    # With GCC up to 2.95.x, collect2 would create an import file
+    # for dependence libraries.  The import file would start with
+    # the line `#! .'.  This would cause the generated library to
+    # depend on `.', always an invalid library.  This was fixed in
+    # development snapshots of GCC prior to 3.0.
+    case $host_os in
+      aix4 | aix4.[01] | aix4.[01].*)
+	if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
+	     echo ' yes '
+	     echo '#endif'; } | ${CC} -E - | grep yes > /dev/null; then
+	  :
+	else
+	  can_build_shared=no
+	fi
+	;;
+    esac
+    # AIX (on Power*) has no versioning support, so currently we can
+    # not hardcode correct soname into executable. Probably we can
+    # add versioning support to collect2, so additional links can
+    # be useful in future.
+    if test "$aix_use_runtimelinking" = yes; then
+      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
+      # instead of lib<name>.a to let people know that these are not
+      # typical AIX shared libraries.
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+    else
+      # We preserve .a as extension for shared libraries through AIX4.2
+      # and later when we are not doing run time linking.
+      library_names_spec='${libname}${release}.a $libname.a'
+      soname_spec='${libname}${release}.so$major'
+    fi
+    shlibpath_var=LIBPATH
+  fi
+  hardcode_into_libs=yes
+  ;;
+
+amigaos*)
+  library_names_spec='$libname.ixlibrary $libname.a'
+  # Create ${libname}_ixlibrary.a entries in /sys/libs.
+  finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+  ;;
+
+beos*)
+  library_names_spec='${libname}.so'
+  dynamic_linker="$host_os ld.so"
+  shlibpath_var=LIBRARY_PATH
+  ;;
+
+bsdi4*)
+  version_type=linux
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+  export_dynamic_flag_spec=-rdynamic
+  # the default ld.so.conf also contains /usr/contrib/lib and
+  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+  # libtool to hard-code these into programs
+  ;;
+
+cygwin* | mingw* | pw32*)
+  version_type=windows
+  need_version=no
+  need_lib_prefix=no
+  case $GCC,$host_os in
+  yes,cygwin*)
+    library_names_spec='$libname.dll.a'
+    soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    postinstall_cmds='dlpath=`bash 2>&1 -c '\''. $dir/${file}i;echo \$dlname'\''`~
+      dldir=$destdir/`dirname \$dlpath`~
+      test -d \$dldir || mkdir -p \$dldir~
+      $install_prog .libs/$dlname \$dldir/$dlname'
+    postuninstall_cmds='dldll=`bash 2>&1 -c '\''. $file; echo \$dlname'\''`~
+      dlpath=$dir/\$dldll~
+       $rm \$dlpath'
+    ;;
+  yes,mingw*)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    sys_lib_search_path_spec=`$CC -print-search-dirs | grep "^libraries:" | sed -e "s/^libraries://" -e "s/;/ /g" -e "s,=/,/,g"`
+    ;;
+  yes,pw32*)  # "lib" prefix becomes "pw"; only literal dots in $release become dashes ([.] rather than ., which matches any character)
+    library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+    ;;
+  *)
+    library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib'
+    ;;
+  esac
+  dynamic_linker='Win32 ld.exe'
+  # FIXME: first we should search . and the directory the executable is in
+  shlibpath_var=PATH
+  ;;
+
+darwin* | rhapsody*)
+  dynamic_linker="$host_os dyld"
+  version_type=darwin
+  need_lib_prefix=no
+  need_version=no
+  # FIXME: Relying on posixy $() will cause problems for
+  #        cross-compilation, but unfortunately the echo tests do not
+  #        yet detect zsh echo's removal of \ escapes.
+  library_names_spec='${libname}${release}${versuffix}.$(test .$module = .yes && echo so || echo dylib) ${libname}${release}${major}.$(test .$module = .yes && echo so || echo dylib) ${libname}.$(test .$module = .yes && echo so || echo dylib)'
+  soname_spec='${libname}${release}${major}.$(test .$module = .yes && echo so || echo dylib)'
+  shlibpath_overrides_runpath=yes
+  shlibpath_var=DYLD_LIBRARY_PATH
+  ;;
+
+freebsd1*)
+  dynamic_linker=no
+  ;;
+
+freebsd*)
+  objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+  version_type=freebsd-$objformat
+  case $version_type in
+    freebsd-elf*)
+      library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+      need_version=no
+      need_lib_prefix=no
+      ;;
+    freebsd-*)
+      library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+      need_version=yes
+      ;;
+  esac
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_os in
+  freebsd2*)
+    shlibpath_overrides_runpath=yes
+    ;;
+  *)
+    shlibpath_overrides_runpath=no
+    hardcode_into_libs=yes
+    ;;
+  esac
+  ;;
+
+gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  hardcode_into_libs=yes
+  ;;
+
+hpux9* | hpux10* | hpux11*)
+  # Give a soname corresponding to the major version so that dld.sl refuses to
+  # link against other versions.
+  dynamic_linker="$host_os dld.sl"
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  shlibpath_var=SHLIB_PATH
+  shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+  library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+  soname_spec='${libname}${release}.sl$major'
+  # HP-UX runs *really* slowly unless shared libraries are mode 555.
+  postinstall_cmds='chmod 555 $lib'
+  ;;
+
+irix5* | irix6* | nonstopux*)
+  case $host_os in
+    nonstopux*) version_type=nonstopux ;;
+    *)          version_type=irix ;;
+  esac
+  need_lib_prefix=no
+  need_version=no
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so $libname.so'
+  case $host_os in
+  irix5* | nonstopux*)
+    libsuff= shlibsuff=
+    ;;
+  *)
+    case $LD in # libtool.m4 will add one of these switches to LD
+    *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+    *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+    *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+    *) libsuff= shlibsuff= libmagic=never-match;;
+    esac
+    ;;
+  esac
+  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+  shlibpath_overrides_runpath=no
+  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+  ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+  dynamic_linker=no
+  ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=no
+  # This implies no fast_install, which is unacceptable.
+  # Some rework will be needed to allow for fast_install
+  # before this can be enabled.
+  hardcode_into_libs=yes
+
+  # We used to test for /lib/ld.so.1 and disable shared libraries on
+  # powerpc, because MkLinux only supported shared libraries with the
+  # GNU dynamic linker.  Since this was broken with cross compilers,
+  # most powerpc-linux boxes support dynamic linking these days and
+  # people can always --disable-shared, the test was removed, and we
+  # assume the GNU/Linux dynamic linker is in use.
+  dynamic_linker='GNU/Linux ld.so'
+
+  # Find out which ABI we are using (multilib Linux x86_64 hack).
+  libsuff=
+  case "$host_cpu" in
+  x86_64*|s390x*)
+    echo '#line 4902 "configure"' > conftest.$ac_ext
+    if { (eval echo configure:4903: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+      case `/usr/bin/file conftest.$ac_objext` in
+      *64-bit*)
+        libsuff=64
+        ;;
+      esac
+    fi
+    rm -rf conftest*
+    ;;
+  *)
+    ;;
+  esac
+  sys_lib_dlsearch_path_spec="/lib${libsuff} /usr/lib${libsuff}"
+  sys_lib_search_path_spec="/lib${libsuff} /usr/lib${libsuff} /usr/local/lib${libsuff}"
+  ;;
+
+netbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+    library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+    dynamic_linker='NetBSD (a.out) ld.so'
+  else
+    library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+    soname_spec='${libname}${release}.so$major'
+    dynamic_linker='NetBSD ld.elf_so'
+  fi
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  ;;
+
+newsos6)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  ;;
+
+openbsd*)
+  version_type=sunos
+  need_lib_prefix=no
+  need_version=no
+  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
+    case "$host_os" in
+    openbsd2.[89] | openbsd2.[89].*)
+      shlibpath_overrides_runpath=no
+      ;;
+    *)
+      shlibpath_overrides_runpath=yes
+      ;;
+    esac
+  else
+    shlibpath_overrides_runpath=yes
+  fi
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+os2*)
+  libname_spec='$name'
+  need_lib_prefix=no
+  library_names_spec='$libname.dll $libname.a'
+  dynamic_linker='OS/2 ld.exe'
+  shlibpath_var=LIBPATH
+  ;;
+
+osf3* | osf4* | osf5*)
+  version_type=osf
+  need_version=no
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+  hardcode_into_libs=yes
+  ;;
+
+sco3.2v5*)
+  version_type=osf
+  soname_spec='${libname}${release}.so$major'
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+solaris*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  hardcode_into_libs=yes
+  # ldd complains unless libraries are executable
+  postinstall_cmds='chmod +x $lib'
+  ;;
+
+sunos4*)
+  version_type=sunos
+  library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+  shlibpath_var=LD_LIBRARY_PATH
+  shlibpath_overrides_runpath=yes
+  if test "$with_gnu_ld" = yes; then
+    need_lib_prefix=no
+  fi
+  need_version=yes
+  ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  case $host_vendor in
+    sni)
+      shlibpath_overrides_runpath=no
+      need_lib_prefix=no
+      export_dynamic_flag_spec='${wl}-Blargedynsym'
+      runpath_var=LD_RUN_PATH
+      ;;
+    siemens)
+      need_lib_prefix=no
+      ;;
+    motorola)
+      need_lib_prefix=no
+      need_version=no
+      shlibpath_overrides_runpath=no
+      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+      ;;
+  esac
+  ;;
+
+uts4*)
+  version_type=linux
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+dgux*)
+  version_type=linux
+  need_lib_prefix=no
+  need_version=no
+  library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+  soname_spec='${libname}${release}.so$major'
+  shlibpath_var=LD_LIBRARY_PATH
+  ;;
+
+sysv4*MP*)
+  if test -d /usr/nec ;then
+    version_type=linux
+    library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+    soname_spec='$libname.so.$major'
+    shlibpath_var=LD_LIBRARY_PATH
+  fi
+  ;;
+
+*)
+  dynamic_linker=no
+  ;;
+esac
+echo "$ac_t""$dynamic_linker" 1>&6
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Report the final consequences.
+echo $ac_n "checking if libtool supports shared libraries""... $ac_c" 1>&6
+echo "configure:5074: checking if libtool supports shared libraries" >&5
+echo "$ac_t""$can_build_shared" 1>&6
+
+echo $ac_n "checking whether to build shared libraries""... $ac_c" 1>&6
+echo "configure:5078: checking whether to build shared libraries" >&5
+test "$can_build_shared" = "no" && enable_shared=no  # cannot build them, so do not try
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC, so a shared build switches the static build off.
+case "$host_os" in
+aix3*)
+  test "$enable_shared" = yes && enable_static=no
+  if test -n "$RANLIB"; then
+    archive_cmds="$archive_cmds~\$RANLIB \$lib"  # append a ranlib step ('~' separates commands)
+    postinstall_cmds='$RANLIB $lib'
+  fi
+  ;;
+
+aix4*)
+  if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
+    test "$enable_shared" = yes && enable_static=no
+  fi
+  ;;
+esac
+echo "$ac_t""$enable_shared" 1>&6
+
+echo $ac_n "checking whether to build static libraries""... $ac_c" 1>&6
+echo "configure:5101: checking whether to build static libraries" >&5
+# At least one library type must be built: force static on when shared is not yes.
+test "$enable_shared" = yes || enable_static=yes
+echo "$ac_t""$enable_static" 1>&6
+
+# Override enable_fast_install where it is unsupported or would gain nothing.
+if test "$hardcode_action" != relink; then
+  if test "$shlibpath_overrides_runpath" = yes || test "$enable_shared" = no; then
+    enable_fast_install=needless   # Fast installation is not necessary
+  fi
+else
+  enable_fast_install=no           # Fast installation is not supported
+fi
+
+# Names of the variables to save for relinking; with GCC its own path variables are added.
+variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
+test "$GCC" != yes ||
+  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
+
+if test "x$enable_dlopen" != xyes; then
+  enable_dlopen=unknown
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+else
+  lt_cv_dlopen=no
+  lt_cv_dlopen_libs=
+
+  case $host_os in
+  beos*)
+    lt_cv_dlopen="load_add_on"
+    lt_cv_dlopen_libs=
+    lt_cv_dlopen_self=yes
+    ;;
+
+  cygwin* | mingw* | pw32*)
+    lt_cv_dlopen="LoadLibrary"
+    lt_cv_dlopen_libs=
+   ;;
+
+  *)
+    echo $ac_n "checking for shl_load""... $ac_c" 1>&6
+echo "configure:5142: checking for shl_load" >&5
+if eval "test \"`echo '$''{'ac_cv_func_shl_load'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 5147 "configure"
+#include "confdefs.h"
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char shl_load(); below.  */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char shl_load();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined (__stub_shl_load) || defined (__stub___shl_load)
+choke me
+#else
+shl_load();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo configure:5170: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_func_shl_load=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_func_shl_load=no"
+fi
+rm -f conftest*
+fi
+
+if eval "test \"`echo '$ac_cv_func_'shl_load`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="shl_load"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6
+echo "configure:5188: checking for shl_load in -ldld" >&5
+ac_lib_var=`echo dld'_'shl_load | sed 'y%./+-%__p_%'`
+if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 5196 "configure"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char shl_load();
+
+int main() {
+shl_load()
+; return 0; }
+EOF
+if { (eval echo configure:5207: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=no"
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"  # library flag, matching the LIBS="-ldld" probe above (was "-dld")
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen""... $ac_c" 1>&6
+echo "configure:5226: checking for dlopen" >&5
+if eval "test \"`echo '$''{'ac_cv_func_dlopen'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  cat > conftest.$ac_ext <<EOF
+#line 5231 "configure"
+#include "confdefs.h"
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char dlopen(); below.  */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char dlopen();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined (__stub_dlopen) || defined (__stub___dlopen)
+choke me
+#else
+dlopen();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo configure:5254: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_func_dlopen=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_func_dlopen=no"
+fi
+rm -f conftest*
+fi
+
+if eval "test \"`echo '$ac_cv_func_'dlopen`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6
+echo "configure:5272: checking for dlopen in -ldl" >&5
+ac_lib_var=`echo dl'_'dlopen | sed 'y%./+-%__p_%'`
+if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldl  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 5280 "configure"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo configure:5291: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=no"
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen in -lsvld""... $ac_c" 1>&6
+echo "configure:5310: checking for dlopen in -lsvld" >&5
+ac_lib_var=`echo svld'_'dlopen | sed 'y%./+-%__p_%'`
+if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-lsvld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 5318 "configure"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo configure:5329: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=no"
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"
+else
+  echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6
+echo "configure:5348: checking for dld_link in -ldld" >&5
+ac_lib_var=`echo dld'_'dld_link | sed 'y%./+-%__p_%'`
+if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  ac_save_LIBS="$LIBS"
+LIBS="-ldld  $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 5356 "configure"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error.  */
+/* We use char because int might match the return type of a gcc2
+    builtin and then its argument prototype would still apply.  */
+char dld_link();
+
+int main() {
+dld_link()
+; return 0; }
+EOF
+if { (eval echo configure:5367: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=yes"
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.$ac_ext >&5
+  rm -rf conftest*
+  eval "ac_cv_lib_$ac_lib_var=no"
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then
+  echo "$ac_t""yes" 1>&6
+  lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"  # library flag, matching the LIBS="-ldld" probe above (was "-dld")
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+	      
+fi
+
+	    
+fi
+
+	  
+fi
+
+	
+fi
+
+      
+fi
+
+    ;;
+  esac
+
+  if test "x$lt_cv_dlopen" != xno; then
+    enable_dlopen=yes
+  else
+    enable_dlopen=no
+  fi
+
+  case $lt_cv_dlopen in
+  dlopen)
+    save_CPPFLAGS="$CPPFLAGS"
+        test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+
+    save_LDFLAGS="$LDFLAGS"
+    eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+
+    save_LIBS="$LIBS"
+    LIBS="$lt_cv_dlopen_libs $LIBS"
+
+    echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6
+echo "configure:5423: checking whether a program can dlopen itself" >&5
+if eval "test \"`echo '$''{'lt_cv_dlopen_self'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  	  if test "$cross_compiling" = yes; then :
+  lt_cv_dlopen_self=cross
+else
+    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<EOF
+#line 5433 "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+#ifdef __cplusplus
+extern "C" void exit (int);
+#endif
+
+void fnord() { int i=42;}
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);  /* null file name: handle for the running program */
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;   /* found under its plain name */
+      else if (dlsym( self,"_fnord")) status = $lt_dlneed_uscore; /* found only with a leading underscore */
+      /* dlclose (self); */
+    }
+
+  return status;  /* not exit(): stdlib.h is not included, so exit would be undeclared in C */
+}
+EOF
+  if { (eval echo configure:5494: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) lt_cv_dlopen_self=yes ;;    # fnord found under its plain name
+      x$lt_dlneed_uscore) lt_cv_dlopen_self=yes ;;  # fnord found with a leading underscore
+      x$lt_dlunknown|x*) lt_cv_dlopen_self=no ;;    # was x$lt_unknown, a variable that is never set
+    esac
+  else :
+    # compilation failed
+    lt_cv_dlopen_self=no
+  fi
+fi
+rm -fr conftest*
+
+    
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self" 1>&6
+
+    if test "x$lt_cv_dlopen_self" = xyes; then
+      LDFLAGS="$LDFLAGS $link_static_flag"
+      echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6
+echo "configure:5517: checking whether a statically linked program can dlopen itself" >&5
+if eval "test \"`echo '$''{'lt_cv_dlopen_self_static'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  	  if test "$cross_compiling" = yes; then :
+  lt_cv_dlopen_self_static=cross
+else
+    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+  lt_status=$lt_dlunknown
+  cat > conftest.$ac_ext <<EOF
+#line 5527 "configure"
+#include "confdefs.h"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+#  define LT_DLGLOBAL		RTLD_GLOBAL
+#else
+#  ifdef DL_GLOBAL
+#    define LT_DLGLOBAL		DL_GLOBAL
+#  else
+#    define LT_DLGLOBAL		0
+#  endif
+#endif
+
+/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
+   find out it does not work in some platform. */
+#ifndef LT_DLLAZY_OR_NOW
+#  ifdef RTLD_LAZY
+#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
+#  else
+#    ifdef DL_LAZY
+#      define LT_DLLAZY_OR_NOW		DL_LAZY
+#    else
+#      ifdef RTLD_NOW
+#        define LT_DLLAZY_OR_NOW	RTLD_NOW
+#      else
+#        ifdef DL_NOW
+#          define LT_DLLAZY_OR_NOW	DL_NOW
+#        else
+#          define LT_DLLAZY_OR_NOW	0
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+
+#ifdef __cplusplus
+extern "C" void exit (int);
+#endif
+
+void fnord() { int i=42;}
+int main ()
+{
+  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);  /* null file name: handle for the running program */
+  int status = $lt_dlunknown;
+
+  if (self)
+    {
+      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;   /* found under its plain name */
+      else if (dlsym( self,"_fnord")) status = $lt_dlneed_uscore; /* found only with a leading underscore */
+      /* dlclose (self); */
+    }
+
+  return status;  /* not exit(): stdlib.h is not included, so exit would be undeclared in C */
+}
+EOF
+  if { (eval echo configure:5588: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} 2>/dev/null; then
+    (./conftest; exit; ) 2>/dev/null
+    lt_status=$?
+    case x$lt_status in
+      x$lt_dlno_uscore) lt_cv_dlopen_self_static=yes ;;    # fnord found under its plain name
+      x$lt_dlneed_uscore) lt_cv_dlopen_self_static=yes ;;  # fnord found with a leading underscore
+      x$lt_dlunknown|x*) lt_cv_dlopen_self_static=no ;;    # was x$lt_unknown, a variable that is never set
+    esac
+  else :
+    # compilation failed
+    lt_cv_dlopen_self_static=no
+  fi
+fi
+rm -fr conftest*
+
+      
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6
+    fi
+
+    CPPFLAGS="$save_CPPFLAGS"
+    LDFLAGS="$save_LDFLAGS"
+    LIBS="$save_LIBS"
+    ;;
+  esac
+
+  # Only a definite yes/no is carried over; any other probe value becomes unknown.
+  enable_dlopen_self=unknown
+  enable_dlopen_self_static=unknown
+  case $lt_cv_dlopen_self in
+  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+  esac
+  case $lt_cv_dlopen_self_static in
+  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+  esac
+fi
+
+
+if test "$enable_shared" = yes && test "$GCC" = yes; then
+  case $archive_cmds in
+  *'~'*)
+    # FIXME: we may have to deal with multi-command sequences.
+    ;;
+  '$CC '*)
+    # Test whether the compiler implicitly links with -lc since on some
+    # systems, -lgcc has to come before -lc. If gcc already passes -lc
+    # to ld, don't add -lc before -lgcc.
+    echo $ac_n "checking whether -lc should be explicitly linked in""... $ac_c" 1>&6
+echo "configure:5637: checking whether -lc should be explicitly linked in" >&5
+    if eval "test \"`echo '$''{'lt_cv_archive_cmds_need_lc'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  $rm conftest*
+    echo 'static int dummy;' > conftest.$ac_ext  # minimal source file to compile
+
+    if { (eval echo configure:5644: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+      soname=conftest
+      lib=conftest
+      libobjs=conftest.$ac_objext
+      deplibs=
+      wl=$lt_cv_prog_cc_wl
+      compiler_flags=-v  # presumably makes the link verbose so " -lc " can be searched for below
+      linker_flags=-v
+      verstring=
+      output_objdir=.
+      libname=conftest
+      save_allow_undefined_flag=$allow_undefined_flag
+      allow_undefined_flag=  # cleared for the trial link, restored afterwards
+      if { (eval echo configure:5657: \"$archive_cmds 2\>\&1 \| grep \" -lc \" \>/dev/null 2\>\&1\") 1>&5; (eval $archive_cmds 2\>\&1 \| grep \" -lc \" \>/dev/null 2\>\&1) 2>&5; }
+      then
+	lt_cv_archive_cmds_need_lc=no  # " -lc " already shows up in the link output
+      else
+	lt_cv_archive_cmds_need_lc=yes
+      fi
+      allow_undefined_flag=$save_allow_undefined_flag
+    else
+      cat conftest.err 1>&5
+    fi
+    $rm conftest*
+fi
+
+    echo "$ac_t""$lt_cv_archive_cmds_need_lc" 1>&6
+    ;;
+  esac
+fi
+need_lc=${lt_cv_archive_cmds_need_lc-yes}  # yes when the check above left the variable unset
+
+# The second clause should only fire when bootstrapping the
+# libtool distribution, otherwise you forgot to ship ltmain.sh
+# with your package, and you will get complaints that there are
+# no rules to generate ltmain.sh.
+if test ! -f "$ltmain"; then
+  # If there is no Makefile yet, we rely on a make rule to execute
+  # `config.status --recheck' to rerun these tests and create the
+  # libtool script then.
+  if test -f Makefile; then
+    make "$ltmain"
+  fi
+fi
+
+if test -f "$ltmain"; then
+  trap "$rm \"${ofile}T\"; exit 1" 1 2 15
+  $rm -f "${ofile}T"
+
+  echo creating $ofile
+
+  # Now quote all the things that may contain metacharacters while being
+  # careful not to overquote the AC_SUBSTed values.  We take copies of the
+  # variables and quote the copies for generation of the libtool script.
+  for var in echo old_CC old_CFLAGS SED \
+    AR AR_FLAGS CC LD LN_S NM SHELL \
+    reload_flag reload_cmds wl \
+    pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+    thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+    library_names_spec soname_spec \
+    RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+    old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+    postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+    old_striplib striplib file_magic_cmd export_symbols_cmds \
+    deplibs_check_method allow_undefined_flag no_undefined_flag \
+    finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+    global_symbol_to_c_name_address \
+    hardcode_libdir_flag_spec hardcode_libdir_separator  \
+    sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+    compiler_c_o compiler_o_lo need_locks exclude_expsyms include_expsyms; do
+
+    case $var in
+    reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+    old_postinstall_cmds | old_postuninstall_cmds | \
+    export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+    extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+    postinstall_cmds | postuninstall_cmds | \
+    finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+      # Double-quote double-evaled strings.
+      eval "lt_$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\""
+      ;;
+    *)
+      eval "lt_$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\""
+      ;;
+    esac
+  done
+
+  cat <<__EOF__ > "${ofile}T"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A sed that does not truncate output.
+SED=$lt_SED
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="${SED} -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+# ### BEGIN LIBTOOL CONFIG
+
+# Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+
+# Shell to use when invoking shell scripts.
+SHELL=$lt_SHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to add -lc for building shared libraries.
+build_libtool_need_lc=$need_lc
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$lt_echo
+
+# The archiver.
+AR=$lt_AR
+AR_FLAGS=$lt_AR_FLAGS
+
+# The default C compiler.
+CC=$lt_CC
+
+# Is the compiler the GNU C compiler?
+with_gcc=$GCC
+
+# The linker used to build libraries.
+LD=$lt_LD
+
+# Whether we need hard or soft links.
+LN_S=$lt_LN_S
+
+# A BSD-compatible nm program.
+NM=$lt_NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC_CMD=$MAGIC_CMD
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$lt_reload_flag
+reload_cmds=$lt_reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$lt_wl
+
+# Object file suffix (normally "o").
+objext="$ac_objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$lt_pic_flag
+pic_mode=$pic_mode
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$lt_compiler_c_o
+
+# Can we write directly to a .lo ?
+compiler_o_lo=$lt_compiler_o_lo
+
+# Must we lock files when doing compilation ?
+need_locks=$lt_need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$lt_link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$lt_no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$lt_export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$lt_whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$lt_thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$lt_libname_spec
+
+# List of archive names.  First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$lt_library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$lt_soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$lt_RANLIB
+old_archive_cmds=$lt_old_archive_cmds
+old_postinstall_cmds=$lt_old_postinstall_cmds
+old_postuninstall_cmds=$lt_old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$lt_old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$lt_old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$lt_archive_cmds
+archive_expsym_cmds=$lt_archive_expsym_cmds
+postinstall_cmds=$lt_postinstall_cmds
+postuninstall_cmds=$lt_postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$lt_old_striplib
+striplib=$lt_striplib
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$lt_deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$lt_file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$lt_allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$lt_no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$lt_finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$lt_finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$lt_global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$lt_global_symbol_to_cdecl
+
+# Transform the output of nm in a C name address pair
+global_symbol_to_c_name_address=$lt_global_symbol_to_c_name_address
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$lt_hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$lt_hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Variables whose values should be saved in libtool wrapper scripts and
+# restored at relink time.
+variables_saved_for_relink="$variables_saved_for_relink"
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$lt_sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$lt_sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$lt_export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$lt_extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$lt_exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$lt_include_expsyms
+
+# ### END LIBTOOL CONFIG
+
+__EOF__
+
+  case $host_os in
+  aix3*)
+    cat <<\EOF >> "${ofile}T"
+
+# AIX sometimes has problems with the GCC collect2 program.  For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+  COLLECT_NAMES=
+  export COLLECT_NAMES
+fi
+EOF
+    ;;
+  esac
+
+  case $host_os in
+  cygwin* | mingw* | pw32* | os2*)
+    cat <<'EOF' >> "${ofile}T"
+      # This is a source program that is used to create dlls on Windows
+      # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# #  ifdef __CYGWIN32__
+# #    define __CYGWIN__ __CYGWIN32__
+# #  endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+#   __hDllInstance_base = hInst;
+#   return TRUE;
+# }
+# /* ltdll.c ends here */
+	# This is a source program that is used to create import libraries
+	# on Windows for dlls which lack them. Don't remove nor modify the
+	# starting and closing comments
+# /* impgen.c starts here */
+# /*   Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+#  This file is part of GNU libtool.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#  */
+#
+# #include <stdio.h>		/* for printf() */
+# #include <unistd.h>		/* for open(), lseek(), read() */
+# #include <fcntl.h>		/* for O_RDONLY, O_BINARY */
+# #include <string.h>		/* for strdup() */
+#
+# /* O_BINARY isn't required (or even defined sometimes) under Unix */
+# #ifndef O_BINARY
+# #define O_BINARY 0
+# #endif
+#
+# static unsigned int
+# pe_get16 (fd, offset)
+#      int fd;
+#      int offset;
+# {
+#   unsigned char b[2];
+#   lseek (fd, offset, SEEK_SET);
+#   read (fd, b, 2);
+#   return b[0] + (b[1]<<8);
+# }
+#
+# static unsigned int
+# pe_get32 (fd, offset)
+#     int fd;
+#     int offset;
+# {
+#   unsigned char b[4];
+#   lseek (fd, offset, SEEK_SET);
+#   read (fd, b, 4);
+#   return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# static unsigned int
+# pe_as32 (ptr)
+#      void *ptr;
+# {
+#   unsigned char *b = ptr;
+#   return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# int
+# main (argc, argv)
+#     int argc;
+#     char *argv[];
+# {
+#     int dll;
+#     unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+#     unsigned long export_rva, export_size, nsections, secptr, expptr;
+#     unsigned long name_rvas, nexp;
+#     unsigned char *expdata, *erva;
+#     char *filename, *dll_name;
+#
+#     filename = argv[1];
+#
+#     dll = open(filename, O_RDONLY|O_BINARY);
+#     if (dll < 1)
+# 	return 1;
+#
+#     dll_name = filename;
+#
+#     for (i=0; filename[i]; i++)
+# 	if (filename[i] == '/' || filename[i] == '\\'  || filename[i] == ':')
+# 	    dll_name = filename + i +1;
+#
+#     pe_header_offset = pe_get32 (dll, 0x3c);
+#     opthdr_ofs = pe_header_offset + 4 + 20;
+#     num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+#     if (num_entries < 1) /* no exports */
+# 	return 1;
+#
+#     export_rva = pe_get32 (dll, opthdr_ofs + 96);
+#     export_size = pe_get32 (dll, opthdr_ofs + 100);
+#     nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+#     secptr = (pe_header_offset + 4 + 20 +
+# 	      pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+#     expptr = 0;
+#     for (i = 0; i < nsections; i++)
+#     {
+# 	char sname[8];
+# 	unsigned long secptr1 = secptr + 40 * i;
+# 	unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+# 	unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+# 	unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+# 	lseek(dll, secptr1, SEEK_SET);
+# 	read(dll, sname, 8);
+# 	if (vaddr <= export_rva && vaddr+vsize > export_rva)
+# 	{
+# 	    expptr = fptr + (export_rva - vaddr);
+# 	    if (export_rva + export_size > vaddr + vsize)
+# 		export_size = vsize - (export_rva - vaddr);
+# 	    break;
+# 	}
+#     }
+#
+#     expdata = (unsigned char*)malloc(export_size);
+#     lseek (dll, expptr, SEEK_SET);
+#     read (dll, expdata, export_size);
+#     erva = expdata - export_rva;
+#
+#     nexp = pe_as32 (expdata+24);
+#     name_rvas = pe_as32 (expdata+32);
+#
+#     printf ("EXPORTS\n");
+#     for (i = 0; i<nexp; i++)
+#     {
+# 	unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+# 	printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+#     }
+#
+#     return 0;
+# }
+# /* impgen.c ends here */
+
+EOF
+    ;;
+  esac
+
+  # We use sed instead of cat because bash on DJGPP gets confused if
+  # it finds mixed CR/LF and LF-only lines.  Since sed operates in
+  # text mode, it properly converts lines to CR/LF.  This bash problem
+  # is reportedly fixed, but why not run on old versions too?
+  sed '$q' "$ltmain" >> "${ofile}T" || (rm -f "${ofile}T"; exit 1)
+
+  mv -f "${ofile}T" "$ofile" || \
+    (rm -f "$ofile" && cp "${ofile}T" "$ofile" && rm -f "${ofile}T")
+  chmod +x "$ofile"
+fi
+
+
+
+
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+
+# Prevent multiple expansion
+
+
+fi
+# if libtool >= 1.5
+TAGCC=
+
+
 # Select memory manager depending on user input.
 # If no "-enable-maxmem", use jmemnobs
 MEMORYMGR='jmemnobs.$(O)'
@@ -1592,16 +6273,16 @@
 EOF
 
 echo $ac_n "checking for 'tmpfile()'""... $ac_c" 1>&6
-echo "configure:1596: checking for 'tmpfile()'" >&5
+echo "configure:6277: checking for 'tmpfile()'" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1598 "configure"
+#line 6279 "configure"
 #include "confdefs.h"
 #include <stdio.h>
 int main() {
  FILE * tfile = tmpfile(); 
 ; return 0; }
 EOF
-if { (eval echo configure:1605: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+if { (eval echo configure:6286: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
   rm -rf conftest*
   echo "$ac_t""yes" 1>&6
 MEMORYMGR='jmemansi.$(O)'
@@ -1616,16 +6297,16 @@
 EOF
 
 echo $ac_n "checking for 'mktemp()'""... $ac_c" 1>&6
-echo "configure:1620: checking for 'mktemp()'" >&5
+echo "configure:6301: checking for 'mktemp()'" >&5
 cat > conftest.$ac_ext <<EOF
-#line 1622 "configure"
+#line 6303 "configure"
 #include "confdefs.h"
 
 int main() {
  char fname[80]; mktemp(fname); 
 ; return 0; }
 EOF
-if { (eval echo configure:1629: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+if { (eval echo configure:6310: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
   rm -rf conftest*
   echo "$ac_t""yes" 1>&6
 else
@@ -1644,11 +6325,354 @@
 fi
 
 
-# Extract the library version ID from jpeglib.h.
-echo $ac_n "checking libjpeg version number""... $ac_c" 1>&6
-echo "configure:1650: checking libjpeg version number" >&5
-JPEG_LIB_VERSION=`sed -e '/^#define JPEG_LIB_VERSION/!d' -e 's/^[^0-9]*\([0-9][0-9]*\).*$/\1/' $srcdir/jpeglib.h`
-echo "$ac_t""$JPEG_LIB_VERSION" 1>&6
+
+echo $ac_n "checking to see if the host cpu type is i386 or compatible""... $ac_c" 1>&6
+echo "configure:6331: checking to see if the host cpu type is i386 or compatible" >&5
+case "$host_cpu" in
+  i*86 | x86 | ia32)
+    echo "$ac_t""yes" 1>&6
+  ;;
+  x86_64 | amd64 | aa64)
+    echo "$ac_t""no (x86_64)" 1>&6
+    { echo "configure: error: Currently, this version of JPEG library cannot be compiled as 64-bit code. sorry." 1>&2; exit 1; }
+  ;;
+  *)
+    echo "$ac_t""no ("$host_cpu")" 1>&6
+    { echo "configure: error: This version of JPEG library is for i386 or compatible processors only." 1>&2; exit 1; }
+  ;;
+esac
+
+if test -z "$NAFLAGS" ; then
+  echo $ac_n "checking for object file format of host system""... $ac_c" 1>&6
+echo "configure:6348: checking for object file format of host system" >&5
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      objfmt='Win32-COFF'
+    ;;
+    msdosdjgpp* | go32*)
+      objfmt='COFF'
+    ;;
+    os2-emx*)			# not tested
+      objfmt='MSOMF'		# obj
+    ;;
+    linux*coff* | linux*oldld*)
+      objfmt='COFF'		# ???
+    ;;
+    linux*aout*)
+      objfmt='a.out'
+    ;;
+    linux*)
+      objfmt='ELF'
+    ;;
+    freebsd* | netbsd* | openbsd*)
+      if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+        objfmt='BSD-a.out'
+      else
+        objfmt='ELF'
+      fi
+    ;;
+    solaris* | sunos* | sysv* | sco*)
+      objfmt='ELF'
+    ;;
+    darwin* | rhapsody* | nextstep* | openstep* | macos*)
+      objfmt='Mach-O'
+    ;;
+    *)
+      objfmt='ELF ?'
+    ;;
+  esac
+  echo "$ac_t""$objfmt" 1>&6
+  if test "$objfmt" = 'ELF ?'; then
+    objfmt='ELF'
+    echo "configure: warning: unexpected host system. assumed that the format is $objfmt." 1>&2
+  fi
+else
+  objfmt=''
+fi
+echo $ac_n "checking for object file format specifier (NAFLAGS) ""... $ac_c" 1>&6
+echo "configure:6394: checking for object file format specifier (NAFLAGS) " >&5
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+esac
+echo "$ac_t""$NAFLAGS" 1>&6
+
+
+
+for ac_prog in nasm nasmw
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
+echo "configure:6414: checking for $ac_word" >&5
+if eval "test \"`echo '$''{'ac_cv_prog_NASM'+set}'`\" = set"; then
+  echo $ac_n "(cached) $ac_c" 1>&6
+else
+  if test -n "$NASM"; then
+  ac_cv_prog_NASM="$NASM" # Let the user override the test.
+else
+  IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS=":"
+  ac_dummy="$PATH"
+  for ac_dir in $ac_dummy; do
+    test -z "$ac_dir" && ac_dir=.
+    if test -f $ac_dir/$ac_word; then
+      ac_cv_prog_NASM="$ac_prog"
+      break
+    fi
+  done
+  IFS="$ac_save_ifs"
+fi
+fi
+NASM="$ac_cv_prog_NASM"
+if test -n "$NASM"; then
+  echo "$ac_t""$NASM" 1>&6
+else
+  echo "$ac_t""no" 1>&6
+fi
+
+test -n "$NASM" && break
+done
+
+test -z "$NASM" && { echo "configure: error: no nasm (Netwide Assembler) found in \$PATH" 1>&2; exit 1; }
+if echo "$NASM" | grep yasm > /dev/null; then
+  echo "configure: warning: DON'T USE YASM! CURRENT VERSION (R0.4.0) IS BUGGY!" 1>&2
+fi
+
+echo $ac_n "checking whether the assembler ($NASM $NAFLAGS) works""... $ac_c" 1>&6
+echo "configure:6449: checking whether the assembler ($NASM $NAFLAGS) works" >&5
+cat > conftest.asm <<EOF
+%line 6451 "configure"
+        section .text
+        bits    32
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if { (eval echo configure:6460: \"$try_nasm\") 1>&5; (eval $try_nasm) 2>&5; } && test -s conftest.o; then
+  echo "$ac_t""yes" 1>&6
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.asm >&5
+  rm -rf conftest*
+  echo "$ac_t""no" 1>&6
+  { echo "configure: error: installation or configuration problem: assembler cannot create object files." 1>&2; exit 1; }
+fi
+echo $ac_n "checking whether the linker accepts assembler output""... $ac_c" 1>&6
+echo "configure:6470: checking whether the linker accepts assembler output" >&5
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&5'
+if { (eval echo configure:6472: \"$try_nasm\") 1>&5; (eval $try_nasm) 2>&5; } && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  echo "$ac_t""yes" 1>&6
+else
+  rm -rf conftest*
+  echo "$ac_t""no" 1>&6
+  { echo "configure: error: configuration problem: maybe object file format mismatch." 1>&2; exit 1; }
+fi
+
+echo $ac_n "checking whether the assembler supports line continuation character""... $ac_c" 1>&6
+echo "configure:6482: checking whether the assembler supports line continuation character" >&5
+cat > conftest.asm <<\EOF
+%line 6484 "configure"
+; The line continuation character '\'
+; was introduced in nasm 0.98.25.
+        section .text
+        bits    32
+        global  _zero
+_zero:  xor     \
+                eax,eax
+        ret
+EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if { (eval echo configure:6495: \"$try_nasm\") 1>&5; (eval $try_nasm) 2>&5; } && test -s conftest.o; then
+  rm -rf conftest*
+  echo "$ac_t""yes" 1>&6
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.asm >&5
+  rm -rf conftest*
+  echo "$ac_t""no" 1>&6
+  { echo "configure: error: you have to use a more recent version of the assembler." 1>&2; exit 1; }
+fi
+
+
+echo $ac_n "checking SIMD instruction sets requested to use""... $ac_c" 1>&6
+echo "configure:6508: checking SIMD instruction sets requested to use" >&5
+simd_to_use=""
+
+# Check whether --enable-mmx or --disable-mmx was given.
+if test "${enable_mmx+set}" = set; then
+  enableval="$enable_mmx"
+  if test "x$enableval" = xno; then
+  cat >> confdefs.h <<\EOF
+#define JSIMD_MMX_NOT_SUPPORTED 
+EOF
+
+else
+  simd_to_use="$simd_to_use MMX"
+fi
+else
+  simd_to_use="$simd_to_use MMX"
+fi
+
+
+# Check whether --enable-3dnow or --disable-3dnow was given.
+if test "${enable_3dnow+set}" = set; then
+  enableval="$enable_3dnow"
+  if test "x$enableval" = xno; then
+  cat >> confdefs.h <<\EOF
+#define JSIMD_3DNOW_NOT_SUPPORTED 
+EOF
+
+else
+  simd_to_use="$simd_to_use 3DNow!"
+fi
+else
+  simd_to_use="$simd_to_use 3DNow!"
+fi
+
+
+# Check whether --enable-sse or --disable-sse was given.
+if test "${enable_sse+set}" = set; then
+  enableval="$enable_sse"
+  if test "x$enableval" = xno; then
+  cat >> confdefs.h <<\EOF
+#define JSIMD_SSE_NOT_SUPPORTED 
+EOF
+
+else
+  simd_to_use="$simd_to_use SSE"
+fi
+else
+  simd_to_use="$simd_to_use SSE"
+fi
+
+
+# Check whether --enable-sse2 or --disable-sse2 was given.
+if test "${enable_sse2+set}" = set; then
+  enableval="$enable_sse2"
+  if test "x$enableval" = xno; then
+  cat >> confdefs.h <<\EOF
+#define JSIMD_SSE2_NOT_SUPPORTED 
+EOF
+
+else
+  simd_to_use="$simd_to_use SSE2"
+fi
+else
+  simd_to_use="$simd_to_use SSE2"
+fi
+
+
+test -z "$simd_to_use" && simd_to_use="NONE"
+echo "$ac_t""$simd_to_use" 1>&6
+
+for simd_name in $simd_to_use; do
+case "$simd_name" in
+  MMX)    simd_instruction='psubw mm0,mm0';;
+  3DNow!) simd_instruction='pfsub mm0,mm0';;
+  SSE)    simd_instruction='subps xmm0,xmm0';;
+  SSE2)   simd_instruction='subpd xmm0,xmm0';;
+  *)      continue;;
+esac
+echo $ac_n "checking whether the assembler supports $simd_name instructions""... $ac_c" 1>&6
+echo "configure:6587: checking whether the assembler supports $simd_name instructions" >&5
+cat > conftest.asm <<EOF
+%line 6589 "configure"
+        section .text
+        bits    32
+        global  _simd
+_simd:  $simd_instruction
+        ret
+EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if { (eval echo configure:6597: \"$try_nasm\") 1>&5; (eval $try_nasm) 2>&5; } && test -s conftest.o; then
+  rm -rf conftest*
+  echo "$ac_t""yes" 1>&6
+else
+  echo "configure: failed program was:" >&5
+  cat conftest.asm >&5
+  rm -rf conftest*
+  echo "$ac_t""no" 1>&6
+  { echo "configure: error: you have to use a more recent version of the assembler." 1>&2; exit 1; }
+fi
+done
+
+# Select OS-dependent SIMD instruction support checker.
+# jsimdw32.$(O) (Win32) / jsimddjg.$(O) (DJGPP V.2) / jsimdgcc.$(O) (Unix/gcc)
+if test "x$SIMDCHECKER" = x ; then
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      SIMDCHECKER='jsimdw32.$(O)'
+    ;;
+    msdosdjgpp* | go32*)
+      SIMDCHECKER='jsimddjg.$(O)'
+    ;;
+    os2-emx*)			# not tested
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+    *)
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+  esac
+fi
+
+
+case "$host_os" in
+  cygwin* | mingw* | pw32* | os2-emx* | msdosdjgpp* | go32*)
+    cat >> confdefs.h <<\EOF
+#define USE_SETMODE 
+EOF
+
+  ;;
+# _host_name_*)
+#   AC_DEFINE([USE_FDOPEN],)
+# ;;
+esac
+
+# This is for UNIX-like environments on Windows platform.
+# Check whether --enable-uchar-boolean or --disable-uchar-boolean was given.
+if test "${enable_uchar_boolean+set}" = set; then
+  enableval="$enable_uchar_boolean"
+  if test "x$enableval" != xno; then
+  cat >> confdefs.h <<\EOF
+#define TYPEDEF_UCHAR_BOOLEAN 
+EOF
+
+fi
+fi
+
+
+
+JPEG_LIB_VERSION="63:0:1"
+confv_dirs="$srcdir $srcdir/.. $srcdir/../.."
+config_ver=
+for ac_dir in $confv_dirs; do
+  if test -r $ac_dir/config.ver; then
+    config_ver=$ac_dir/config.ver
+    break
+  fi
+done
+if test -z "$config_ver"; then
+  echo "configure: warning: cannot find config.ver in $confv_dirs" 1>&2
+  echo "configure: warning: default version number $JPEG_LIB_VERSION is used" 1>&2
+  echo $ac_n "checking libjpeg version number for libtool""... $ac_c" 1>&6
+echo "configure:6668: checking libjpeg version number for libtool" >&5
+  echo "$ac_t""$JPEG_LIB_VERSION" 1>&6
+else
+  echo $ac_n "checking libjpeg version number for libtool""... $ac_c" 1>&6
+echo "configure:6672: checking libjpeg version number for libtool" >&5
+  . $config_ver
+  echo "$ac_t""$JPEG_LIB_VERSION" 1>&6
+  echo "configure: if you want to change the version number, modify $config_ver" 1>&2
+fi
 
 
 # Prepare to massage makefile.cfg correctly.
@@ -1675,12 +6699,15 @@
   COM_LT="# "
 fi
 
-if test "x$LTSHARED" != xno; then
+if test "x$enable_shared" != xno; then
   FORCE_INSTALL_LIB="install-lib"
+  UNINSTALL_LIB="uninstall-lib"
 else
   FORCE_INSTALL_LIB=""
+  UNINSTALL_LIB=""
 fi
 
+
 # Set up -I directives
 if test "x$srcdir" = x.; then
   INCLUDEFLAGS='-I$(srcdir)'
@@ -1689,6 +6716,52 @@
 fi
 
 trap '' 1 2 15
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs.  It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already.  You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+  case `(ac_space=' '; set | grep ac_space) 2>&1` in
+  *ac_space=\ *)
+    # `set' does not quote correctly, so add quotes (double-quote substitution
+    # turns \\\\ into \\, and sed turns \\ into \).
+    sed -n \
+      -e "s/'/'\\\\''/g" \
+      -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+    ;;
+  *)
+    # `set' quotes correctly as required by POSIX, so do not add quotes.
+    sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+    ;;
+  esac >> confcache
+if cmp -s $cache_file confcache; then
+  :
+else
+  if test -w $cache_file; then
+    echo "updating cache $cache_file"
+    cat confcache > $cache_file
+  else
+    echo "not updating unwritable cache $cache_file"
+  fi
+fi
+rm -f confcache
 
 trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
 
@@ -1732,7 +6805,7 @@
     echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion"
     exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
   -version | --version | --versio | --versi | --vers | --ver | --ve | --v)
-    echo "$CONFIG_STATUS generated by autoconf version 2.12"
+    echo "$CONFIG_STATUS generated by autoconf version 2.13"
     exit 0 ;;
   -help | --help | --hel | --he | --h)
     echo "\$ac_cs_usage"; exit 0 ;;
@@ -1752,9 +6825,11 @@
  s/@@/%@/; s/@@/@%/; s/@g\$/%g/' > conftest.subs <<\\CEOF
 $ac_vpsub
 $extrasub
+s%@SHELL@%$SHELL%g
 s%@CFLAGS@%$CFLAGS%g
 s%@CPPFLAGS@%$CPPFLAGS%g
 s%@CXXFLAGS@%$CXXFLAGS%g
+s%@FFLAGS@%$FFLAGS%g
 s%@DEFS@%$DEFS%g
 s%@LDFLAGS@%$LDFLAGS%g
 s%@LIBS@%$LIBS%g
@@ -1776,20 +6851,45 @@
 s%@CC@%$CC%g
 s%@CPP@%$CPP%g
 s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g
+s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%g
 s%@INSTALL_DATA@%$INSTALL_DATA%g
 s%@RANLIB@%$RANLIB%g
+s%@host@%$host%g
+s%@host_alias@%$host_alias%g
+s%@host_cpu@%$host_cpu%g
+s%@host_vendor@%$host_vendor%g
+s%@host_os@%$host_os%g
+s%@EXEEXT@%$EXEEXT%g
 s%@LIBTOOL@%$LIBTOOL%g
 s%@O@%$O%g
 s%@A@%$A%g
 s%@LN@%$LN%g
 s%@INSTALL_LIB@%$INSTALL_LIB%g
+s%@UNINSTALL@%$UNINSTALL%g
+s%@build@%$build%g
+s%@build_alias@%$build_alias%g
+s%@build_cpu@%$build_cpu%g
+s%@build_vendor@%$build_vendor%g
+s%@build_os@%$build_os%g
+s%@LN_S@%$LN_S%g
+s%@OBJEXT@%$OBJEXT%g
+s%@ECHO@%$ECHO%g
+s%@STRIP@%$STRIP%g
+s%@DLLTOOL@%$DLLTOOL%g
+s%@AS@%$AS%g
+s%@OBJDUMP@%$OBJDUMP%g
+s%@TAGCC@%$TAGCC%g
 s%@MEMORYMGR@%$MEMORYMGR%g
+s%@NAFLAGS@%$NAFLAGS%g
+s%@NASM@%$NASM%g
+s%@SIMDCHECKER@%$SIMDCHECKER%g
 s%@JPEG_LIB_VERSION@%$JPEG_LIB_VERSION%g
 s%@A2K_DEPS@%$A2K_DEPS%g
 s%@COM_A2K@%$COM_A2K%g
 s%@ANSI2KNRFLAGS@%$ANSI2KNRFLAGS%g
 s%@COM_LT@%$COM_LT%g
 s%@FORCE_INSTALL_LIB@%$FORCE_INSTALL_LIB%g
+s%@UNINSTALL_LIB@%$UNINSTALL_LIB%g
 s%@INCLUDEFLAGS@%$INCLUDEFLAGS%g
 
 CEOF
@@ -1952,6 +7052,7 @@
 # example, in the case of _POSIX_SOURCE, which is predefined and required
 # on some systems where configure will not decide to define it.
 cat >> conftest.vals <<\EOF
+s%^[ 	]*#[ 	]*undef[ 	][ 	]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */%
 EOF
 
 # Break up conftest.vals because some shells have a limit on
diff --git a/configure.in b/configure.in
new file mode 100644
index 0000000..06171f0
--- /dev/null
+++ b/configure.in
@@ -0,0 +1,634 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT([jcmaster.c])
+AC_CONFIG_HEADER([jconfig.h:jconfig.cfg])
+dnl --------------------------------------------------------------------
+AC_PROG_CC
+AC_PROG_CPP
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for function prototypes])
+AC_CACHE_VAL([ijg_cv_have_prototypes],[AC_TRY_COMPILE([
+int testfunction (int arg1, int * arg2); /* check prototypes */
+struct methods_struct {		/* check method-pointer declarations */
+  int (*error_exit) (char *msgtext);
+  int (*trace_message) (char *msgtext);
+  int (*another_method) (void);
+};
+int testfunction (int arg1, int * arg2) /* check definitions */
+{ return arg2[arg1]; }
+int test2function (void)	/* check void arg list */
+{ return 0; }
+],[ ],[ijg_cv_have_prototypes=yes],[ijg_cv_have_prototypes=no])])
+AC_MSG_RESULT([$ijg_cv_have_prototypes])
+if test $ijg_cv_have_prototypes = yes; then
+  AC_DEFINE([HAVE_PROTOTYPES],)
+else
+  echo [Your compiler does not seem to know about function prototypes.]
+  echo [Perhaps it needs a special switch to enable ANSI C mode.]
+  echo [If so, we recommend running configure like this:]
+  echo ["   ./configure  CC='cc -switch'"]
+  echo [where -switch is the proper switch.]
+fi
+dnl --------------------------------------------------------------------
+AC_CHECK_HEADER([stddef.h],[AC_DEFINE([HAVE_STDDEF_H],)])
+AC_CHECK_HEADER([stdlib.h],[AC_DEFINE([HAVE_STDLIB_H],)])
+AC_CHECK_HEADER([string.h],[:],[AC_DEFINE([NEED_BSD_STRINGS],)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for size_t])
+AC_TRY_COMPILE([
+#ifdef HAVE_STDDEF_H
+#include <stddef.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#include <stdio.h>
+#ifdef NEED_BSD_STRINGS
+#include <strings.h>
+#else
+#include <string.h>
+#endif
+typedef size_t my_size_t;
+],[ my_size_t foovar; ],
+[ijg_size_t_ok=yes],
+[ijg_size_t_ok="not ANSI, perhaps it is in sys/types.h"])
+AC_MSG_RESULT([$ijg_size_t_ok])
+if test "$ijg_size_t_ok" != yes; then
+AC_CHECK_HEADER([sys/types.h],[AC_DEFINE([NEED_SYS_TYPES_H],)
+AC_EGREP_HEADER([size_t],[sys/types.h],
+[ijg_size_t_ok="size_t is in sys/types.h"],[ijg_size_t_ok=no])],
+[ijg_size_t_ok=no])
+AC_MSG_RESULT([$ijg_size_t_ok])
+if test "$ijg_size_t_ok" = no; then
+  echo [Type size_t is not defined in any of the usual places.]
+  echo [Try putting '"typedef unsigned int size_t;"' in jconfig.h.]
+fi
+fi
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type unsigned char])
+AC_TRY_COMPILE(,[ unsigned char un_char; ],[AC_MSG_RESULT(yes)
+AC_DEFINE([HAVE_UNSIGNED_CHAR],)],[AC_MSG_RESULT(no)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type unsigned short])
+AC_TRY_COMPILE(,[ unsigned short un_short; ],[AC_MSG_RESULT(yes)
+AC_DEFINE([HAVE_UNSIGNED_SHORT],)],[AC_MSG_RESULT(no)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for type void])
+AC_TRY_COMPILE([
+/* Caution: a C++ compiler will insist on valid prototypes */
+typedef void * void_ptr;	/* check void * */
+#ifdef HAVE_PROTOTYPES		/* check ptr to function returning void */
+typedef void (*void_func) (int a, int b);
+#else
+typedef void (*void_func) ();
+#endif
+
+#ifdef HAVE_PROTOTYPES		/* check void function result */
+void test3function (void_ptr arg1, void_func arg2)
+#else
+void test3function (arg1, arg2)
+     void_ptr arg1;
+     void_func arg2;
+#endif
+{
+  char * locptr = (char *) arg1; /* check casting to and from void * */
+  arg1 = (void *) locptr;
+  (*arg2) (1, 2);		/* check call of fcn returning void */
+}
+],[ ],[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([void],[char])])
+
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for working const])
+AC_CACHE_VAL([ac_cv_c_const],[AC_TRY_COMPILE(,[
+/* Ultrix mips cc rejects this.  */
+typedef int charset[2]; const charset x;
+/* SunOS 4.1.1 cc rejects this.  */
+char const *const *ccp;
+char **p;
+/* NEC SVR4.0.2 mips cc rejects this.  */
+struct point {int x, y;};
+static struct point const zero = {0,0};
+/* AIX XL C 1.02.0.0 rejects this.
+   It does not let you subtract one const X* pointer from another in an arm
+   of an if-expression whose if-part is not a constant expression */
+const char *g = "string";
+ccp = &g + (g ? g-g : 0);
+/* HPUX 7.0 cc rejects these. */
+++ccp;
+p = (char**) ccp;
+ccp = (char const *const *) p;
+{ /* SCO 3.2v4 cc rejects this.  */
+  char *t;
+  char const *s = 0 ? (char *) 0 : (char const *) 0;
+
+  *t++ = 0;
+}
+{ /* Someone thinks the Sun supposedly-ANSI compiler will reject this.  */
+  int x[] = {25, 17};
+  const int *foo = &x[0];
+  ++foo;
+}
+{ /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */
+  typedef const int *iptr;
+  iptr p = 0;
+  ++p;
+}
+{ /* AIX XL C 1.02.0.0 rejects this saying
+     "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */
+  struct s { int j; const int *ap[3]; };
+  struct s *b; b->j = 5;
+}
+{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
+  const int foo = 10;
+}
+],[ac_cv_c_const=yes],[ac_cv_c_const=no])])
+AC_MSG_RESULT([$ac_cv_c_const])
+if test $ac_cv_c_const = no; then
+  AC_DEFINE([const],)
+fi
+
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for inline])
+ijg_cv_inline=""
+AC_TRY_COMPILE(,[} __inline__ int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="__inline__"],
+[AC_TRY_COMPILE(,[} __inline int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="__inline"],
+[AC_TRY_COMPILE(,[} inline int foo() { return 0; }
+int bar() { return foo();],[ijg_cv_inline="inline"],)])])
+AC_MSG_RESULT([$ijg_cv_inline])
+AC_DEFINE_UNQUOTED([INLINE],[$ijg_cv_inline])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for broken incomplete types])
+AC_TRY_COMPILE([ typedef struct undefined_structure * undef_struct_ptr; ],
+,[AC_MSG_RESULT(ok)],[AC_MSG_RESULT(broken)
+AC_DEFINE([INCOMPLETE_TYPES_BROKEN],)])
+dnl --------------------------------------------------------------------
+AC_MSG_CHECKING([for short external names])
+AC_TRY_LINK([
+int possibly_duplicate_function () { return 0; }
+int possibly_dupli_function () { return 1; }
+],[ ],[AC_MSG_RESULT(ok)],[AC_MSG_RESULT(short)
+AC_DEFINE([NEED_SHORT_EXTERNAL_NAMES],)])
+dnl --------------------------------------------------------------------
+dnl Run-time probe: exit status 0 means plain char is unsigned.
+AC_MSG_CHECKING([to see if char is signed])
+AC_TRY_RUN([#include <stdio.h>
+#ifdef HAVE_PROTOTYPES
+int is_char_signed (int arg)
+#else
+int is_char_signed (arg)
+     int arg;
+#endif
+{
+  if (arg == 189) {		/* expected result for unsigned char */
+    return 0;			/* type char is unsigned */
+  } else if (arg != -67) {	/* expected result for signed char */
+    printf("Hmm, it seems 'char' is not eight bits wide on your machine.\n");
+    printf("I fear the JPEG software will not work at all.\n\n");
+  }
+  return 1;			/* assume char is signed otherwise */
+}
+char signed_char_check = (char) (-67);
+int main () {
+  return is_char_signed((int) signed_char_check);
+}],[AC_MSG_RESULT(no)
+AC_DEFINE([CHAR_IS_UNSIGNED],)],[AC_MSG_RESULT(yes)],
+[echo Assuming that char is signed on target machine.
+echo If it is unsigned, this will be a little bit inefficient.
+])
+dnl --------------------------------------------------------------------
+dnl Run-time probe: exit status 0 means right shift of a negative long is unsigned.
+AC_MSG_CHECKING([to see if right shift is signed])
+AC_TRY_RUN([#include <stdio.h>
+#ifdef HAVE_PROTOTYPES
+int is_shifting_signed (long arg)
+#else
+int is_shifting_signed (arg)
+     long arg;
+#endif
+/* See whether right-shift on a long is signed or not. */
+{
+  long res = arg >> 4;
+  if (res == -0x7F7E80CL) {	/* expected result for signed shift */
+    return 1;			/* right shift is signed */
+  }
+  /* see if unsigned-shift hack will fix it. */
+  /* we can't just test exact value since it depends on width of long... */
+  res |= (~0L) << (32-4);
+  if (res == -0x7F7E80CL) {	/* expected result now? */
+    return 0;			/* right shift is unsigned */
+  }
+  printf("Right shift isn't acting as I expect it to.\n");
+  printf("I fear the JPEG software will not work at all.\n\n");
+  return 0;			/* try it with unsigned anyway */
+}
+int main () {
+  return is_shifting_signed(-0x7F7E80B1L);
+}],[AC_MSG_RESULT(no)
+AC_DEFINE([RIGHT_SHIFT_IS_UNSIGNED],)],[AC_MSG_RESULT(yes)],
+[AC_MSG_RESULT([Assuming that right shift is signed on target machine.])])
+dnl --------------------------------------------------------------------
+dnl Run-time probe: exit status 0 means fopen accepted the "wb" mode string.
+AC_MSG_CHECKING([to see if fopen accepts b spec])
+AC_TRY_RUN([
+#include <stdio.h>
+int main () {
+  /* return rather than exit: nothing is left implicitly declared */
+  return fopen("conftestdata", "wb") == NULL;
+}],[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([DONT_USE_B_MODE],)],[AC_MSG_RESULT([Assuming that it does.])])
+dnl --------------------------------------------------------------------
+AC_PROG_INSTALL
+AC_PROG_RANLIB
+dnl --------------------------------------------------------------------
+
+AC_CANONICAL_HOST
+AC_EXEEXT
+
+# Decide whether to use libtool (needed unless both flavors are disabled),
+# and if so whether to build shared, static, or both flavors of library.
+AC_DISABLE_SHARED
+AC_DISABLE_STATIC
+if test "x$enable_shared" != xno || test "x$enable_static" != xno; then
+  USELIBTOOL="yes"
+# LIBTOOL="./libtool"
+  O="lo"
+  A="la"
+  LN='$(LIBTOOL) --mode=link $(CC)'
+  INSTALL_LIB='$(LIBTOOL) --mode=install ${INSTALL}'
+  INSTALL_PROGRAM="\$(LIBTOOL) --mode=install $INSTALL_PROGRAM"
+  UNINSTALL='$(LIBTOOL) --mode=uninstall $(RM)'
+else
+  USELIBTOOL="no"
+  LIBTOOL=""
+  O="o"
+  A="a"
+  LN='$(CC)'
+  INSTALL_LIB="$INSTALL_DATA"
+  UNINSTALL='$(RM)'
+fi
+AC_SUBST([LIBTOOL])
+AC_SUBST([O])
+AC_SUBST([A])
+AC_SUBST([LN])
+AC_SUBST([INSTALL_LIB])
+AC_SUBST([UNINSTALL])
+
+# Configure libtool if needed.
+if test $USELIBTOOL = yes; then
+  AC_LIBTOOL_DLOPEN
+  AC_LIBTOOL_WIN32_DLL
+  AC_PROG_LIBTOOL
+fi
+# if libtool >= 1.5
+TAGCC=ifdef([AC_LIBTOOL_GCJ],[--tag=CC])
+AC_SUBST([TAGCC])
+
+dnl --------------------------------------------------------------------
+# Select memory manager depending on user input.
+# If no "-enable-maxmem", use jmemnobs
+MEMORYMGR='jmemnobs.$(O)'
+MAXMEM="no"
+AC_ARG_ENABLE([maxmem],
+[  --enable-maxmem[=N]     enable use of temp files, set max mem usage to N MB],
+[MAXMEM="$enableval"])
+# support --with-maxmem for backwards compatibility with IJG V5.
+AC_ARG_WITH([maxmem],,[MAXMEM="$withval"])
+if test "x$MAXMEM" = xyes; then
+  MAXMEM=1
+fi
+if test "x$MAXMEM" != xno; then
+  if test -n "`echo $MAXMEM | sed 's/[[0-9]]//g'`"; then
+    AC_MSG_ERROR([non-numeric argument to --enable-maxmem])
+  fi
+  DEFAULTMAXMEM=`expr $MAXMEM \* 1048576`
+AC_DEFINE_UNQUOTED([DEFAULT_MAX_MEM],[${DEFAULTMAXMEM}])
+AC_MSG_CHECKING([for 'tmpfile()'])
+AC_TRY_LINK([#include <stdio.h>],[ FILE * tfile = tmpfile(); ],
+[AC_MSG_RESULT(yes)
+MEMORYMGR='jmemansi.$(O)'],
+[AC_MSG_RESULT(no)
+MEMORYMGR='jmemname.$(O)'
+AC_DEFINE([NEED_SIGNAL_CATCHER],)
+AC_MSG_CHECKING([for 'mktemp()'])
+AC_TRY_LINK(,[ char fname[80]; mktemp(fname); ],
+[AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)
+AC_DEFINE([NO_MKTEMP],)])])
+fi
+AC_SUBST([MEMORYMGR])
+
+dnl ====================================================================
+
+AC_MSG_CHECKING([to see if the host cpu type is i386 or compatible])
+case "$host_cpu" in
+  i*86 | x86 | ia32)
+    AC_MSG_RESULT(yes)
+  ;;
+  x86_64 | amd64 | aa64)
+    AC_MSG_RESULT([no (x86_64)])
+    AC_MSG_ERROR([Currently, this version of JPEG library cannot be compiled as 64-bit code. sorry.])
+  ;;
+  *)
+    AC_MSG_RESULT([no ("$host_cpu")])
+    AC_MSG_ERROR([This version of JPEG library is for i386 or compatible processors only.])
+  ;;
+esac
+
+if test -z "$NAFLAGS" ; then
+  AC_MSG_CHECKING([for object file format of host system])
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      objfmt='Win32-COFF'
+    ;;
+    msdosdjgpp* | go32*)
+      objfmt='COFF'
+    ;;
+    os2-emx*)			# not tested
+      objfmt='MSOMF'		# obj
+    ;;
+    linux*coff* | linux*oldld*)
+      objfmt='COFF'		# ???
+    ;;
+    linux*aout*)
+      objfmt='a.out'
+    ;;
+    linux*)
+      objfmt='ELF'
+    ;;
+    freebsd* | netbsd* | openbsd*)
+      if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+        objfmt='BSD-a.out'
+      else
+        objfmt='ELF'
+      fi
+    ;;
+    solaris* | sunos* | sysv* | sco*)
+      objfmt='ELF'
+    ;;
+    darwin* | rhapsody* | nextstep* | openstep* | macos*)
+      objfmt='Mach-O'
+    ;;
+    *)
+      objfmt='ELF ?'
+    ;;
+  esac
+  AC_MSG_RESULT([$objfmt])
+  if test "$objfmt" = 'ELF ?'; then
+    objfmt='ELF'
+    AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+  fi
+else
+  objfmt=''
+fi
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+  MSOMF)      NAFLAGS='-fobj -DOBJ32';;
+  Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+  COFF)       NAFLAGS='-fcoff -DCOFF';;
+  a.out)      NAFLAGS='-faout -DAOUT';;
+  BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
+  ELF)        NAFLAGS='-felf -DELF';;
+  RDF)        NAFLAGS='-frdf -DRDF';;
+  Mach-O)     NAFLAGS='-fmacho -DMACHO';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+dnl --------------------------------------------------------------------
+
+AC_CHECK_PROGS(NASM, [nasm nasmw])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found in \$PATH])
+if echo "$NASM" | grep yasm > /dev/null; then
+  AC_MSG_WARN([DON'T USE YASM! CURRENT VERSION (R0.4.0) IS BUGGY!])
+fi
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        bits    32
+        global  _main,main
+_main:
+main:   xor     eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+AC_MSG_CHECKING([whether the assembler supports line continuation character])
+cat > conftest.asm <<\EOF
+[%line __oline__ "configure"
+; The line continuation character '\'
+; was introduced in nasm 0.98.25.
+        section .text
+        bits    32
+        global  _zero
+_zero:  xor     \
+                eax,eax
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([you have to use a more recent version of the assembler.])
+fi
+
+dnl --------------------------------------------------------------------
+
+AC_MSG_CHECKING([SIMD instruction sets requested to use])
+simd_to_use=""
+
+AC_ARG_ENABLE(mmx,
+[  --disable-mmx           do not use MMX instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_MMX_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use MMX"
+fi], [simd_to_use="$simd_to_use MMX"])
+
+AC_ARG_ENABLE(3dnow,
+[  --disable-3dnow         do not use 3DNow! instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_3DNOW_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use 3DNow!"
+fi], [simd_to_use="$simd_to_use 3DNow!"])
+
+AC_ARG_ENABLE(sse,
+[  --disable-sse           do not use SSE instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_SSE_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use SSE"
+fi], [simd_to_use="$simd_to_use SSE"])
+
+AC_ARG_ENABLE(sse2,
+[  --disable-sse2          do not use SSE2 instruction set],
+[if test "x$enableval" = xno; then
+  AC_DEFINE([JSIMD_SSE2_NOT_SUPPORTED],)
+else
+  simd_to_use="$simd_to_use SSE2"
+fi], [simd_to_use="$simd_to_use SSE2"])
+
+test -z "$simd_to_use" && simd_to_use="NONE"
+AC_MSG_RESULT([$simd_to_use])
+
+for simd_name in $simd_to_use; do
+case "$simd_name" in
+  MMX)    simd_instruction='psubw mm0,mm0';;
+  3DNow!) simd_instruction='pfsub mm0,mm0';;
+  SSE)    simd_instruction='subps xmm0,xmm0';;
+  SSE2)   simd_instruction='subpd xmm0,xmm0';;
+  *)      continue;;
+esac
+AC_MSG_CHECKING([whether the assembler supports $simd_name instructions])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+        section .text
+        bits    32
+        global  _simd
+_simd:  $simd_instruction
+        ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+  rm -rf conftest*
+  AC_MSG_RESULT(yes)
+else
+  echo "configure: failed program was:" >&AC_FD_CC
+  cat conftest.asm >&AC_FD_CC
+  rm -rf conftest*
+  AC_MSG_RESULT(no)
+  AC_MSG_ERROR([you have to use a more recent version of the assembler.])
+fi
+done
+
+dnl --------------------------------------------------------------------
+# Select OS-dependent SIMD instruction support checker.
+# jsimdw32.$(O) (Win32) / jsimddjg.$(O) (DJGPP V.2) / jsimdgcc.$(O) (Unix/gcc)
+if test "x$SIMDCHECKER" = x ; then
+  case "$host_os" in
+    cygwin* | mingw* | pw32* | interix*)
+      SIMDCHECKER='jsimdw32.$(O)'
+    ;;
+    msdosdjgpp* | go32*)
+      SIMDCHECKER='jsimddjg.$(O)'
+    ;;
+    os2-emx*)			# not tested
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+    *)
+      SIMDCHECKER='jsimdgcc.$(O)'
+    ;;
+  esac
+fi
+AC_SUBST([SIMDCHECKER])
+
+case "$host_os" in
+  cygwin* | mingw* | pw32* | os2-emx* | msdosdjgpp* | go32*)
+    AC_DEFINE([USE_SETMODE],)
+  ;;
+# _host_name_*)
+#   AC_DEFINE([USE_FDOPEN],)
+# ;;
+esac
+
+# This is for UNIX-like environments on Windows platform.
+AC_ARG_ENABLE(uchar-boolean,
+[  --enable-uchar-boolean  define type \"boolean\" as unsigned char (for Windows)],
+[if test "x$enableval" != xno; then
+  AC_DEFINE([TYPEDEF_UCHAR_BOOLEAN],)
+fi])
+
+dnl --------------------------------------------------------------------
+
+JPEG_LIB_VERSION="63:0:1"
+confv_dirs="$srcdir $srcdir/.. $srcdir/../.."
+config_ver=
+for ac_dir in $confv_dirs; do
+  if test -r $ac_dir/config.ver; then
+    config_ver=$ac_dir/config.ver
+    break
+  fi
+done
+if test -z "$config_ver"; then
+  AC_MSG_WARN([cannot find config.ver in $confv_dirs])
+  AC_MSG_WARN([default version number $JPEG_LIB_VERSION is used])
+  AC_MSG_CHECKING([libjpeg version number for libtool])
+  AC_MSG_RESULT([$JPEG_LIB_VERSION])
+else
+  AC_MSG_CHECKING([libjpeg version number for libtool])
+  . $config_ver
+  AC_MSG_RESULT([$JPEG_LIB_VERSION])
+  echo "configure: if you want to change the version number, modify $config_ver" 1>&2
+fi
+AC_SUBST([JPEG_LIB_VERSION])
+
+dnl --------------------------------------------------------------------
+# Prepare to massage makefile.cfg correctly.
+if test $ijg_cv_have_prototypes = yes; then
+  A2K_DEPS=""
+  COM_A2K="# "
+else
+  A2K_DEPS="ansi2knr"
+  COM_A2K=""
+fi
+AC_SUBST([A2K_DEPS])
+AC_SUBST([COM_A2K])
+# ansi2knr needs -DBSD if string.h is missing
+if test $ac_cv_header_string_h = no; then
+  ANSI2KNRFLAGS="-DBSD"
+else
+  ANSI2KNRFLAGS=""
+fi
+AC_SUBST([ANSI2KNRFLAGS])
+# Substitutions to enable or disable libtool-related stuff
+if test "$USELIBTOOL" = yes && test "$ijg_cv_have_prototypes" = yes; then
+  COM_LT=""
+else
+  COM_LT="# "
+fi
+AC_SUBST([COM_LT])
+if test "x$enable_shared" != xno; then
+  FORCE_INSTALL_LIB="install-lib"
+  UNINSTALL_LIB="uninstall-lib"
+else
+  FORCE_INSTALL_LIB=""
+  UNINSTALL_LIB=""
+fi
+AC_SUBST([FORCE_INSTALL_LIB])
+AC_SUBST([UNINSTALL_LIB])
+# Set up -I directives
+if test "x$srcdir" = x.; then
+  INCLUDEFLAGS='-I$(srcdir)'
+else
+  INCLUDEFLAGS='-I. -I$(srcdir)'
+fi
+AC_SUBST([INCLUDEFLAGS])
+dnl --------------------------------------------------------------------
+AC_OUTPUT([Makefile:makefile.cfg])
diff --git a/djpeg.c b/djpeg.c
index e099e90..a1ec059 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : August 23, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains a command-line user interface for the JPEG decompressor.
  * It should work on any system with Unix- or MS-DOS-style command lines.
  *
@@ -158,6 +165,22 @@
 }
 
 
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+/* Print labelstr, then the name of each SIMD set flagged in simd (or NONE). */
+LOCAL(void)
+print_simd_info (FILE * file, const char * labelstr, unsigned int simd)
+{
+  fprintf(file, "%s%s%s%s%s%s\n", labelstr,
+	  simd & JSIMD_MMX   ? " MMX"    : "",
+	  simd & JSIMD_3DNOW ? " 3DNow!" : "",
+	  simd & JSIMD_SSE   ? " SSE"    : "",
+	  simd & JSIMD_SSE2  ? " SSE2"   : "",
+	  simd == JSIMD_NONE ? " NONE"   : "");
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
+
 LOCAL(int)
 parse_switches (j_decompress_ptr cinfo, int argc, char **argv,
 		int last_file_arg_seen, boolean for_real)
@@ -208,6 +231,19 @@
       cinfo->desired_number_of_colors = val;
       cinfo->quantize_colors = TRUE;
 
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+    } else if (keymatch(arg, "nosimd" , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_ALL);
+    } else if (keymatch(arg, "nommx"  , 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_MMX);
+    } else if (keymatch(arg, "no3dnow", 3)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_3DNOW);
+    } else if (keymatch(arg, "nosse"  , 4)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE);
+    } else if (keymatch(arg, "nosse2" , 6)) {
+      jpeg_simd_mask((j_common_ptr) cinfo, JSIMD_NONE, JSIMD_SSE2);
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
+
     } else if (keymatch(arg, "dct", 2)) {
       /* Select IDCT algorithm. */
       if (++argn >= argc)	/* advance to next argument */
@@ -242,6 +278,38 @@
       if (! printed_version) {
 	fprintf(stderr, "Independent JPEG Group's DJPEG, version %s\n%s\n",
 		JVERSION, JCOPYRIGHT);
+	fprintf(stderr,
+		"\nx86 SIMD extension for IJG JPEG library, version %s\n\n",
+		JPEG_SIMDEXT_VER_STR);
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+	print_simd_info(stderr, "SIMD instructions supported by the system :",
+			jpeg_simd_support(NULL));
+
+	fprintf(stderr, "\n      === SIMD Operation Modes ===\n");
+#ifdef DCT_ISLOW_SUPPORTED
+	print_simd_info(stderr, "Accurate integer DCT  (-dct int)   :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_ISLOW));
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+	print_simd_info(stderr, "Fast integer DCT      (-dct fast)  :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_IFAST));
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+	print_simd_info(stderr, "Floating-point DCT    (-dct float) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT));
+#endif
+#ifdef IDCT_SCALING_SUPPORTED
+	print_simd_info(stderr, "Reduced-size DCT      (-scale M/N) :",
+			jpeg_simd_inverse_dct(cinfo, JDCT_FLOAT+1));
+#endif
+	print_simd_info(stderr, "High-quality upsampling (default)  :",
+			jpeg_simd_upsampler(cinfo, TRUE));
+	print_simd_info(stderr, "Low-quality upsampling (-nosmooth) :",
+			jpeg_simd_upsampler(cinfo, FALSE));
+	print_simd_info(stderr, "Colorspace conversion (YCbCr->RGB) :",
+			jpeg_simd_color_deconverter(cinfo));
+	fprintf(stderr, "\n");
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 	printed_version = TRUE;
       }
       cinfo->err->trace_level++;
diff --git a/install-sh b/install-sh
index e843669..4d4a951 100755
--- a/install-sh
+++ b/install-sh
@@ -1,19 +1,38 @@
 #!/bin/sh
-#
 # install - install a program, script, or datafile
-# This comes from X11R5 (mit/util/scripts/install.sh).
+
+scriptversion=2005-05-14.22
+
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
 #
-# Copyright 1991 by the Massachusetts Institute of Technology
+# Copyright (C) 1994 X Consortium
 #
-# Permission to use, copy, modify, distribute, and sell this software and its
-# documentation for any purpose is hereby granted without fee, provided that
-# the above copyright notice appear in all copies and that both that
-# copyright notice and this permission notice appear in supporting
-# documentation, and that the name of M.I.T. not be used in advertising or
-# publicity pertaining to distribution of the software without specific,
-# written prior permission.  M.I.T. makes no representations about the
-# suitability of this software for any purpose.  It is provided "as is"
-# without express or implied warranty.
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
 #
 # Calling this script install-sh is preferred over install.sh, to prevent
 # `make' implicit rules from creating a file called install from it
@@ -23,13 +42,11 @@
 # from scratch.  It can only install one file at a time, a restriction
 # shared with many OS's install programs.
 
-
 # set DOITPROG to echo to test this script
 
 # Don't use :- since 4.3BSD and earlier shells don't like it.
 doit="${DOITPROG-}"
 
-
 # put in absolute paths if you don't have them in your path; or use env. vars.
 
 mvprog="${MVPROG-mv}"
@@ -41,210 +58,266 @@
 rmprog="${RMPROG-rm}"
 mkdirprog="${MKDIRPROG-mkdir}"
 
-transformbasename=""
-transform_arg=""
-instcmd="$mvprog"
 chmodcmd="$chmodprog 0755"
-chowncmd=""
-chgrpcmd=""
-stripcmd=""
+chowncmd=
+chgrpcmd=
+stripcmd=
 rmcmd="$rmprog -f"
 mvcmd="$mvprog"
-src=""
-dst=""
-dir_arg=""
+src=
+dst=
+dir_arg=
+dstarg=
+no_target_directory=
 
-while [ x"$1" != x ]; do
-    case $1 in
-	-c) instcmd="$cpprog"
-	    shift
-	    continue;;
+usage="Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+   or: $0 [OPTION]... SRCFILES... DIRECTORY
+   or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+   or: $0 [OPTION]... -d DIRECTORIES...
 
-	-d) dir_arg=true
-	    shift
-	    continue;;
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.
 
-	-m) chmodcmd="$chmodprog $2"
-	    shift
-	    shift
-	    continue;;
+Options:
+-c         (ignored)
+-d         create directories instead of installing files.
+-g GROUP   $chgrpprog installed files to GROUP.
+-m MODE    $chmodprog installed files to MODE.
+-o USER    $chownprog installed files to USER.
+-s         $stripprog installed files.
+-t DIRECTORY  install into DIRECTORY.
+-T         report an error if DSTFILE is a directory.
+--help     display this help and exit.
+--version  display version info and exit.
 
-	-o) chowncmd="$chownprog $2"
-	    shift
-	    shift
-	    continue;;
+Environment variables override the default commands:
+  CHGRPPROG CHMODPROG CHOWNPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG
+"
 
-	-g) chgrpcmd="$chgrpprog $2"
-	    shift
-	    shift
-	    continue;;
+while test -n "$1"; do
+  case $1 in
+    -c) shift
+        continue;;
 
-	-s) stripcmd="$stripprog"
-	    shift
-	    continue;;
+    -d) dir_arg=true
+        shift
+        continue;;
 
-	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
-	    shift
-	    continue;;
+    -g) chgrpcmd="$chgrpprog $2"
+        shift
+        shift
+        continue;;
 
-	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
-	    shift
-	    continue;;
+    --help) echo "$usage"; exit $?;;
 
-	*)  if [ x"$src" = x ]
-	    then
-		src=$1
-	    else
-		# this colon is to work around a 386BSD /bin/sh bug
-		:
-		dst=$1
-	    fi
-	    shift
-	    continue;;
-    esac
-done
+    -m) chmodcmd="$chmodprog $2"
+        shift
+        shift
+        continue;;
 
-if [ x"$src" = x ]
-then
-	echo "install:	no input file specified"
-	exit 1
-else
-	true
-fi
+    -o) chowncmd="$chownprog $2"
+        shift
+        shift
+        continue;;
 
-if [ x"$dir_arg" != x ]; then
-	dst=$src
-	src=""
-	
-	if [ -d $dst ]; then
-		instcmd=:
-	else
-		instcmd=mkdir
-	fi
-else
+    -s) stripcmd=$stripprog
+        shift
+        continue;;
 
-# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
-# might cause directories to be created, which would be especially bad 
-# if $src (and thus $dsttmp) contains '*'.
-
-	if [ -f $src -o -d $src ]
-	then
-		true
-	else
-		echo "install:  $src does not exist"
-		exit 1
-	fi
-	
-	if [ x"$dst" = x ]
-	then
-		echo "install:	no destination specified"
-		exit 1
-	else
-		true
-	fi
-
-# If destination is a directory, append the input filename; if your system
-# does not like double slashes in filenames, you may need to add some logic
-
-	if [ -d $dst ]
-	then
-		dst="$dst"/`basename $src`
-	else
-		true
-	fi
-fi
-
-## this sed command emulates the dirname command
-dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
-
-# Make sure that the destination directory exists.
-#  this part is taken from Noah Friedman's mkinstalldirs script
-
-# Skip lots of stat calls in the usual case.
-if [ ! -d "$dstdir" ]; then
-defaultIFS='	
-'
-IFS="${IFS-${defaultIFS}}"
-
-oIFS="${IFS}"
-# Some sh's can't handle IFS=/ for some reason.
-IFS='%'
-set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
-IFS="${oIFS}"
-
-pathcomp=''
-
-while [ $# -ne 0 ] ; do
-	pathcomp="${pathcomp}${1}"
+    -t) dstarg=$2
 	shift
+	shift
+	continue;;
 
-	if [ ! -d "${pathcomp}" ] ;
-        then
-		$mkdirprog "${pathcomp}"
-	else
-		true
-	fi
+    -T) no_target_directory=true
+	shift
+	continue;;
 
-	pathcomp="${pathcomp}/"
+    --version) echo "$0 $scriptversion"; exit $?;;
+
+    *)  # When -d is used, all remaining arguments are directories to create.
+	# When -t is used, the destination is already specified.
+	test -n "$dir_arg$dstarg" && break
+        # Otherwise, the last argument is the destination.  Remove it from $@.
+	for arg
+	do
+          if test -n "$dstarg"; then
+	    # $@ is not empty: it contains at least $arg.
+	    set fnord "$@" "$dstarg"
+	    shift # fnord
+	  fi
+	  shift # arg
+	  dstarg=$arg
+	done
+	break;;
+  esac
 done
+
+if test -z "$1"; then
+  if test -z "$dir_arg"; then
+    echo "$0: no input file specified." >&2
+    exit 1
+  fi
+  # It's OK to call `install-sh -d' without argument.
+  # This can happen when creating conditional directories.
+  exit 0
 fi
 
-if [ x"$dir_arg" != x ]
-then
-	$doit $instcmd $dst &&
+for src
+do
+  # Protect names starting with `-'.
+  case $src in
+    -*) src=./$src ;;
+  esac
 
-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
-else
+  if test -n "$dir_arg"; then
+    dst=$src
+    src=
 
-# If we're going to rename the final executable, determine the name now.
+    if test -d "$dst"; then
+      mkdircmd=:
+      chmodcmd=
+    else
+      mkdircmd=$mkdirprog
+    fi
+  else
+    # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+    # might cause directories to be created, which would be especially bad
+    # if $src (and thus $dsttmp) contains '*'.
+    if test ! -f "$src" && test ! -d "$src"; then
+      echo "$0: $src does not exist." >&2
+      exit 1
+    fi
 
-	if [ x"$transformarg" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		dstfile=`basename $dst $transformbasename | 
-			sed $transformarg`$transformbasename
-	fi
+    if test -z "$dstarg"; then
+      echo "$0: no destination specified." >&2
+      exit 1
+    fi
 
-# don't allow the sed command to completely eliminate the filename
+    dst=$dstarg
+    # Protect names starting with `-'.
+    case $dst in
+      -*) dst=./$dst ;;
+    esac
 
-	if [ x"$dstfile" = x ] 
-	then
-		dstfile=`basename $dst`
-	else
-		true
-	fi
+    # If destination is a directory, append the input filename; won't work
+    # if double slashes aren't ignored.
+    if test -d "$dst"; then
+      if test -n "$no_target_directory"; then
+	echo "$0: $dstarg: Is a directory" >&2
+	exit 1
+      fi
+      dst=$dst/`basename "$src"`
+    fi
+  fi
 
-# Make a temp file name in the proper directory.
+  # This sed command emulates the dirname command.
+  dstdir=`echo "$dst" | sed -e 's,/*$,,;s,[^/]*$,,;s,/*$,,;s,^$,.,'`
 
-	dsttmp=$dstdir/#inst.$$#
+  # Make sure that the destination directory exists.
 
-# Move or copy the file name to the temp name
+  # Skip lots of stat calls in the usual case.
+  if test ! -d "$dstdir"; then
+    defaultIFS='
+	 '
+    IFS="${IFS-$defaultIFS}"
 
-	$doit $instcmd $src $dsttmp &&
+    oIFS=$IFS
+    # Some sh's can't handle IFS=/ for some reason.
+    IFS='%'
+    set x `echo "$dstdir" | sed -e 's@/@%@g' -e 's@^%@/@'`
+    shift
+    IFS=$oIFS
 
-	trap "rm -f ${dsttmp}" 0 &&
+    pathcomp=
 
-# and set any options; do chmod last to preserve setuid bits
+    while test $# -ne 0 ; do
+      pathcomp=$pathcomp$1
+      shift
+      if test ! -d "$pathcomp"; then
+        $mkdirprog "$pathcomp"
+	# mkdir can fail with a `File exists' error in case several
+	# install-sh are creating the directory concurrently.  This
+	# is OK.
+	test -d "$pathcomp" || exit
+      fi
+      pathcomp=$pathcomp/
+    done
+  fi
 
-# If any of these fail, we abort the whole thing.  If we want to
-# ignore errors from any of these, just make sure not to ignore
-# errors from the above "$doit $instcmd $src $dsttmp" command.
+  if test -n "$dir_arg"; then
+    $doit $mkdircmd "$dst" \
+      && { test -z "$chowncmd" || $doit $chowncmd "$dst"; } \
+      && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } \
+      && { test -z "$stripcmd" || $doit $stripcmd "$dst"; } \
+      && { test -z "$chmodcmd" || $doit $chmodcmd "$dst"; }
 
-	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
-	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
-	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
-	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+  else
+    dstfile=`basename "$dst"`
 
-# Now rename the file to the real destination.
+    # Make a couple of temp file names in the proper directory.
+    dsttmp=$dstdir/_inst.$$_
+    rmtmp=$dstdir/_rm.$$_
 
-	$doit $rmcmd -f $dstdir/$dstfile &&
-	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+    # Trap to clean up those temp files at exit.
+    trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+    trap '(exit $?); exit' 1 2 13 15
 
-fi &&
+    # Copy the file name to the temp name.
+    $doit $cpprog "$src" "$dsttmp" &&
 
+    # and set any options; do chmod last to preserve setuid bits.
+    #
+    # If any of these fail, we abort the whole thing.  If we want to
+    # ignore errors from any of these, just make sure not to ignore
+    # errors from the above "$doit $cpprog $src $dsttmp" command.
+    #
+    { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } \
+      && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } \
+      && { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } \
+      && { test -z "$chmodcmd" || $doit $chmodcmd "$dsttmp"; } &&
 
-exit 0
+    # Now rename the file to the real destination.
+    { $doit $mvcmd -f "$dsttmp" "$dstdir/$dstfile" 2>/dev/null \
+      || {
+	   # The rename failed, perhaps because mv can't rename something else
+	   # to itself, or perhaps because mv is so ancient that it does not
+	   # support -f.
+
+	   # Now remove or move aside any old file at destination location.
+	   # We try this two ways since rm can't unlink itself on some
+	   # systems and the destination file might be busy for other
+	   # reasons.  In this case, the final cleanup might fail but the new
+	   # file should still install successfully.
+	   {
+	     if test -f "$dstdir/$dstfile"; then
+	       $doit $rmcmd -f "$dstdir/$dstfile" 2>/dev/null \
+	       || $doit $mvcmd -f "$dstdir/$dstfile" "$rmtmp" 2>/dev/null \
+	       || {
+		 echo "$0: cannot unlink or rename $dstdir/$dstfile" >&2
+		 (exit 1); exit 1
+	       }
+	     else
+	       :
+	     fi
+	   } &&
+
+	   # Now rename the file to the real destination.
+	   $doit $mvcmd "$dsttmp" "$dstdir/$dstfile"
+	 }
+    }
+  fi || { (exit 1); exit 1; }
+done
+
+# The final little trick to "correctly" pass the exit status to the exit trap.
+{
+  (exit 0); exit 0
+}
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:
diff --git a/jccolmmx.asm b/jccolmmx.asm
new file mode 100644
index 0000000..2e2fca6
--- /dev/null
+++ b/jccolmmx.asm
@@ -0,0 +1,513 @@
+;
+; jccolmmx.asm - colorspace conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits: FIX(x) = x * 2^SCALEBITS, rounded
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700); too large for a signed word (> 32767)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000); G weight is split so each part fits pmaddwd's signed words
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337	times 2 dw  F_0_299, F_0_337	; (R,G) word-pair weights for Y
+PW_F0114_F0250	times 2 dw  F_0_114, F_0_250	; (B,G) word-pair weights for Y
+PW_MF016_MF033	times 2 dw -F_0_168,-F_0_331	; (R,G) word-pair weights for Cb
+PW_MF008_MF041	times 2 dw -F_0_081,-F_0_418	; (B,G) word-pair weights for Cr
+PD_ONEHALFM1_CJ	times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)	; Cb/Cr: rounding term (one half minus 1) + CENTERJSAMPLE offset
+PD_ONEHALF	times 2 dd  (1 << (SCALEBITS-1))	; Y: rounding term (one half)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert num_rows rows of packed RGB samples (image_width pixels each) to planar Y/Cb/Cr.
+;
+; GLOBAL(void)
+; jpeg_rgb_ycc_convert_mmx (j_compress_ptr cinfo,
+;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                           JDIMENSION output_row, int num_rows);
+; NOTE(review): every step stores 8 samples per plane, even for a partial last group -- confirm rows are padded.
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_rgb_ycc_convert_mmx)
+
+EXTN(jpeg_rgb_ycc_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (esp after the push); base for arg access
+	sub	esp, byte 4			; room for the saved original ebp
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; save original ebp at [aligned ebp]
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
+	pushpic	eax		; make room for the GOT address (gotptr slot)
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jcstruct_image_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return		; zero-width image: nothing to do
+
+	push	ecx			; save num_cols while ecx holds output_row
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]	; edi = &output_buf[0][output_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &output_buf[1][output_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &output_buf[2][output_row]
+
+	pop	ecx			; ecx = num_cols
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return		; num_rows <= 0: nothing to do
+	alignx	16,7
+.rowloop:
+	pushpic	eax			; save row counter (eax is reloaded with the GOT address below)
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	short .columnloop	; a full group of columns is available
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:				; partial group: load the remaining bytes piecewise, by the bits of the byte count
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	xor	eax,eax
+	mov	al, BYTE [esi+ecx]	; last odd byte
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	xor	edx,edx
+	mov	dx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx			; prepend the word below the byte already loaded
+.column_ld4:
+	movd	mmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	mmG, DWORD [esi+ecx]
+	psllq	mmA, DWORD_BIT
+	por	mmA,mmG
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	movq	mmG,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	mov	ecx, SIZEOF_MMWORD	; treat the partial group as a full one from here on
+	jmp	short .rgb_ycc_cnv
+.column_ld16:
+	test	cl, 2*SIZEOF_MMWORD
+	mov	ecx, SIZEOF_MMWORD	; mov leaves the flags from the test intact
+	jz	short .rgb_ycc_cnv
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 01 11 21 02 12)
+	; mmG=(22 03 13 23 04 14 24 05)
+	; mmF=(15 25 06 16 26 07 17 27)
+
+	movq      mmD,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 10 20 01)
+	psrlq     mmD,4*BYTE_BIT	; mmD=(11 21 02 12 -- -- -- --)
+
+	punpckhbw mmA,mmG		; mmA=(00 04 10 14 20 24 01 05)
+	psllq     mmG,4*BYTE_BIT	; mmG=(-- -- -- -- 22 03 13 23)
+
+	punpcklbw mmD,mmF		; mmD=(11 15 21 25 02 06 12 16)
+	punpckhbw mmG,mmF		; mmG=(22 26 03 07 13 17 23 27)
+
+	movq      mmE,mmA
+	psllq     mmA,4*BYTE_BIT	; mmA=(-- -- -- -- 00 04 10 14)
+	psrlq     mmE,4*BYTE_BIT	; mmE=(20 24 01 05 -- -- -- --)
+
+	punpckhbw mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	psllq     mmD,4*BYTE_BIT	; mmD=(-- -- -- -- 11 15 21 25)
+
+	punpcklbw mmE,mmG		; mmE=(20 22 24 26 01 03 05 07)
+	punpckhbw mmD,mmG		; mmD=(11 13 15 17 21 23 25 27)
+
+	pxor      mmH,mmH
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmH		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmH		; mmC=(10 12 14 16)
+
+	movq      mmB,mmE
+	punpcklbw mmE,mmH		; mmE=(20 22 24 26)
+	punpckhbw mmB,mmH		; mmB=(01 03 05 07)
+
+	movq      mmF,mmD
+	punpcklbw mmD,mmH		; mmD=(11 13 15 17)
+	punpckhbw mmF,mmH		; mmF=(21 23 25 27)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:				; partial group: load the remaining pixels piecewise, by the bits of the pixel count
+	test	cl, SIZEOF_MMWORD/8
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_MMWORD/8
+	movd	mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_MMWORD/4
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_MMWORD/4
+	movq	mmF,mmA
+	movq	mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+	test	cl, SIZEOF_MMWORD/2
+	mov	ecx, SIZEOF_MMWORD	; mov leaves the flags from the test intact
+	jz	short .rgb_ycc_cnv
+	movq	mmD,mmA
+	movq	mmC,mmF
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movq	mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+	movq	mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+	; mmA=(00 10 20 30 01 11 21 31)
+	; mmF=(02 12 22 32 03 13 23 33)
+	; mmD=(04 14 24 34 05 15 25 35)
+	; mmC=(06 16 26 36 07 17 27 37)
+
+	movq      mmB,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 10 12 20 22 30 32)
+	punpckhbw mmB,mmF		; mmB=(01 03 11 13 21 23 31 33)
+
+	movq      mmG,mmD
+	punpcklbw mmD,mmC		; mmD=(04 06 14 16 24 26 34 36)
+	punpckhbw mmG,mmC		; mmG=(05 07 15 17 25 27 35 37)
+
+	movq      mmE,mmA
+	punpcklwd mmA,mmD		; mmA=(00 02 04 06 10 12 14 16)
+	punpckhwd mmE,mmD		; mmE=(20 22 24 26 30 32 34 36)
+
+	movq      mmH,mmB
+	punpcklwd mmB,mmG		; mmB=(01 03 05 07 11 13 15 17)
+	punpckhwd mmH,mmG		; mmH=(21 23 25 27 31 33 35 37)
+
+	pxor      mmF,mmF
+
+	movq      mmC,mmA
+	punpcklbw mmA,mmF		; mmA=(00 02 04 06)
+	punpckhbw mmC,mmF		; mmC=(10 12 14 16)
+
+	movq      mmD,mmB
+	punpcklbw mmB,mmF		; mmB=(01 03 05 07)
+	punpckhbw mmD,mmF		; mmD=(11 13 15 17)
+
+	movq      mmG,mmE
+	punpcklbw mmE,mmF		; mmE=(20 22 24 26)
+	punpckhbw mmG,mmF		; mmG=(30 32 34 36)
+
+	punpcklbw mmF,mmH
+	punpckhbw mmH,mmH
+	psrlw     mmF,BYTE_BIT		; mmF=(21 23 25 27)
+	psrlw     mmH,BYTE_BIT		; mmH=(31 33 35 37)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+	; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movq      MMWORD [wk(0)], mm0	; wk(0)=RE
+	movq      MMWORD [wk(1)], mm1	; wk(1)=RO
+	movq      MMWORD [wk(2)], mm4	; wk(2)=BE
+	movq      MMWORD [wk(3)], mm5	; wk(3)=BO
+
+	movq      mm6,mm1
+	punpcklwd mm1,mm3
+	punpckhwd mm6,mm3
+	movq      mm7,mm1
+	movq      mm4,mm6
+	pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movq      MMWORD [wk(4)], mm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movq      MMWORD [wk(5)], mm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      mm1,mm1
+	pxor      mm6,mm6
+	punpcklwd mm1,mm5		; mm1=BOL
+	punpckhwd mm6,mm5		; mm6=BOH
+	psrld     mm1,1			; mm1=BOL*FIX(0.500)
+	psrld     mm6,1			; mm6=BOH*FIX(0.500)
+
+	movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm1
+	paddd     mm4,mm6
+	paddd     mm7,mm5
+	paddd     mm4,mm5
+	psrld     mm7,SCALEBITS		; mm7=CbOL
+	psrld     mm4,SCALEBITS		; mm4=CbOH
+	packssdw  mm7,mm4		; mm7=CbO
+
+	movq      mm1, MMWORD [wk(2)]	; mm1=BE
+
+	movq      mm6,mm0
+	punpcklwd mm0,mm2
+	punpckhwd mm6,mm2
+	movq      mm5,mm0
+	movq      mm4,mm6
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movq      MMWORD [wk(6)], mm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movq      MMWORD [wk(7)], mm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      mm0,mm0
+	pxor      mm6,mm6
+	punpcklwd mm0,mm1		; mm0=BEL
+	punpckhwd mm6,mm1		; mm6=BEH
+	psrld     mm0,1			; mm0=BEL*FIX(0.500)
+	psrld     mm6,1			; mm6=BEH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm5,mm0
+	paddd     mm4,mm6
+	paddd     mm5,mm1
+	paddd     mm4,mm1
+	psrld     mm5,SCALEBITS		; mm5=CbEL
+	psrld     mm4,SCALEBITS		; mm4=CbEH
+	packssdw  mm5,mm4		; mm5=CbE
+
+	psllw     mm7,BYTE_BIT
+	por       mm5,mm7		; mm5=Cb
+	movq      MMWORD [ebx], mm5	; Save Cb
+
+	movq      mm0, MMWORD [wk(3)]	; mm0=BO
+	movq      mm6, MMWORD [wk(2)]	; mm6=BE
+	movq      mm1, MMWORD [wk(1)]	; mm1=RO
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm3
+	punpckhwd mm4,mm3
+	movq      mm7,mm0
+	movq      mm5,mm4
+	pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movq      mm3,[GOTOFF(eax,PD_ONEHALF)]	; mm3=[PD_ONEHALF]
+
+	paddd     mm0, MMWORD [wk(4)]
+	paddd     mm4, MMWORD [wk(5)]
+	paddd     mm0,mm3
+	paddd     mm4,mm3
+	psrld     mm0,SCALEBITS		; mm0=YOL
+	psrld     mm4,SCALEBITS		; mm4=YOH
+	packssdw  mm0,mm4		; mm0=YO
+
+	pxor      mm3,mm3
+	pxor      mm4,mm4
+	punpcklwd mm3,mm1		; mm3=ROL
+	punpckhwd mm4,mm1		; mm4=ROH
+	psrld     mm3,1			; mm3=ROL*FIX(0.500)
+	psrld     mm4,1			; mm4=ROH*FIX(0.500)
+
+	movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
+
+	paddd     mm7,mm3
+	paddd     mm5,mm4
+	paddd     mm7,mm1
+	paddd     mm5,mm1
+	psrld     mm7,SCALEBITS		; mm7=CrOL
+	psrld     mm5,SCALEBITS		; mm5=CrOH
+	packssdw  mm7,mm5		; mm7=CrO
+
+	movq      mm3, MMWORD [wk(0)]	; mm3=RE
+
+	movq      mm4,mm6
+	punpcklwd mm6,mm2
+	punpckhwd mm4,mm2
+	movq      mm1,mm6
+	movq      mm5,mm4
+	pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movq      mm2,[GOTOFF(eax,PD_ONEHALF)]	; mm2=[PD_ONEHALF]
+
+	paddd     mm6, MMWORD [wk(6)]
+	paddd     mm4, MMWORD [wk(7)]
+	paddd     mm6,mm2
+	paddd     mm4,mm2
+	psrld     mm6,SCALEBITS		; mm6=YEL
+	psrld     mm4,SCALEBITS		; mm4=YEH
+	packssdw  mm6,mm4		; mm6=YE
+
+	psllw     mm0,BYTE_BIT
+	por       mm6,mm0		; mm6=Y
+	movq      MMWORD [edi], mm6	; Save Y
+
+	pxor      mm2,mm2
+	pxor      mm4,mm4
+	punpcklwd mm2,mm3		; mm2=REL
+	punpckhwd mm4,mm3		; mm4=REH
+	psrld     mm2,1			; mm2=REL*FIX(0.500)
+	psrld     mm4,1			; mm4=REH*FIX(0.500)
+
+	movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
+
+	paddd     mm1,mm2
+	paddd     mm5,mm4
+	paddd     mm1,mm0
+	paddd     mm5,mm0
+	psrld     mm1,SCALEBITS		; mm1=CrEL
+	psrld     mm5,SCALEBITS		; mm5=CrEH
+	packssdw  mm1,mm5		; mm1=CrE
+
+	psllw     mm7,BYTE_BIT
+	por       mm1,mm7		; mm1=Cr
+	movq      MMWORD [edx], mm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_MMWORD			; one group of columns done
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; inptr
+	add	edi, byte SIZEOF_MMWORD			; outptr0
+	add	ebx, byte SIZEOF_MMWORD			; outptr1
+	add	edx, byte SIZEOF_MMWORD			; outptr2
+	cmp	ecx, byte SIZEOF_MMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1	; some columns left, but fewer than a full group
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JCCOLOR_RGBYCC_MMX_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jccolor.c b/jccolor.c
index 0a8a4b5..85f3083 100644
--- a/jccolor.c
+++ b/jccolor.c
@@ -5,12 +5,20 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file contains input colorspace conversion routines.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */
 
 
 /* Private subobject */
@@ -352,6 +360,7 @@
 jinit_color_converter (j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -420,8 +429,23 @@
     if (cinfo->num_components != 3)
       ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
     if (cinfo->in_color_space == JCS_RGB) {
-      cconvert->pub.start_pass = rgb_ycc_start;
-      cconvert->pub.color_convert = rgb_ycc_convert;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_rgb_ycc_convert_sse2)) {
+        cconvert->pub.color_convert = jpeg_rgb_ycc_convert_sse2;
+      } else
+#endif
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+      if (simd & JSIMD_MMX) {
+        cconvert->pub.color_convert = jpeg_rgb_ycc_convert_mmx;
+      } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+      {
+        cconvert->pub.start_pass = rgb_ycc_start;
+        cconvert->pub.color_convert = rgb_ycc_convert;
+      }
     } else if (cinfo->in_color_space == JCS_YCbCr)
       cconvert->pub.color_convert = null_convert;
     else
@@ -457,3 +481,28 @@
     break;
   }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+/* Report which SIMD RGB->YCbCr converter is usable: SSE2 is tried first, then MMX. */
+GLOBAL(unsigned int)
+jpeg_simd_color_converter (j_compress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2 &&
+      IS_CONST_ALIGNED_16(jconst_rgb_ycc_convert_sse2))
+    return JSIMD_SSE2;
+#endif
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+  /* No SIMD converter was selected above. */
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jccolss2.asm b/jccolss2.asm
new file mode 100644
index 0000000..1aabd89
--- /dev/null
+++ b/jccolss2.asm
@@ -0,0 +1,541 @@
+;
+; jccolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits: FIX(x) = x * 2^SCALEBITS, rounded
+
+F_0_081	equ	 5329			; FIX(0.08131)
+F_0_114	equ	 7471			; FIX(0.11400)
+F_0_168	equ	11059			; FIX(0.16874)
+F_0_250	equ	16384			; FIX(0.25000)
+F_0_299	equ	19595			; FIX(0.29900)
+F_0_331	equ	21709			; FIX(0.33126)
+F_0_418	equ	27439			; FIX(0.41869)
+F_0_587	equ	38470			; FIX(0.58700); too large for a signed word (> 32767)
+F_0_337	equ	(F_0_587 - F_0_250)	; FIX(0.58700) - FIX(0.25000); this part does fit in a signed word
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337	times 4 dw  F_0_299, F_0_337	; 4 word pairs (16 bytes): FIX(0.299), FIX(0.337)
+PW_F0114_F0250	times 4 dw  F_0_114, F_0_250	; 4 word pairs (16 bytes): FIX(0.114), FIX(0.250)
+PW_MF016_MF033	times 4 dw -F_0_168,-F_0_331	; 4 word pairs (16 bytes): -FIX(0.168), -FIX(0.331)
+PW_MF008_MF041	times 4 dw -F_0_081,-F_0_418	; 4 word pairs (16 bytes): -FIX(0.081), -FIX(0.418)
+PD_ONEHALFM1_CJ	times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)	; 4 dwords: rounding term (one half minus 1) + CENTERJSAMPLE offset
+PD_ONEHALF	times 4 dd  (1 << (SCALEBITS-1))	; 4 dwords: rounding term (one half)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_rgb_ycc_convert_sse2 (j_compress_ptr cinfo,
+;                            JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                            JDIMENSION output_row, int num_rows);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPARRAY input_buf
+%define output_buf(b)	(b)+16		; JSAMPIMAGE output_buf
+%define output_row(b)	(b)+20		; JDIMENSION output_row
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		8
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_rgb_ycc_convert_sse2)
+
+EXTN(jpeg_rgb_ycc_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jcstruct_image_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	esi, JSAMPIMAGE [output_buf(eax)]
+	mov	ecx, JDIMENSION [output_row(eax)]
+	mov	edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+	lea	edi, [edi+ecx*SIZEOF_JSAMPROW]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+	pop	ecx
+
+	mov	esi, JSAMPARRAY [input_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	pushpic	eax
+	push	edx
+	push	ebx
+	push	edi
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr0
+	mov	ebx, JSAMPROW [ebx]	; outptr1
+	mov	edx, JSAMPROW [edx]	; outptr2
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	alignx	16,7
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+	push	eax
+	push	edx
+	lea	ecx,[ecx+ecx*2]		; imul ecx,RGB_PIXELSIZE
+	test	cl, SIZEOF_BYTE
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_BYTE
+	movzx	eax, BYTE [esi+ecx]
+.column_ld2:
+	test	cl, SIZEOF_WORD
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_WORD
+	movzx	edx, WORD [esi+ecx]
+	shl	eax, WORD_BIT
+	or	eax,edx
+.column_ld4:
+	movd	xmmA,eax
+	pop	edx
+	pop	eax
+	test	cl, SIZEOF_DWORD
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_DWORD
+	movd	xmmF, _DWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_DWORD
+	por	xmmA,xmmF
+.column_ld8:
+	test	cl, SIZEOF_MMWORD
+	jz	short .column_ld16
+	sub	ecx, byte SIZEOF_MMWORD
+	movq	xmmB, _MMWORD [esi+ecx]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmB
+.column_ld16:
+	test	cl, SIZEOF_XMMWORD
+	jz	short .column_ld32
+	movdqa	xmmF,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	mov	ecx, SIZEOF_XMMWORD
+	jmp	short .rgb_ycc_cnv
+.column_ld32:
+	test	cl, 2*SIZEOF_XMMWORD
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmB,xmmA
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	movdqa    xmmG,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+	movdqa    xmmD,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+	movdqa    xmmE,xmmA
+	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+	pxor      xmmH,xmmH
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmB,xmmE
+	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+	movdqa    xmmF,xmmD
+	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+	test	cl, SIZEOF_XMMWORD/16
+	jz	short .column_ld2
+	sub	ecx, byte SIZEOF_XMMWORD/16
+	movd	xmmA, _DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+	test	cl, SIZEOF_XMMWORD/8
+	jz	short .column_ld4
+	sub	ecx, byte SIZEOF_XMMWORD/8
+	movq	xmmE, _MMWORD [esi+ecx*RGB_PIXELSIZE]
+	pslldq	xmmA, SIZEOF_MMWORD
+	por	xmmA,xmmE
+.column_ld4:
+	test	cl, SIZEOF_XMMWORD/4
+	jz	short .column_ld8
+	sub	ecx, byte SIZEOF_XMMWORD/4
+	movdqa	xmmE,xmmA
+	movdqu	xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+	test	cl, SIZEOF_XMMWORD/2
+	mov	ecx, SIZEOF_XMMWORD
+	jz	short .rgb_ycc_cnv
+	movdqa	xmmF,xmmA
+	movdqa	xmmH,xmmE
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	jmp	short .rgb_ycc_cnv
+	alignx	16,7
+
+.columnloop:
+	movdqu	xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqu	xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+	movdqu	xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+	movdqu	xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+	movdqa    xmmC,xmmF
+	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+	movdqa    xmmB,xmmA
+	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+	movdqa    xmmG,xmmD
+	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+	movdqa    xmmE,xmmA
+	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+	movdqa    xmmH,xmmB
+	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+	pxor      xmmF,xmmF
+
+	movdqa    xmmC,xmmA
+	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
+	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+	movdqa    xmmD,xmmB
+	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
+	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+	movdqa    xmmG,xmmE
+	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
+	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+	punpcklbw xmmF,xmmH
+	punpckhbw xmmH,xmmH
+	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
+	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+	; (Original)
+	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+	;
+	; (This implementation)
+	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+	; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+	; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=RE
+	movdqa    XMMWORD [wk(1)], xmm1	; wk(1)=RO
+	movdqa    XMMWORD [wk(2)], xmm4	; wk(2)=BE
+	movdqa    XMMWORD [wk(3)], xmm5	; wk(3)=BO
+
+	movdqa    xmm6,xmm1
+	punpcklwd xmm1,xmm3
+	punpckhwd xmm6,xmm3
+	movdqa    xmm7,xmm1
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(4)], xmm1	; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+	movdqa    XMMWORD [wk(5)], xmm6	; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+	pxor      xmm1,xmm1
+	pxor      xmm6,xmm6
+	punpcklwd xmm1,xmm5		; xmm1=BOL
+	punpckhwd xmm6,xmm5		; xmm6=BOH
+	psrld     xmm1,1		; xmm1=BOL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BOH*FIX(0.500)
+
+	movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm1
+	paddd     xmm4,xmm6
+	paddd     xmm7,xmm5
+	paddd     xmm4,xmm5
+	psrld     xmm7,SCALEBITS	; xmm7=CbOL
+	psrld     xmm4,SCALEBITS	; xmm4=CbOH
+	packssdw  xmm7,xmm4		; xmm7=CbO
+
+	movdqa    xmm1, XMMWORD [wk(2)]	; xmm1=BE
+
+	movdqa    xmm6,xmm0
+	punpcklwd xmm0,xmm2
+	punpckhwd xmm6,xmm2
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm6
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+	movdqa    XMMWORD [wk(6)], xmm0	; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+	movdqa    XMMWORD [wk(7)], xmm6	; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+	pxor      xmm0,xmm0
+	pxor      xmm6,xmm6
+	punpcklwd xmm0,xmm1		; xmm0=BEL
+	punpckhwd xmm6,xmm1		; xmm6=BEH
+	psrld     xmm0,1		; xmm0=BEL*FIX(0.500)
+	psrld     xmm6,1		; xmm6=BEH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm5,xmm0
+	paddd     xmm4,xmm6
+	paddd     xmm5,xmm1
+	paddd     xmm4,xmm1
+	psrld     xmm5,SCALEBITS	; xmm5=CbEL
+	psrld     xmm4,SCALEBITS	; xmm4=CbEH
+	packssdw  xmm5,xmm4		; xmm5=CbE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm5,xmm7		; xmm5=Cb
+	movdqa    XMMWORD [ebx], xmm5	; Save Cb
+
+	movdqa    xmm0, XMMWORD [wk(3)]	; xmm0=BO
+	movdqa    xmm6, XMMWORD [wk(2)]	; xmm6=BE
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=RO
+
+	movdqa    xmm4,xmm0
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm4,xmm3
+	movdqa    xmm7,xmm0
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+	movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)]	; xmm3=[PD_ONEHALF]
+
+	paddd     xmm0, XMMWORD [wk(4)]
+	paddd     xmm4, XMMWORD [wk(5)]
+	paddd     xmm0,xmm3
+	paddd     xmm4,xmm3
+	psrld     xmm0,SCALEBITS	; xmm0=YOL
+	psrld     xmm4,SCALEBITS	; xmm4=YOH
+	packssdw  xmm0,xmm4		; xmm0=YO
+
+	pxor      xmm3,xmm3
+	pxor      xmm4,xmm4
+	punpcklwd xmm3,xmm1		; xmm3=ROL
+	punpckhwd xmm4,xmm1		; xmm4=ROH
+	psrld     xmm3,1		; xmm3=ROL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=ROH*FIX(0.500)
+
+	movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm7,xmm3
+	paddd     xmm5,xmm4
+	paddd     xmm7,xmm1
+	paddd     xmm5,xmm1
+	psrld     xmm7,SCALEBITS	; xmm7=CrOL
+	psrld     xmm5,SCALEBITS	; xmm5=CrOH
+	packssdw  xmm7,xmm5		; xmm7=CrO
+
+	movdqa    xmm3, XMMWORD [wk(0)]	; xmm3=RE
+
+	movdqa    xmm4,xmm6
+	punpcklwd xmm6,xmm2
+	punpckhwd xmm4,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm5,xmm4
+	pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+	pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+	pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+	movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)]	; xmm2=[PD_ONEHALF]
+
+	paddd     xmm6, XMMWORD [wk(6)]
+	paddd     xmm4, XMMWORD [wk(7)]
+	paddd     xmm6,xmm2
+	paddd     xmm4,xmm2
+	psrld     xmm6,SCALEBITS	; xmm6=YEL
+	psrld     xmm4,SCALEBITS	; xmm4=YEH
+	packssdw  xmm6,xmm4		; xmm6=YE
+
+	psllw     xmm0,BYTE_BIT
+	por       xmm6,xmm0		; xmm6=Y
+	movdqa    XMMWORD [edi], xmm6	; Save Y
+
+	pxor      xmm2,xmm2
+	pxor      xmm4,xmm4
+	punpcklwd xmm2,xmm3		; xmm2=REL
+	punpckhwd xmm4,xmm3		; xmm4=REH
+	psrld     xmm2,1		; xmm2=REL*FIX(0.500)
+	psrld     xmm4,1		; xmm4=REH*FIX(0.500)
+
+	movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
+
+	paddd     xmm1,xmm2
+	paddd     xmm5,xmm4
+	paddd     xmm1,xmm0
+	paddd     xmm5,xmm0
+	psrld     xmm1,SCALEBITS	; xmm1=CrEL
+	psrld     xmm5,SCALEBITS	; xmm5=CrEH
+	packssdw  xmm1,xmm5		; xmm1=CrE
+
+	psllw     xmm7,BYTE_BIT
+	por       xmm1,xmm7		; xmm1=Cr
+	movdqa    XMMWORD [edx], xmm1	; Save Cr
+
+	sub	ecx, byte SIZEOF_XMMWORD
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
+	add	edi, byte SIZEOF_XMMWORD		; outptr0
+	add	ebx, byte SIZEOF_XMMWORD		; outptr1
+	add	edx, byte SIZEOF_XMMWORD		; outptr2
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .column_ld1
+
+	pop	ecx			; col
+	pop	esi
+	pop	edi
+	pop	ebx
+	pop	edx
+	poppic	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_buf
+	add	edi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JCCOLOR_RGBYCC_SSE2_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 61fa79b..3a89eb4 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : December 24, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains the forward-DCT management logic.
  * This code selects a particular DCT implementation to be used,
  * and it performs related housekeeping chores including coefficient
@@ -24,6 +31,8 @@
 
   /* Pointer to the DCT routine actually in use */
   forward_DCT_method_ptr do_dct;
+  convsamp_int_method_ptr convsamp;
+  quantize_int_method_ptr quantize;
 
   /* The actual post-DCT divisors --- not identical to the quant table
    * entries, because of scaling (especially for an unnormalized DCT).
@@ -34,12 +43,75 @@
 #ifdef DCT_FLOAT_SUPPORTED
   /* Same as above for the floating-point case. */
   float_DCT_method_ptr do_float_dct;
+  convsamp_float_method_ptr float_convsamp;
+  quantize_float_method_ptr float_quantize;
   FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 #endif
 } my_fdct_controller;
 
 typedef my_fdct_controller * my_fdct_ptr;
 
+/*
+ * SIMD Ext: Most SSE/SSE2 instructions require their memory operand to be
+ * aligned to a 16-byte boundary; if not, a general-protection exception
+ * (#GP) is generated.  ALIGN_MEM rounds p up to a multiple of a (a = 2^k).
+ */
+
+#define ALIGN_SIZE	16		/* sizeof SSE/SSE2 register */
+#define ALIGN_MEM(p,a)	((void *) (((size_t) (p) + (a) - 1) & -(a)))
+
+#ifdef JFDCT_INT_QUANTIZE_WITH_DIVISION
+#undef jpeg_quantize_int
+#undef jpeg_quantize_int_mmx
+#undef jpeg_quantize_int_sse2
+#define jpeg_quantize_int       jpeg_quantize_idiv
+#define jpeg_quantize_int_mmx   jpeg_quantize_idiv
+#define jpeg_quantize_int_sse2  jpeg_quantize_idiv
+#endif
+
+
+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+/*
+ * SIMD Ext: compute the reciprocal of the divisor (low 16 bits only);
+ * stores four values into dtbl[] at a stride of DCTSIZE2 elements.
+ * This implementation is based on an algorithm described in
+ *   "How to optimize for the Pentium family of microprocessors"
+ *   (http://www.agner.org/assem/).
+ */
+
+LOCAL(void)
+compute_reciprocal (DCTELEM divisor, DCTELEM * dtbl)
+{
+  unsigned long d = ((unsigned long) divisor) & 0x0000FFFF;	/* low 16 bits */
+  unsigned long fq, fr;
+  int b, r, c;
+
+  for (b = 0; (1UL << b) <= d; b++) ;	/* b = bit length of d */
+
+  r  = 16 + (--b);		/* b is now floor(log2(d)) for d > 0 */
+  fq = (1UL << r) / d;	/* NOTE(review): divides by zero if d == 0 */
+  fr = (1UL << r) % d;	/* remainder of the same division */
+  r -= 16;			/* back to floor(log2(d)) */
+  c  = 0;
+
+  if (fr == 0) {		/* division was exact */
+    fq >>= 1;
+    r--;
+  } else if (fr <= (d / 2)) {	/* remainder at most d/2: keep fq, c = 1 */
+    c++;
+  } else {			/* remainder above d/2: round fq up */
+    fq++;
+  }
+
+  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;		/* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM) (c + (d / 2));	/* correction + roundfactor */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (16 - (r + 1 + 1)));	/* scale */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM) (r + 1);			/* shift */
+}
+
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
+
 
 /*
  * Initialize for a processing pass.
@@ -75,6 +147,18 @@
       /* For LL&M IDCT method, divisors are equal to raw quantization
        * coefficients multiplied by 8 (to counteract scaling).
        */
+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+      if (fdct->divisors[qtblno] == NULL) {
+	fdct->divisors[qtblno] = (DCTELEM *)
+	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
+      }
+      dtbl = fdct->divisors[qtblno];
+      for (i = 0; i < DCTSIZE2; i++) {
+	compute_reciprocal ((DCTELEM) (qtbl->quantval[i] << 3), &dtbl[i]);
+      }
+      break;
+#else  /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
       if (fdct->divisors[qtblno] == NULL) {
 	fdct->divisors[qtblno] = (DCTELEM *)
 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -85,7 +169,8 @@
 	dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
       }
       break;
-#endif
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
     case JDCT_IFAST:
       {
@@ -109,6 +194,21 @@
 	};
 	SHIFT_TEMPS
 
+#ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+	if (fdct->divisors[qtblno] == NULL) {
+	  fdct->divisors[qtblno] = (DCTELEM *)
+	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
+	}
+	dtbl = fdct->divisors[qtblno];
+	for (i = 0; i < DCTSIZE2; i++) {
+	  compute_reciprocal ((DCTELEM)
+			       DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
+						     (INT32) aanscales[i]),
+				       CONST_BITS-3),
+			      &dtbl[i]);
+	}
+#else  /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
 	if (fdct->divisors[qtblno] == NULL) {
 	  fdct->divisors[qtblno] = (DCTELEM *)
 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -121,9 +221,10 @@
 				  (INT32) aanscales[i]),
 		    CONST_BITS-3);
 	}
+#endif /* JFDCT_INT_QUANTIZE_WITH_DIVISION */
       }
       break;
-#endif
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
     case JDCT_FLOAT:
       {
@@ -183,83 +284,23 @@
 	     JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
-  /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
   DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
-  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
+  DCTELEM workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(DCTELEM)];
+  DCTELEM * wkptr = (DCTELEM *) ALIGN_MEM(workspace, ALIGN_SIZE);
   JDIMENSION bi;
 
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	*workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-	  }
-	}
-#endif
-      }
-    }
+    (*fdct->convsamp) (sample_data, start_col, wkptr);
 
     /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*fdct->do_dct) (wkptr);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register DCTELEM temp, qval;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	qval = divisors[i];
-	temp = workspace[i];
-	/* Divide the coefficient value by qval, ensuring proper rounding.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 *
-	 * In most files, at least half of the output values will be zero
-	 * (at default quantization settings, more like three-quarters...)
-	 * so we should ensure that this case is fast.  On many machines,
-	 * a comparison is enough cheaper than a divide to make a special test
-	 * a win.  Since both inputs will be nonnegative, we need only test
-	 * for a < b to discover whether a/b is 0.
-	 * If your machine's division is fast enough, define FAST_DIVIDE.
-	 */
-#ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)	a /= b
-#else
-#define DIVIDE_BY(a,b)	if (a >= b) a /= b; else a = 0
-#endif
-	if (temp < 0) {
-	  temp = -temp;
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	  temp = -temp;
-	} else {
-	  temp += qval>>1;	/* for rounding */
-	  DIVIDE_BY(temp, qval);
-	}
-	output_ptr[i] = (JCOEF) temp;
-      }
-    }
+    (*fdct->quantize) (coef_blocks[bi], divisors, wkptr);
   }
 }
 
@@ -273,64 +314,23 @@
 		   JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
-  /* This routine is heavily used, so it's worth coding it tightly. */
   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
   FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
-  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
+  FAST_FLOAT workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(FAST_FLOAT)];
+  FAST_FLOAT * wkptr = (FAST_FLOAT *) ALIGN_MEM(workspace, ALIGN_SIZE);
   JDIMENSION bi;
 
   sample_data += start_row;	/* fold in the vertical offset once */
 
   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
     /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-	elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	*workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-	{ register int elemc;
-	  for (elemc = DCTSIZE; elemc > 0; elemc--) {
-	    *workspaceptr++ = (FAST_FLOAT)
-	      (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-	  }
-	}
-#endif
-      }
-    }
+    (*fdct->float_convsamp) (sample_data, start_col, wkptr);
 
     /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*fdct->do_float_dct) (wkptr);
 
     /* Quantize/descale the coefficients, and store into coef_blocks[] */
-    { register FAST_FLOAT temp;
-      register int i;
-      register JCOEFPTR output_ptr = coef_blocks[bi];
-
-      for (i = 0; i < DCTSIZE2; i++) {
-	/* Apply the quantization and scaling factor */
-	temp = workspace[i] * divisors[i];
-	/* Round to nearest integer.
-	 * Since C does not specify the direction of rounding for negative
-	 * quotients, we have to force the dividend positive for portability.
-	 * The maximum coefficient size is +-16K (for 12-bit data), so this
-	 * code should work for either 16-bit or 32-bit ints.
-	 */
-	output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
-      }
-    }
+    (*fdct->float_quantize) (coef_blocks[bi], divisors, wkptr);
   }
 }
 
@@ -346,6 +346,7 @@
 {
   my_fdct_ptr fdct;
   int i;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   fdct = (my_fdct_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -357,21 +358,86 @@
 #ifdef DCT_ISLOW_SUPPORTED
   case JDCT_ISLOW:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
-    break;
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_islow_sse2)) {
+      fdct->do_dct = jpeg_fdct_islow_sse2;
+      fdct->convsamp = jpeg_convsamp_int_sse2;
+      fdct->quantize = jpeg_quantize_int_sse2;
+    } else
 #endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      fdct->do_dct = jpeg_fdct_islow_mmx;
+      fdct->convsamp = jpeg_convsamp_int_mmx;
+      fdct->quantize = jpeg_quantize_int_mmx;
+    } else
+#endif
+    {
+      fdct->do_dct = jpeg_fdct_islow;
+      fdct->convsamp = jpeg_convsamp_int;
+      fdct->quantize = jpeg_quantize_int;
+    }
+    break;
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
   case JDCT_IFAST:
     fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
-    break;
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_ifast_sse2)) {
+      fdct->do_dct = jpeg_fdct_ifast_sse2;
+      fdct->convsamp = jpeg_convsamp_int_sse2;
+      fdct->quantize = jpeg_quantize_int_sse2;
+    } else
 #endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      fdct->do_dct = jpeg_fdct_ifast_mmx;
+      fdct->convsamp = jpeg_convsamp_int_mmx;
+      fdct->quantize = jpeg_quantize_int_mmx;
+    } else
+#endif
+    {
+      fdct->do_dct = jpeg_fdct_ifast;
+      fdct->convsamp = jpeg_convsamp_int;
+      fdct->quantize = jpeg_quantize_int;
+    }
+    break;
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
   case JDCT_FLOAT:
     fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
-    break;
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse)) {
+      fdct->do_float_dct = jpeg_fdct_float_sse;
+      fdct->float_convsamp = jpeg_convsamp_flt_sse2;
+      fdct->float_quantize = jpeg_quantize_flt_sse2;
+    } else
 #endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse)) {
+      fdct->do_float_dct = jpeg_fdct_float_sse;
+      fdct->float_convsamp = jpeg_convsamp_flt_sse;
+      fdct->float_quantize = jpeg_quantize_flt_sse;
+    } else
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW) {
+      fdct->do_float_dct = jpeg_fdct_float_3dnow;
+      fdct->float_convsamp = jpeg_convsamp_flt_3dnow;
+      fdct->float_quantize = jpeg_quantize_flt_3dnow;
+    } else
+#endif
+    {
+      fdct->do_float_dct = jpeg_fdct_float;
+      fdct->float_convsamp = jpeg_convsamp_float;
+      fdct->float_quantize = jpeg_quantize_float;
+    }
+    break;
+#endif /* DCT_FLOAT_SUPPORTED */
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
     break;
@@ -385,3 +451,65 @@
 #endif
   }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+/* Report the JSIMD_* flag of the SIMD forward DCT usable for `method'. */
+GLOBAL(unsigned int)
+jpeg_simd_forward_dct (j_compress_ptr cinfo, int method)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);	/* JSIMD_* bits */
+
+  switch (method) {
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_islow_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;		/* no SIMD variant applies */
+#endif /* DCT_ISLOW_SUPPORTED */
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_ifast_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;		/* no SIMD variant applies */
+#endif /* DCT_IFAST_SUPPORTED */
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_SSE2); */
+#endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_fdct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_MMX); */
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW)
+      return JSIMD_3DNOW;	/* (JSIMD_3DNOW | JSIMD_MMX); */
+#endif
+    return JSIMD_NONE;		/* no SIMD variant applies */
+#endif /* DCT_FLOAT_SUPPORTED */
+  default:
+    ;
+  }
+
+  return JSIMD_NONE;	/* not compiled */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jcolsamp.h b/jcolsamp.h
new file mode 100644
index 0000000..2a27b53
--- /dev/null
+++ b/jcolsamp.h
@@ -0,0 +1,143 @@
+/*
+ * jcolsamp.h - private declarations for color conversion & up/downsampling
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * Last Modified : February 4, 2006
+ *
+ * [TAB8]
+ */
+
+
+/* configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+ * valid setting on this SIMD extension.
+ */
+#if BITS_IN_JSAMPLE != 8
+#error "Sorry, this SIMD code only copes with 8-bit sample values."
+#endif
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_rgb_ycc_convert_mmx	jMRgbYccCnv	/* jccolmmx.asm */
+#define jpeg_rgb_ycc_convert_sse2	jSRgbYccCnv	/* jccolss2.asm */
+#define jpeg_h2v1_downsample_mmx	jM21Downsample	/* jcsammmx.asm */
+#define jpeg_h2v2_downsample_mmx	jM22Downsample	/* jcsammmx.asm */
+#define jpeg_h2v1_downsample_sse2	jS21Downsample	/* jcsamss2.asm */
+#define jpeg_h2v2_downsample_sse2	jS22Downsample	/* jcsamss2.asm */
+#define jpeg_ycc_rgb_convert_mmx	jMYccRgbCnv	/* jdcolmmx.asm */
+#define jpeg_ycc_rgb_convert_sse2	jSYccRgbCnv	/* jdcolss2.asm */
+#define jpeg_h2v1_merged_upsample_mmx	jM21MerUpsample	/* jdmermmx.asm */
+#define jpeg_h2v2_merged_upsample_mmx	jM22MerUpsample	/* jdmermmx.asm */
+#define jpeg_h2v1_merged_upsample_sse2	jS21MerUpsample	/* jdmerss2.asm */
+#define jpeg_h2v2_merged_upsample_sse2	jS22MerUpsample	/* jdmerss2.asm */
+#define jpeg_h2v1_fancy_upsample_mmx	jM21FanUpsample	/* jdsammmx.asm */
+#define jpeg_h2v2_fancy_upsample_mmx	jM22FanUpsample	/* jdsammmx.asm */
+#define jpeg_h1v2_fancy_upsample_mmx	jM12FanUpsample	/* jdsammmx.asm */
+#define jpeg_h2v1_upsample_mmx		jM21Upsample	/* jdsammmx.asm */
+#define jpeg_h2v2_upsample_mmx		jM22Upsample	/* jdsammmx.asm */
+#define jpeg_h2v1_fancy_upsample_sse2	jS21FanUpsample	/* jdsamss2.asm */
+#define jpeg_h2v2_fancy_upsample_sse2	jS22FanUpsample	/* jdsamss2.asm */
+#define jpeg_h1v2_fancy_upsample_sse2	jS12FanUpsample	/* jdsamss2.asm */
+#define jpeg_h2v1_upsample_sse2		jS21Upsample	/* jdsamss2.asm */
+#define jpeg_h2v2_upsample_sse2		jS22Upsample	/* jdsamss2.asm */
+#define jconst_rgb_ycc_convert_mmx	jMCRgbYccCnv	/* jccolmmx.asm */
+#define jconst_rgb_ycc_convert_sse2	jSCRgbYccCnv	/* jccolss2.asm */
+#define jconst_ycc_rgb_convert_mmx	jMCYccRgbCnv	/* jdcolmmx.asm */
+#define jconst_ycc_rgb_convert_sse2	jSCYccRgbCnv	/* jdcolss2.asm */
+#define jconst_merged_upsample_mmx	jMCMerUpsample	/* jdmermmx.asm */
+#define jconst_merged_upsample_sse2	jSCMerUpsample	/* jdmerss2.asm */
+#define jconst_fancy_upsample_mmx	jMCFanUpsample	/* jdsammmx.asm */
+#define jconst_fancy_upsample_sse2	jSCFanUpsample	/* jdsamss2.asm */
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+#define jpeg_simd_merged_upsampler	jSiMUpsampler	/* jdmerge.c    */
+#endif
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* Extern declarations for color conversion & up/downsampling routines. */
+
+EXTERN(void) jpeg_rgb_ycc_convert_mmx		/* jccolmmx.asm */
+    JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+	 JDIMENSION output_row, int num_rows));
+EXTERN(void) jpeg_rgb_ycc_convert_sse2		/* jccolss2.asm */
+    JPP((j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+	 JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jpeg_h2v1_downsample_mmx		/* jcsammmx.asm */
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v2_downsample_mmx		/* jcsammmx.asm */
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v1_downsample_sse2		/* jcsamss2.asm */
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jpeg_h2v2_downsample_sse2		/* jcsamss2.asm */
+    JPP((j_compress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jpeg_ycc_rgb_convert_mmx		/* jdcolmmx.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+	 JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jpeg_ycc_rgb_convert_sse2		/* jdcolss2.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
+	 JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(void) jpeg_h2v1_merged_upsample_mmx	/* jdmermmx.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v2_merged_upsample_mmx	/* jdmermmx.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v1_merged_upsample_sse2	/* jdmerss2.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jpeg_h2v2_merged_upsample_sse2	/* jdmerss2.asm */
+    JPP((j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+	 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jpeg_h2v1_fancy_upsample_mmx	/* jdsammmx.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_fancy_upsample_mmx	/* jdsammmx.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h1v2_fancy_upsample_mmx	/* jdsammmx.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_upsample_mmx		/* jdsammmx.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_upsample_mmx		/* jdsammmx.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_fancy_upsample_sse2	/* jdsamss2.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_fancy_upsample_sse2	/* jdsamss2.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h1v2_fancy_upsample_sse2	/* jdsamss2.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v1_upsample_sse2		/* jdsamss2.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jpeg_h2v2_upsample_sse2		/* jdsamss2.asm */
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_rgb_ycc_convert_mmx[];	/* jccolmmx.asm */
+extern const int jconst_rgb_ycc_convert_sse2[];	/* jccolss2.asm */
+extern const int jconst_ycc_rgb_convert_mmx[];	/* jdcolmmx.asm */
+extern const int jconst_ycc_rgb_convert_sse2[];	/* jdcolss2.asm */
+extern const int jconst_merged_upsample_mmx[];	/* jdmermmx.asm */
+extern const int jconst_merged_upsample_sse2[];	/* jdmerss2.asm */
+extern const int jconst_fancy_upsample_mmx[];	/* jdsammmx.asm */
+extern const int jconst_fancy_upsample_sse2[];	/* jdsamss2.asm */
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_merged_upsampler JPP((j_decompress_ptr cinfo));
+#endif
diff --git a/jcolsamp.inc b/jcolsamp.inc
new file mode 100644
index 0000000..03f5dbd
--- /dev/null
+++ b/jcolsamp.inc
@@ -0,0 +1,156 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; Last Modified : January 5, 2006
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+;
+; configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+; valid setting on this SIMD extension.
+;
+%if BITS_IN_JSAMPLE != 8
+%error "Sorry, this SIMD code only copes with 8-bit sample values."
+%endif
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_rgb_ycc_convert_mmx	jMRgbYccCnv	; jccolmmx.asm
+%define jpeg_rgb_ycc_convert_sse2	jSRgbYccCnv	; jccolss2.asm
+%define jpeg_h2v1_downsample_mmx	jM21Downsample	; jcsammmx.asm
+%define jpeg_h2v2_downsample_mmx	jM22Downsample	; jcsammmx.asm
+%define jpeg_h2v1_downsample_sse2	jS21Downsample	; jcsamss2.asm
+%define jpeg_h2v2_downsample_sse2	jS22Downsample	; jcsamss2.asm
+%define jpeg_ycc_rgb_convert_mmx	jMYccRgbCnv	; jdcolmmx.asm
+%define jpeg_ycc_rgb_convert_sse2	jSYccRgbCnv	; jdcolss2.asm
+%define jpeg_h2v1_merged_upsample_mmx	jM21MerUpsample	; jdmermmx.asm
+%define jpeg_h2v2_merged_upsample_mmx	jM22MerUpsample	; jdmermmx.asm
+%define jpeg_h2v1_merged_upsample_sse2	jS21MerUpsample	; jdmerss2.asm
+%define jpeg_h2v2_merged_upsample_sse2	jS22MerUpsample	; jdmerss2.asm
+%define jpeg_h2v1_fancy_upsample_mmx	jM21FanUpsample	; jdsammmx.asm
+%define jpeg_h2v2_fancy_upsample_mmx	jM22FanUpsample	; jdsammmx.asm
+%define jpeg_h1v2_fancy_upsample_mmx	jM12FanUpsample	; jdsammmx.asm
+%define jpeg_h2v1_upsample_mmx		jM21Upsample	; jdsammmx.asm
+%define jpeg_h2v2_upsample_mmx		jM22Upsample	; jdsammmx.asm
+%define jpeg_h2v1_fancy_upsample_sse2	jS21FanUpsample	; jdsamss2.asm
+%define jpeg_h2v2_fancy_upsample_sse2	jS22FanUpsample	; jdsamss2.asm
+%define jpeg_h1v2_fancy_upsample_sse2	jS12FanUpsample	; jdsamss2.asm
+%define jpeg_h2v1_upsample_sse2		jS21Upsample	; jdsamss2.asm
+%define jpeg_h2v2_upsample_sse2		jS22Upsample	; jdsamss2.asm
+%define jconst_rgb_ycc_convert_mmx	jMCRgbYccCnv	; jccolmmx.asm
+%define jconst_rgb_ycc_convert_sse2	jSCRgbYccCnv	; jccolss2.asm
+%define jconst_ycc_rgb_convert_mmx	jMCYccRgbCnv	; jdcolmmx.asm
+%define jconst_ycc_rgb_convert_sse2	jSCYccRgbCnv	; jdcolss2.asm
+%define jconst_merged_upsample_mmx	jMCMerUpsample	; jdmermmx.asm
+%define jconst_merged_upsample_sse2	jSCMerUpsample	; jdmerss2.asm
+%define jconst_fancy_upsample_mmx	jMCFanUpsample	; jdsammmx.asm
+%define jconst_fancy_upsample_sse2	jSCFanUpsample	; jdsamss2.asm
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+;
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%if RGB_RED < 0 || RGB_RED >= RGB_PIXELSIZE || RGB_GREEN < 0 || \
+   RGB_GREEN >= RGB_PIXELSIZE || RGB_BLUE < 0 || RGB_BLUE >= RGB_PIXELSIZE || \
+   RGB_RED == RGB_GREEN || RGB_GREEN == RGB_BLUE || RGB_RED == RGB_BLUE
+%error "Incorrect RGB pixel offset."
+%endif
+
+%if RGB_RED == 0
+%define  mmA  mm0
+%define  mmB  mm1
+%define xmmA xmm0
+%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define  mmA  mm2
+%define  mmB  mm3
+%define xmmA xmm2
+%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define  mmA  mm4
+%define  mmB  mm5
+%define xmmA xmm4
+%define xmmB xmm5
+%else
+%define  mmA  mm6
+%define  mmB  mm7
+%define xmmA xmm6
+%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1
+%define  mmC  mm0
+%define  mmD  mm1
+%define xmmC xmm0
+%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define  mmC  mm2
+%define  mmD  mm3
+%define xmmC xmm2
+%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define  mmC  mm4
+%define  mmD  mm5
+%define xmmC xmm4
+%define xmmD xmm5
+%else
+%define  mmC  mm6
+%define  mmD  mm7
+%define xmmC xmm6
+%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2
+%define  mmE  mm0
+%define  mmF  mm1
+%define xmmE xmm0
+%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define  mmE  mm2
+%define  mmF  mm3
+%define xmmE xmm2
+%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define  mmE  mm4
+%define  mmF  mm5
+%define xmmE xmm4
+%define xmmF xmm5
+%else
+%define  mmE  mm6
+%define  mmF  mm7
+%define xmmE xmm6
+%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3
+%define  mmG  mm0
+%define  mmH  mm1
+%define xmmG xmm0
+%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define  mmG  mm2
+%define  mmH  mm3
+%define xmmG xmm2
+%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define  mmG  mm4
+%define  mmH  mm5
+%define xmmG xmm4
+%define xmmH xmm5
+%else
+%define  mmG  mm6
+%define  mmH  mm7
+%define xmmG xmm6
+%define xmmH xmm7
+%endif
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+
+; --------------------------------------------------------------------------
diff --git a/jcomapi.c b/jcomapi.c
index 9b1fa75..e4235c0 100644
--- a/jcomapi.c
+++ b/jcomapi.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 11, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains application interface routines that are used for both
  * compression and decompression.
  */
@@ -104,3 +111,54 @@
   tbl->sent_table = FALSE;	/* make sure this is false in any new table */
   return tbl;
 }
+
+
+/*
+ * SIMD Ext: Return the flag word of supported SIMD instruction sets.
+ * The word is computed once and cached; flags masked for cinfo are cleared. */
+
+GLOBAL(unsigned int)
+jpeg_simd_support (j_common_ptr cinfo)
+{
+  enum { JSIMD_INVALID = ~0 };	/* sentinel: cache not filled yet */
+  static volatile unsigned int simd_supported = JSIMD_INVALID;	/* cache */
+
+  if (simd_supported == JSIMD_INVALID)	/* first call: fill the cache */
+    simd_supported = jpeg_simd_os_support(jpeg_simd_cpu_support());
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+  if (cinfo != NULL)	/* Turn off the masked flags */
+    return simd_supported & ~jpeg_simd_mask(cinfo, JSIMD_NONE, JSIMD_NONE);
+#endif
+  return simd_supported;	/* no object given (or no mask support) */
+}
+
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+
+/* SIMD Ext: modify/retrieve the SIMD instruction mask kept in the low word of
+ * cinfo's gamma field (valid only while the high word is that of +1.0/+0.0).
+ * Clears 'remove', sets 'add'; returns the old mask, or 0 if none is stored. */
+
+GLOBAL(unsigned int)
+jpeg_simd_mask (j_common_ptr cinfo, unsigned int remove, unsigned int add)
+{
+  double *gamma;	/* the field that carries the mask bits */
+  unsigned int w[2], oldmask = 0;	/* w: gamma as 2 words (x86 order); 0=error */
+
+  if (cinfo->is_decompressor)
+    gamma = &((j_decompress_ptr) cinfo)->output_gamma;
+  else	/* compressor */
+    gamma = &((j_compress_ptr) cinfo)->input_gamma;
+
+  MEMCOPY(w, gamma, SIZEOF(w));	/* copy: aliasing a double as long is UB */
+  if ((w[1] == 0x3FF00000 || w[1] == 0x00000000) &&	/* +1.0 or +0.0 */
+      (w[0] & ~JSIMD_ALL) == 0) {
+    oldmask = w[0];
+    if (((remove | add) & ~JSIMD_ALL) == 0)	/* reject unknown flag bits */
+      w[0] = (oldmask & ~remove) | add;
+    MEMCOPY(gamma, w, SIZEOF(w));	/* store back (unchanged if rejected) */
+  }
+  return oldmask;
+}
+
+#endif /* !JSIMD_MASKFUNC_NOT_SUPPORTED */
diff --git a/jconfig.bc5 b/jconfig.bc5
new file mode 100644
index 0000000..50c309d
--- /dev/null
+++ b/jconfig.bc5
@@ -0,0 +1,48 @@
+/* jconfig.bc5 --- jconfig.h for Borland C++ Compiler 5.5 (win32) */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS	/* we presume a 32-bit flat memory model */
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN	/* this assumes you have -w-stu in CFLAGS */
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#define TYPEDEF_UCHAR_BOOLEAN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#define TWO_FILE_COMMANDLINE
+#define USE_SETMODE		/* Borland has setmode() */
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.cfg b/jconfig.cfg
index 36a04fa..147cb6b 100644
--- a/jconfig.cfg
+++ b/jconfig.cfg
@@ -16,6 +16,9 @@
 /* Define this if you get warnings about undefined structures. */
 #undef INCOMPLETE_TYPES_BROKEN
 
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#undef TYPEDEF_UCHAR_BOOLEAN
+
 #ifdef JPEG_INTERNALS
 
 #undef RIGHT_SHIFT_IS_UNSIGNED
@@ -26,6 +29,13 @@
 
 #endif /* JPEG_INTERNALS */
 
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG
 
 #define BMP_SUPPORTED		/* BMP image file format */
@@ -35,6 +45,8 @@
 #define TARGA_SUPPORTED		/* Targa image file format */
 
 #undef TWO_FILE_COMMANDLINE
+#undef USE_SETMODE
+#undef USE_FDOPEN
 #undef NEED_SIGNAL_CATCHER
 #undef DONT_USE_B_MODE
 
diff --git a/jconfig.dj b/jconfig.dj
index f759a9d..b5a2e47 100644
--- a/jconfig.dj
+++ b/jconfig.dj
@@ -21,6 +21,13 @@
 
 #endif /* JPEG_INTERNALS */
 
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG
 
 #define BMP_SUPPORTED		/* BMP image file format */
@@ -35,4 +42,6 @@
 #undef DONT_USE_B_MODE
 #undef PROGRESS_REPORT		/* optional */
 
+#define FREE_MEM_ESTIMATE	0	/* for alternate cjpeg/djpeg */
+
 #endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.linux b/jconfig.linux
new file mode 100644
index 0000000..6c38ed5
--- /dev/null
+++ b/jconfig.linux
@@ -0,0 +1,44 @@
+/* jconfig.linux --- jconfig.h for Linux ELF with gcc */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#undef TWO_FILE_COMMANDLINE
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.mgw b/jconfig.mgw
new file mode 100644
index 0000000..83dfe1d
--- /dev/null
+++ b/jconfig.mgw
@@ -0,0 +1,48 @@
+/* jconfig.mgw --- jconfig.h for MinGW */
+/* see jconfig.doc for explanations */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */
+/* #define const */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#define TYPEDEF_UCHAR_BOOLEAN
+
+#ifdef JPEG_INTERNALS
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
+#ifdef JPEG_CJPEG_DJPEG
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#define TWO_FILE_COMMANDLINE	/* optional */
+#define USE_SETMODE		/* MinGW has setmode() */
+#undef NEED_SIGNAL_CATCHER	/* Define this if you use jmemname.c */
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional */
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/jconfig.vc b/jconfig.vc
index 7e291c7..d5bc9f9 100644
--- a/jconfig.vc
+++ b/jconfig.vc
@@ -16,11 +16,7 @@
 #undef INCOMPLETE_TYPES_BROKEN
 
 /* Define "boolean" as unsigned char, not int, per Windows custom */
-#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
-typedef unsigned char boolean;
-#endif
-#define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
-
+#define TYPEDEF_UCHAR_BOOLEAN
 
 #ifdef JPEG_INTERNALS
 
@@ -28,6 +24,13 @@
 
 #endif /* JPEG_INTERNALS */
 
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)
+#undef JSIMD_MMX_NOT_SUPPORTED
+#undef JSIMD_3DNOW_NOT_SUPPORTED
+#undef JSIMD_SSE_NOT_SUPPORTED
+#undef JSIMD_SSE2_NOT_SUPPORTED
+#endif
+
 #ifdef JPEG_CJPEG_DJPEG
 
 #define BMP_SUPPORTED		/* BMP image file format */
diff --git a/jcqnt3dn.asm b/jcqnt3dn.asm
new file mode 100644
index 0000000..8197858
--- /dev/null
+++ b/jcqnt3dn.asm
@@ -0,0 +1,240 @@
+;
+; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_flt_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_3dnow)
+
+EXTN(jpeg_convsamp_flt_3dnow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; two sample rows per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad	mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	pi2fd	mm4,mm4
+	pi2fd	mm2,mm2
+	psrad	mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad	mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	pi2fd	mm5,mm5
+	pi2fd	mm0,mm0
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad	mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad	mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	pi2fd	mm6,mm6
+	pi2fd	mm3,mm3
+	psrad	mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad	mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	pi2fd	mm4,mm4
+	pi2fd	mm1,mm1
+
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_flt_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                          FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_3dnow)
+
+EXTN(jpeg_quantize_flt_3dnow):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov       eax, 0x4B400000	; (float)0x00C00000 (rndint_magic)
+	movd      mm7,eax
+	punpckldq mm7,mm7		; mm7={12582912.0F 12582912.0F}
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16	; 16 coefficients per iteration
+	alignx	16,7
+.quantloop:
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm0,mm7			; mm0=(00 ** 01 **)
+	pfadd	mm1,mm7			; mm1=(02 ** 03 **)
+	pfadd	mm2,mm7			; mm2=(04 ** 05 **)
+	pfadd	mm3,mm7			; mm3=(06 ** 07 **)
+
+	movq      mm4,mm0
+	punpcklwd mm0,mm1		; mm0=(00 02 ** **)
+	punpckhwd mm4,mm1		; mm4=(01 03 ** **)
+	movq      mm5,mm2
+	punpcklwd mm2,mm3		; mm2=(04 06 ** **)
+	punpckhwd mm5,mm3		; mm5=(05 07 ** **)
+
+	punpcklwd mm0,mm4		; mm0=(00 01 02 03)
+	punpcklwd mm2,mm5		; mm2=(04 05 06 07)
+
+	movq	mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+	pfmul	mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+	pfmul	mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	pfadd	mm6,mm7			; mm6=(10 ** 11 **)
+	pfadd	mm1,mm7			; mm1=(12 ** 13 **)
+	pfadd	mm3,mm7			; mm3=(14 ** 15 **)
+	pfadd	mm4,mm7			; mm4=(16 ** 17 **)
+
+	movq      mm5,mm6
+	punpcklwd mm6,mm1		; mm6=(10 12 ** **)
+	punpckhwd mm5,mm1		; mm5=(11 13 ** **)
+	movq      mm1,mm3
+	punpcklwd mm3,mm4		; mm3=(14 16 ** **)
+	punpckhwd mm1,mm4		; mm1=(15 17 ** **)
+
+	punpcklwd mm6,mm5		; mm6=(10 11 12 13)
+	punpcklwd mm3,mm1		; mm3=(14 15 16 17)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jcqntflt.asm b/jcqntflt.asm
new file mode 100644
index 0000000..4631a06
--- /dev/null
+++ b/jcqntflt.asm
@@ -0,0 +1,202 @@
+;
+; jcqntflt.asm - sample data conversion and quantization (non-SIMD, FP)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : March 21, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_float)
+
+EXTN(jpeg_convsamp_float):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE			; one sample row per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi]		; (JSAMPLE *)
+	add	ebx, JDIMENSION [start_col]
+
+%assign i 0	; i=0
+%rep 4	; -- repeat 4 times ---
+	xor	eax,eax
+	xor	edx,edx
+	mov	al, JSAMPLE [ebx+(i+0)*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [ebx+(i+1)*SIZEOF_JSAMPLE]
+	add	eax, byte -CENTERJSAMPLE
+	add	edx, byte -CENTERJSAMPLE
+	push	eax		; samples are pushed in column order 0..7,
+	push	edx		; so column 7 ends up at [esp+0]
+%assign i i+2	; i+=2
+%endrep	; -- repeat end ---
+
+	fild	INT32 [esp+0*SIZEOF_INT32]	; column 7 is loaded first ...
+	fild	INT32 [esp+1*SIZEOF_INT32]
+	fild	INT32 [esp+2*SIZEOF_INT32]
+	fild	INT32 [esp+3*SIZEOF_INT32]
+	fild	INT32 [esp+4*SIZEOF_INT32]
+	fild	INT32 [esp+5*SIZEOF_INT32]
+	fild	INT32 [esp+6*SIZEOF_INT32]
+	fild	INT32 [esp+7*SIZEOF_INT32]	; ... so st0=column 0 .. st7=column 7
+
+	add	esp, byte DCTSIZE*SIZEOF_INT32	; drop the pushed temporaries
+
+	fstp	FAST_FLOAT [edi+0*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+1*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+2*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+3*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+4*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+5*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+6*SIZEOF_FAST_FLOAT]
+	fstp	FAST_FLOAT [edi+7*SIZEOF_FAST_FLOAT]
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	edi, byte DCTSIZE*SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                      FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+%define FLT_ROUNDS	1		; from <float.h>
+
+	align	16
+	global	EXTN(jpeg_quantize_float)
+
+EXTN(jpeg_quantize_float):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; unused
+;	push	edx		; unused
+	push	esi
+	push	edi
+
+%if (FLT_ROUNDS != 1)
+	push	eax		; reserve a stack slot for the old control word
+	fnstcw	word [esp]
+	mov	eax, [esp]
+	and	eax, (~0x0C00)		; round to nearest integer
+	push	eax
+	fldcw	word [esp]
+	pop	eax
+%endif
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/8		; 8 coefficients per iteration
+	alignx	16,7
+.quantloop:
+	fld	FAST_FLOAT [esi+0*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+0*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+1*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+1*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+2*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+2*SIZEOF_FAST_FLOAT]
+	fld	FAST_FLOAT [esi+3*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+3*SIZEOF_FAST_FLOAT]
+
+	fld	FAST_FLOAT [esi+4*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+4*SIZEOF_FAST_FLOAT]
+	fxch	st0,st1		; the four exchanges reorder the FPU stack ...
+	fld	FAST_FLOAT [esi+5*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+5*SIZEOF_FAST_FLOAT]
+	fxch	st0,st3
+	fld	FAST_FLOAT [esi+6*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+6*SIZEOF_FAST_FLOAT]
+	fxch	st0,st5
+	fld	FAST_FLOAT [esi+7*SIZEOF_FAST_FLOAT]
+	fmul	FAST_FLOAT [ebx+7*SIZEOF_FAST_FLOAT]
+	fxch	st0,st7		; ... so that st0..st7 = products 0..7
+
+	fistp	JCOEF [edi+0*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+1*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+2*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+3*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+4*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+5*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+6*SIZEOF_JCOEF]
+	fistp	JCOEF [edi+7*SIZEOF_JCOEF]
+
+	add	esi, byte 8*SIZEOF_FAST_FLOAT
+	add	ebx, byte 8*SIZEOF_FAST_FLOAT
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+%if (FLT_ROUNDS != 1)
+	fldcw	word [esp]	; restore the control word saved on entry
+	pop	eax		; pop old control word
+%endif
+	pop	edi
+	pop	esi
+;	pop	edx		; unused
+;	pop	ecx		; unused
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jcqntint.asm b/jcqntint.asm
new file mode 100644
index 0000000..e0de1cb
--- /dev/null
+++ b/jcqntint.asm
@@ -0,0 +1,243 @@
+;
+; jcqntint.asm - sample data conversion and quantization (non-SIMD, integer)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jpeg_convsamp_int (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int)
+
+EXTN(jpeg_convsamp_int):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE			; one sample row per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi]		; (JSAMPLE *)
+	add	ebx, JDIMENSION [start_col]
+
+%assign i 0	; i=0
+%rep 4	; -- repeat 4 times ---
+	xor	eax,eax		; clear so the byte loads are zero-extended
+	xor	edx,edx
+	mov	al, JSAMPLE [ebx+(i+0)*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [ebx+(i+1)*SIZEOF_JSAMPLE]
+	add	eax, byte -CENTERJSAMPLE	; unsigned -> signed sample
+	add	edx, byte -CENTERJSAMPLE
+	mov	DCTELEM [edi+(i+0)*SIZEOF_DCTELEM], ax
+	mov	DCTELEM [edi+(i+1)*SIZEOF_DCTELEM], dx
+%assign i i+2	; i+=2
+%endrep	; -- repeat end ---
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	edi, byte DCTSIZE*SIZEOF_DCTELEM
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int (JCOEFPTR coef_block, DCTELEM * divisors,
+;                    DCTELEM * workspace);
+;
+
+%define RECIPROCAL(i,b)	((b)+((i)+DCTSIZE2*0)*SIZEOF_DCTELEM)
+%define CORRECTION(i,b)	((b)+((i)+DCTSIZE2*1)*SIZEOF_DCTELEM)
+%define SHIFT(i,b)	((b)+((i)+DCTSIZE2*3)*SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+%define UNROLL	2
+
+	align	16
+	global	EXTN(jpeg_quantize_int)
+
+EXTN(jpeg_quantize_int):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ecx, DCTSIZE2/UNROLL
+	alignx	16,7
+.quantloop:
+	push	ecx		; save loop counter; cx/cl are scratch below
+
+%assign i 0	; i=0;
+%rep UNROLL	; ---- repeat (UNROLL) times ----
+	mov	cx, DCTELEM [esi+(i)*SIZEOF_DCTELEM]
+	mov	ax,cx
+	sar	cx,(WORD_BIT-1)	; cx = sign mask used by the xor/sub pairs
+	xor	ax,cx		; if (ax < 0) ax = -ax;
+	sub	ax,cx
+	add	ax, DCTELEM [CORRECTION(i,ebx)]	; correction + roundfactor
+	shl	ax,1
+	mul	DCTELEM [RECIPROCAL(i,ebx)]	; reciprocal
+	mov	ax,cx		; ax = sign mask (low product word is dropped)
+	mov	cx, DCTELEM [SHIFT(i,ebx)]	; shift
+	shr	dx,cl		; descale the high product word
+	xor	dx,ax		; give the result the sign of the input
+	sub	dx,ax
+	mov	JCOEF [edi+(i)*SIZEOF_JCOEF], dx
+%assign i i+1	; i++;
+%endrep		; ---- repeat end ----
+
+	pop	ecx		; restore loop counter
+
+	add	esi, byte UNROLL*SIZEOF_DCTELEM
+	add	ebx, byte UNROLL*SIZEOF_DCTELEM
+	add	edi, byte UNROLL*SIZEOF_JCOEF
+	dec	ecx
+	jnz	.quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%else ; JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jpeg_quantize_idiv (JCOEFPTR coef_block, DCTELEM * divisors,
+;                     DCTELEM * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_idiv)
+
+EXTN(jpeg_quantize_idiv):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	ebx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ecx, DCTSIZE2		; one coefficient per iteration
+	alignx	16,7
+.quantloop:
+	push	ecx		; save loop counter; ecx is scratch below
+
+	movsx	ecx, DCTELEM [esi]	; temp
+	mov	eax,ecx
+	sar	ecx,(DWORD_BIT-1)	; ecx = sign mask of temp
+	xor	edx,edx
+	mov	dx, DCTELEM [ebx]	; qval
+	xor	eax,ecx			; if (eax < 0) eax = -eax;
+	shr	edx,1			; edx = qval/2
+	sub	eax,ecx
+	cmp	eax,edx			; if (temp + qval/2 >= qval)
+	jge	short .quant
+	; ---- if the quantized coefficient is zero
+	xor	eax,eax
+	jmp	short .output
+	alignx	16,7
+.quant:	; ---- do quantization
+	add	eax,edx			; temp += qval/2 (rounding)
+	xor	edx,edx
+	div	DCTELEM [ebx]		; Q:ax,R:dx
+	xor	ax,cx			; give the quotient the sign of temp
+	sub	ax,cx
+	alignx	16,7
+.output:
+	mov	JCOEF [edi], ax
+
+	pop	ecx		; restore loop counter
+
+	add	esi, byte SIZEOF_DCTELEM
+	add	ebx, byte SIZEOF_DCTELEM
+	add	edi, byte SIZEOF_JCOEF
+	dec	ecx
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
diff --git a/jcqntmmx.asm b/jcqntmmx.asm
new file mode 100644
index 0000000..9cdf584
--- /dev/null
+++ b/jcqntmmx.asm
@@ -0,0 +1,254 @@
+;
+; jcqntmmx.asm - sample data conversion and quantization (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; Each sample is zero-extended to a 16-bit word and biased by 0xFF80 (-128).
+; GLOBAL(void)
+; jpeg_convsamp_int_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                        DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int_mmx)
+
+EXTN(jpeg_convsamp_int_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	mm6,mm6			; mm6=(all 0's)
+	pcmpeqw	mm7,mm7
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4		; 4 rows are converted per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm0=(01234567)
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; mm2=(GHIJKLMN)
+	movq	mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]	; mm3=(OPQRSTUV)
+
+	movq      mm4,mm0
+	punpcklbw mm0,mm6		; mm0=(0123)
+	punpckhbw mm4,mm6		; mm4=(4567)
+	movq      mm5,mm1
+	punpcklbw mm1,mm6		; mm1=(89AB)
+	punpckhbw mm5,mm6		; mm5=(CDEF)
+
+	paddw	mm0,mm7			; add 0xFF80 to every word (-128)
+	paddw	mm4,mm7
+	paddw	mm1,mm7
+	paddw	mm5,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+	movq      mm0,mm2
+	punpcklbw mm2,mm6		; mm2=(GHIJ)
+	punpckhbw mm0,mm6		; mm0=(KLMN)
+	movq      mm4,mm3
+	punpcklbw mm3,mm6		; mm3=(OPQR)
+	punpckhbw mm4,mm6		; mm4=(STUV)
+
+	paddw	mm2,mm7			; add 0xFF80 to every word (-128)
+	paddw	mm0,mm7
+	paddw	mm3,mm7
+	paddw	mm4,mm7
+
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+	add	esi, byte 4*SIZEOF_JSAMPROW		; next 4 row pointers
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM	; next 4 workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; Division is replaced by two high-half multiplies (reciprocal, then scale).
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
+;                        DCTELEM * workspace);
+; divisors is read as three tables at row offsets DCTSIZE*0, *1 and *2.
+
+%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_int_mmx)
+
+EXTN(jpeg_quantize_int_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	ah, 2			; outer counter (2 x inner loop)
+	alignx	16,7
+.quantloop1:
+	mov	al, DCTSIZE2/8/2	; inner counter; 8 coefficients per pass
+	alignx	16,7
+.quantloop2:
+	movq	mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+	movq	mm0,mm2
+	movq	mm1,mm3
+	psraw	mm2,(WORD_BIT-1)	; mm2/mm3 = per-word sign masks (0 or -1)
+	psraw	mm3,(WORD_BIT-1)
+	pxor	mm0,mm2
+	pxor	mm1,mm3
+	psubw	mm0,mm2		; if (mm0 < 0) mm0 = -mm0;
+	psubw	mm1,mm3		; if (mm1 < 0) mm1 = -mm1;
+
+	; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+	; {
+	;   enum { SHORT_BIT = 16 };
+	;   signed short sx = (signed short) x;
+	;   signed short sy = (signed short) y;
+	;   signed long sz;
+	; 
+	;   sz = (long) sx * (long) sy;     /* signed multiply */
+	; 
+	;   if (sx < 0) sz += (long) sy << SHORT_BIT;
+	;   if (sy < 0) sz += (long) sx << SHORT_BIT;
+	; 
+	;   return (unsigned long) sz;
+	; }
+
+	paddw	mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
+	paddw	mm1, MMWORD [CORRECTION(0,1,edx)]
+	psllw	mm0,1			; double before the high-half multiply
+	psllw	mm1,1
+	movq	mm4,mm0			; keep x for the signed-multiply fix-up
+	movq	mm5,mm1
+	pmulhw	mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
+	pmulhw	mm1, MMWORD [RECIPROCAL(0,1,edx)]
+	movq	mm6, MMWORD [SCALE(0,0,edx)]	; scale
+	movq	mm7, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4		; reciprocal is always negative (MSB=1)
+	paddw	mm1,mm5
+	psllw	mm0,1			; double before the second multiply
+	psllw	mm1,1
+	movq	mm4,mm0			; keep x for the fix-ups below
+	movq	mm5,mm1
+	pmulhw	mm0,mm6			; signed high half of x * scale
+	pmulhw	mm1,mm7
+	psraw	mm6,(WORD_BIT-1)	; mask: scale < 0
+	psraw	mm7,(WORD_BIT-1)
+	pand	mm6,mm4			; x where scale < 0, else 0
+	pand	mm7,mm5
+	paddw	mm0,mm6			; fix-up: if (sy < 0) add x
+	paddw	mm1,mm7
+	psraw	mm4,(WORD_BIT-1)	; mask: x < 0
+	psraw	mm5,(WORD_BIT-1)
+	pand	mm4, MMWORD [SCALE(0,0,edx)]	; scale
+	pand	mm5, MMWORD [SCALE(0,1,edx)]
+	paddw	mm0,mm4			; fix-up: if (sx < 0) add scale
+	paddw	mm1,mm5
+
+	pxor	mm0,mm2			; re-apply the original signs
+	pxor	mm1,mm3
+	psubw	mm0,mm2
+	psubw	mm1,mm3
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+	add	esi, byte 8*SIZEOF_DCTELEM
+	add	edx, byte 8*SIZEOF_DCTELEM
+	add	edi, byte 8*SIZEOF_JCOEF
+	dec	al
+	jnz	near .quantloop2
+	dec	ah
+	jnz	near .quantloop1	; to avoid branch misprediction
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
+%endif ; JFDCT_INT_MMX_SUPPORTED
diff --git a/jcqnts2f.asm b/jcqnts2f.asm
new file mode 100644
index 0000000..faf663e
--- /dev/null
+++ b/jcqnts2f.asm
@@ -0,0 +1,178 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 18, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; Each sample has 0x80 subtracted as a byte and is converted to a float.
+; GLOBAL(void)
+; jpeg_convsamp_flt_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                         FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_sse2)
+
+EXTN(jpeg_convsamp_flt_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  xmm7,xmm7
+	psllw    xmm7,7
+	packsswb xmm7,xmm7		; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2		; 2 rows are converted per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	xmm0,xmm7			; xmm0=(01234567)
+	psubb	xmm1,xmm7			; xmm1=(89ABCDEF)
+
+	punpcklbw xmm0,xmm0			; xmm0=(*0*1*2*3*4*5*6*7)
+	punpcklbw xmm1,xmm1			; xmm1=(*8*9*A*B*C*D*E*F)
+
+	punpcklwd xmm2,xmm0			; xmm2=(***0***1***2***3)
+	punpckhwd xmm0,xmm0			; xmm0=(***4***5***6***7)
+	punpcklwd xmm3,xmm1			; xmm3=(***8***9***A***B)
+	punpckhwd xmm1,xmm1			; xmm1=(***C***D***E***F)
+
+	psrad     xmm2,(DWORD_BIT-BYTE_BIT)	; xmm2=(0123)
+	psrad     xmm0,(DWORD_BIT-BYTE_BIT)	; xmm0=(4567)
+	cvtdq2ps  xmm2,xmm2			; xmm2=(0123)
+	cvtdq2ps  xmm0,xmm0			; xmm0=(4567)
+	psrad     xmm3,(DWORD_BIT-BYTE_BIT)	; xmm3=(89AB)
+	psrad     xmm1,(DWORD_BIT-BYTE_BIT)	; xmm1=(CDEF)
+	cvtdq2ps  xmm3,xmm3			; xmm3=(89AB)
+	cvtdq2ps  xmm1,xmm1			; xmm1=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+	add	esi, byte 2*SIZEOF_JSAMPROW		; next 2 row pointers
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next 2 workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (workspace[i] * divisors[i], converted to int and saturated to 16 bits)
+; GLOBAL(void)
+; jpeg_quantize_flt_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_sse2)
+
+EXTN(jpeg_quantize_flt_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16	; 16 coefficients per iteration
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	cvtps2dq xmm0,xmm0		; float -> 32-bit integer
+	cvtps2dq xmm1,xmm1
+	cvtps2dq xmm2,xmm2
+	cvtps2dq xmm3,xmm3
+
+	packssdw xmm0,xmm1		; 2 x 4 dwords -> 8 saturated words
+	packssdw xmm2,xmm3
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_SSE2_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jcqnts2i.asm b/jcqnts2i.asm
new file mode 100644
index 0000000..71bae2c
--- /dev/null
+++ b/jcqnts2i.asm
@@ -0,0 +1,216 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 27, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; Each sample is zero-extended to a 16-bit word and biased by 0xFF80 (-128).
+; GLOBAL(void)
+; jpeg_convsamp_int_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                         DCTELEM * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_int_sse2)
+
+EXTN(jpeg_convsamp_int_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pxor	xmm6,xmm6		; xmm6=(all 0's)
+	pcmpeqw	xmm7,xmm7
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4		; 4 rows are converted per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm0=(01234567)
+	movq	xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm1=(89ABCDEF)
+
+	mov	ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	xmm2, _MMWORD [ebx+eax*SIZEOF_JSAMPLE]	; xmm2=(GHIJKLMN)
+	movq	xmm3, _MMWORD [edx+eax*SIZEOF_JSAMPLE]	; xmm3=(OPQRSTUV)
+
+	punpcklbw xmm0,xmm6		; xmm0=(01234567)
+	punpcklbw xmm1,xmm6		; xmm1=(89ABCDEF)
+	paddw     xmm0,xmm7		; add 0xFF80 to every word (-128)
+	paddw     xmm1,xmm7
+	punpcklbw xmm2,xmm6		; xmm2=(GHIJKLMN)
+	punpcklbw xmm3,xmm6		; xmm3=(OPQRSTUV)
+	paddw     xmm2,xmm7		; add 0xFF80 to every word (-128)
+	paddw     xmm3,xmm7
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+	add	esi, byte 4*SIZEOF_JSAMPROW		; next 4 row pointers
+	add	edi, byte 4*DCTSIZE*SIZEOF_DCTELEM	; next 4 workspace rows
+	dec	ecx
+	jnz	short .convloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; Division is replaced by two unsigned high-half multiplies (pmulhuw).
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jpeg_quantize_int_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                         DCTELEM * workspace);
+; divisors is read as three tables at row offsets DCTSIZE*0, *1 and *2.
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; DCTELEM * divisors
+%define workspace	ebp+16		; DCTELEM * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_int_sse2)
+
+EXTN(jpeg_quantize_int_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/32	; 32 coefficients per iteration
+	alignx	16,7
+.quantloop:
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+	movdqa	xmm0,xmm4
+	movdqa	xmm1,xmm5
+	movdqa	xmm2,xmm6
+	movdqa	xmm3,xmm7
+	psraw	xmm4,(WORD_BIT-1)	; xmm4..xmm7 = per-word sign masks
+	psraw	xmm5,(WORD_BIT-1)
+	psraw	xmm6,(WORD_BIT-1)
+	psraw	xmm7,(WORD_BIT-1)
+	pxor	xmm0,xmm4
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4		; if (xmm0 < 0) xmm0 = -xmm0;
+	psubw	xmm1,xmm5		; if (xmm1 < 0) xmm1 = -xmm1;
+	psubw	xmm2,xmm6		; if (xmm2 < 0) xmm2 = -xmm2;
+	psubw	xmm3,xmm7		; if (xmm3 < 0) xmm3 = -xmm3;
+
+	paddw	xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+	paddw	xmm1, XMMWORD [CORRECTION(1,0,edx)]
+	paddw	xmm2, XMMWORD [CORRECTION(2,0,edx)]
+	paddw	xmm3, XMMWORD [CORRECTION(3,0,edx)]
+	psllw	xmm0,1			; double before the high-half multiply
+	psllw	xmm1,1
+	psllw	xmm2,1
+	psllw	xmm3,1
+	pmulhuw	xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+	pmulhuw	xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+	psllw	xmm0,1			; double before the second multiply
+	psllw	xmm1,1
+	psllw	xmm2,1
+	psllw	xmm3,1
+	pmulhuw	xmm0, XMMWORD [SCALE(0,0,edx)]	; scale
+	pmulhuw	xmm1, XMMWORD [SCALE(1,0,edx)]
+	pmulhuw	xmm2, XMMWORD [SCALE(2,0,edx)]
+	pmulhuw	xmm3, XMMWORD [SCALE(3,0,edx)]
+
+	pxor	xmm0,xmm4		; re-apply the original signs
+	pxor	xmm1,xmm5
+	pxor	xmm2,xmm6
+	pxor	xmm3,xmm7
+	psubw	xmm0,xmm4
+	psubw	xmm1,xmm5
+	psubw	xmm2,xmm6
+	psubw	xmm3,xmm7
+	movdqa	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+	movdqa	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_JCOEF)], xmm2
+	movdqa	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_JCOEF)], xmm3
+
+	add	esi, byte 32*SIZEOF_DCTELEM
+	add	edx, byte 32*SIZEOF_DCTELEM
+	add	edi, byte 32*SIZEOF_JCOEF
+	dec	eax
+	jnz	near .quantloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION
+%endif ; JFDCT_INT_SSE2_SUPPORTED
diff --git a/jcqntsse.asm b/jcqntsse.asm
new file mode 100644
index 0000000..fe99a20
--- /dev/null
+++ b/jcqntsse.asm
@@ -0,0 +1,218 @@
+;
+; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 12, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Load data into workspace, applying unsigned->signed conversion
+; Each sample has 0x80 subtracted as a byte and is converted to a float.
+; GLOBAL(void)
+; jpeg_convsamp_flt_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                        FAST_FLOAT * workspace);
+;
+
+%define sample_data	ebp+8		; JSAMPARRAY sample_data
+%define start_col	ebp+12		; JDIMENSION start_col
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_convsamp_flt_sse)
+
+EXTN(jpeg_convsamp_flt_sse):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	pcmpeqw  mm7,mm7
+	psllw    mm7,7
+	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [start_col]
+	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2		; 2 rows are converted per iteration
+	alignx	16,7
+.convloop:
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+
+	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+	psubb	mm0,mm7				; mm0=(01234567)
+	psubb	mm1,mm7				; mm1=(89ABCDEF)
+
+	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+
+	punpcklwd mm4,mm2			; mm4=(***0***1)
+	punpckhwd mm2,mm2			; mm2=(***2***3)
+	punpcklwd mm5,mm0			; mm5=(***4***5)
+	punpckhwd mm0,mm0			; mm0=(***6***7)
+
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+
+	punpcklwd mm6,mm3			; mm6=(***8***9)
+	punpckhwd mm3,mm3			; mm3=(***A***B)
+	punpcklwd mm4,mm1			; mm4=(***C***D)
+	punpckhwd mm1,mm1			; mm1=(***E***F)
+
+	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+
+	movlhps   xmm0,xmm1			; xmm0=(0123)
+	movlhps   xmm2,xmm3			; xmm2=(4567)
+	movlhps   xmm4,xmm5			; xmm4=(89AB)
+	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+	add	esi, byte 2*SIZEOF_JSAMPROW		; next 2 row pointers
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next 2 workspace rows
+	dec	ecx
+	jnz	near .convloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+; (workspace[i] * divisors[i], converted to int and saturated to 16 bits)
+; GLOBAL(void)
+; jpeg_quantize_flt_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                        FAST_FLOAT * workspace);
+;
+
+%define coef_block	ebp+8		; JCOEFPTR coef_block
+%define divisors	ebp+12		; FAST_FLOAT * divisors
+%define workspace	ebp+16		; FAST_FLOAT * workspace
+
+	align	16
+	global	EXTN(jpeg_quantize_flt_sse)
+
+EXTN(jpeg_quantize_flt_sse):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	esi, POINTER [workspace]
+	mov	edx, POINTER [divisors]
+	mov	edi, JCOEFPTR [coef_block]
+	mov	eax, DCTSIZE2/16	; 16 coefficients per iteration
+	alignx	16,7
+.quantloop:
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	movhlps  xmm4,xmm0		; upper two floats -> low half of xmm4
+	movhlps  xmm5,xmm1
+
+	cvtps2pi mm0,xmm0		; two low floats -> two 32-bit integers
+	cvtps2pi mm1,xmm1
+	cvtps2pi mm4,xmm4
+	cvtps2pi mm5,xmm5
+
+	movhlps  xmm6,xmm2		; same split for the second row
+	movhlps  xmm7,xmm3
+
+	cvtps2pi mm2,xmm2
+	cvtps2pi mm3,xmm3
+	cvtps2pi mm6,xmm6
+	cvtps2pi mm7,xmm7
+
+	packssdw mm0,mm4		; 2 x 2 dwords -> 4 saturated words
+	packssdw mm1,mm5
+	packssdw mm2,mm6
+	packssdw mm3,mm7
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+	add	esi, byte 16*SIZEOF_FAST_FLOAT
+	add	edx, byte 16*SIZEOF_FAST_FLOAT
+	add	edi, byte 16*SIZEOF_JCOEF
+	dec	eax
+	jnz	short .quantloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jcsammmx.asm b/jcsammmx.asm
new file mode 100644
index 0000000..95fc825
--- /dev/null
+++ b/jcsammmx.asm
@@ -0,0 +1,328 @@
+;
+; jcsammmx.asm - downsampling (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JCSAMPLE_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+; Each output sample is (left + right + bias) >> 1, bias alternating 0,1.
+; GLOBAL(void)
+; jpeg_h2v1_downsample_mmx (j_compress_ptr cinfo,
+;                           jpeg_component_info * compptr,
+;                           JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v1_downsample_mmx)
+
+EXTN(jpeg_h2v1_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx				; ecx = number of pad bytes per row
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax				; al is needed for the fill value
+	push	ecx				; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx				; edi = row + image_width
+	mov	al, JSAMPLE [edi-1]		; last real sample of the row
+
+	rep stosb				; replicate it ecx times
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return	; near: .return may lie beyond rel8 range
+
+	mov       edx, 0x00010000	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+	movq	mm2,mm0
+	movq	mm3,mm1
+
+	pand	mm0,mm6				; even-indexed samples as words
+	psrlw	mm2,BYTE_BIT			; odd-indexed samples as words
+	pand	mm1,mm6
+	psrlw	mm3,BYTE_BIT
+
+	paddw	mm0,mm2				; sum of each horizontal pair
+	paddw	mm1,mm3
+	paddw	mm0,mm7				; + alternating bias
+	paddw	mm1,mm7
+	psrlw	mm0,1				; / 2
+	psrlw	mm1,1
+
+	packuswb mm0,mm1			; 8 words -> 8 output samples
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	short .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+; Each output sample is (sum of a 2x2 cell + bias) >> 2, bias alternating 1,2.
+; GLOBAL(void)
+; jpeg_h2v2_downsample_mmx (j_compress_ptr cinfo,
+;                           jpeg_component_info * compptr,
+;                           JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v2_downsample_mmx)
+
+EXTN(jpeg_h2v2_downsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]
+
+	; -- expand_right_edge
+
+	push	ecx
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx				; ecx = number of pad bytes per row
+	jle	short .expand_end
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]
+	test	eax,eax
+	jle	short .expand_end
+
+	cld
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax				; al is needed for the fill value
+	push	ecx				; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx				; edi = row + image_width
+	mov	al, JSAMPLE [edi-1]		; last real sample of the row
+
+	rep stosb				; replicate it ecx times
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov       edx, 0x00020001	; bias pattern
+	movd      mm7,edx
+	pcmpeqw   mm6,mm6
+	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
+	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pand	mm0,mm6				; even-indexed samples as words
+	psrlw	mm4,BYTE_BIT			; odd-indexed samples as words
+	pand	mm1,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm0,mm4				; horizontal pair sums, row 0
+	paddw	mm1,mm5				; horizontal pair sums, row 1
+
+	movq	mm4,mm2
+	movq	mm5,mm3
+	pand	mm2,mm6
+	psrlw	mm4,BYTE_BIT
+	pand	mm3,mm6
+	psrlw	mm5,BYTE_BIT
+	paddw	mm2,mm4
+	paddw	mm3,mm5
+
+	paddw	mm0,mm1				; add the two rows: 2x2 cell sums
+	paddw	mm2,mm3
+	paddw	mm0,mm7				; + alternating bias
+	paddw	mm2,mm7
+	psrlw	mm0,2				; / 4
+	psrlw	mm2,2
+
+	packuswb mm0,mm2			; 8 words -> 8 output samples
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr
+	sub	ecx, byte SIZEOF_MMWORD		; outcol
+	jnz	near .columnloop
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JCSAMPLE_MMX_SUPPORTED
diff --git a/jcsample.c b/jcsample.c
index 212ec87..9af7f15 100644
--- a/jcsample.c
+++ b/jcsample.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file contains downsampling routines.
  *
  * Downsampling input data is counted in "row groups".  A row group
@@ -48,6 +55,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */
 
 
 /* Pointer to routine to downsample a single component */
@@ -467,6 +475,7 @@
   int ci;
   jpeg_component_info * compptr;
   boolean smoothok = TRUE;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   downsample = (my_downsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -494,7 +503,17 @@
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor == cinfo->max_v_samp_factor) {
       smoothok = FALSE;
-      downsample->methods[ci] = h2v1_downsample;
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2)
+	downsample->methods[ci] = jpeg_h2v1_downsample_sse2;
+      else
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	downsample->methods[ci] = jpeg_h2v1_downsample_mmx;
+      else
+#endif
+	downsample->methods[ci] = h2v1_downsample;
     } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
 	       compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
@@ -503,6 +522,16 @@
 	downsample->pub.need_context_rows = TRUE;
       } else
 #endif
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2)
+	downsample->methods[ci] = jpeg_h2v2_downsample_sse2;
+      else
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	downsample->methods[ci] = jpeg_h2v2_downsample_mmx;
+      else
+#endif
 	downsample->methods[ci] = h2v2_downsample;
     } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
 	       (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
@@ -517,3 +546,25 @@
     TRACEMS(cinfo, 0, JTRC_SMOOTH_NOTIMPL);
 #endif
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_downsampler (j_compress_ptr cinfo)
+{
+  /* Pick the first of SSE2, MMX that is compiled in and set in the mask. */
+  unsigned int avail = jpeg_simd_support((j_common_ptr) cinfo);
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+  if ((avail & JSIMD_SSE2) != 0)
+    return JSIMD_SSE2;
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+  if ((avail & JSIMD_MMX) != 0)
+    return JSIMD_MMX;
+#endif
+  /* Neither variant is compiled in, or the mask enables neither. */
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jcsamss2.asm b/jcsamss2.asm
new file mode 100644
index 0000000..e187d63
--- /dev/null
+++ b/jcsamss2.asm
@@ -0,0 +1,355 @@
+;
+; jcsamss2.asm - downsampling (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : January 23, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JCSAMPLE_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v1_downsample_sse2 (j_compress_ptr cinfo,
+;                            jpeg_component_info * compptr,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Rows are right-padded first; then out[i] = (in[2*i] + in[2*i+1] + bias) >> 1.
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v1_downsample_sse2)
+
+EXTN(jpeg_h2v1_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return		; output_cols == 0: nothing to do
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]	; edx = image_width
+
+	; -- expand_right_edge
+
+	push	ecx				; save output_cols
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx				; ecx = pad count (output_cols*2 - image_width)
+	jle	short .expand_end		; no padding columns needed
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]	; eax = rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld					; make stosb advance edi upward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax				; al is reused as the fill byte below
+	push	ecx				; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx				; edi = row start + image_width
+	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
+
+	rep stosb				; replicate it over the pad columns
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v1_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00010000		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx				; output_cols, restored per row
+	push	edi				; output_data row-pointer cursor
+	push	esi				; input_data row-pointer cursor
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop		; a full output XMMWORD is available
+	alignx	16,7
+
+.columnloop_r8:					; fewer than SIZEOF_XMMWORD output columns remain
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm1,xmm1			; second input vector taken as zero
+	mov	ecx, SIZEOF_XMMWORD		; so the sub below leaves ecx = 0
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm2,xmm0
+	movdqa	xmm3,xmm1
+
+	pand	xmm0,xmm6			; even-position samples as words
+	psrlw	xmm2,BYTE_BIT			; odd-position samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm3,BYTE_BIT
+
+	paddw	xmm0,xmm2			; sum of each horizontal pair
+	paddw	xmm1,xmm3
+	paddw	xmm0,xmm7			; add the alternating 0/1 bias
+	paddw	xmm1,xmm7
+	psrlw	xmm0,1				; divide by 2
+	psrlw	xmm1,1
+
+	packuswb xmm0,xmm1			; words back to bytes (xmm0 low half, xmm1 high half)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; full XMMWORD store, also on the r8 path
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop
+	test	ecx,ecx
+	jnz	short .columnloop_r8		; partial tail left over
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jpeg_h2v2_downsample_sse2 (j_compress_ptr cinfo,
+;                            jpeg_component_info * compptr,
+;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
+; Rows are right-padded first; then each output is (2x2 input sum + bias) >> 2.
+
+%define cinfo(b)	(b)+8		; j_compress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define input_data(b)	(b)+16		; JSAMPARRAY input_data
+%define output_data(b)	(b)+20		; JSAMPARRAY output_data
+
+	align	16
+	global	EXTN(jpeg_h2v2_downsample_sse2)
+
+EXTN(jpeg_h2v2_downsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	ecx, POINTER [compptr(ebp)]
+	mov	ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)]
+	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
+	jz	near .return		; output_cols == 0: nothing to do
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jcstruct_image_width(edx)]	; edx = image_width
+
+	; -- expand_right_edge
+
+	push	ecx				; save output_cols
+	shl	ecx,1				; output_cols * 2
+	sub	ecx,edx				; ecx = pad count (output_cols*2 - image_width)
+	jle	short .expand_end		; no padding columns needed
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, INT [jcstruct_max_v_samp_factor(eax)]	; eax = rows to pad
+	test	eax,eax
+	jle	short .expand_end
+
+	cld					; make stosb advance edi upward
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	alignx	16,7
+.expandloop:
+	push	eax				; al is reused as the fill byte below
+	push	ecx				; rep stosb consumes ecx
+
+	mov	edi, JSAMPROW [esi]
+	add	edi,edx				; edi = row start + image_width
+	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
+
+	rep stosb				; replicate it over the pad columns
+
+	pop	ecx
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; next row pointer
+	dec	eax
+	jg	short .expandloop
+
+.expand_end:
+	pop	ecx				; output_cols
+
+	; -- h2v2_downsample
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_v_samp_factor(eax)]	; rowctr
+	test	eax,eax
+	jle	near .return
+
+	mov	edx, 0x00020001		; bias pattern
+	movd	xmm7,edx
+	pcmpeqw	xmm6,xmm6
+	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
+	alignx	16,7
+.rowloop:
+	push	ecx				; output_cols, restored per row
+	push	edi				; output_data row-pointer cursor
+	push	esi				; input_data row-pointer cursor
+
+	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
+	mov	edi, JSAMPROW [edi]			; outptr
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	short .columnloop		; a full output XMMWORD is available
+	alignx	16,7
+
+.columnloop_r8:					; fewer than SIZEOF_XMMWORD output columns remain
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	pxor	xmm2,xmm2			; second vector of row 0 taken as zero
+	pxor	xmm3,xmm3			; second vector of row 1 taken as zero
+	mov	ecx, SIZEOF_XMMWORD		; so the sub below leaves ecx = 0
+	jmp	short .downsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	pand	xmm0,xmm6			; even-position samples as words
+	psrlw	xmm4,BYTE_BIT			; odd-position samples as words
+	pand	xmm1,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm0,xmm4			; row 0 horizontal pair sums
+	paddw	xmm1,xmm5			; row 1 horizontal pair sums
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm5,xmm3
+	pand	xmm2,xmm6
+	psrlw	xmm4,BYTE_BIT
+	pand	xmm3,xmm6
+	psrlw	xmm5,BYTE_BIT
+	paddw	xmm2,xmm4
+	paddw	xmm3,xmm5
+
+	paddw	xmm0,xmm1			; add the two rows: 2x2 sums
+	paddw	xmm2,xmm3
+	paddw	xmm0,xmm7			; add the alternating 1/2 bias
+	paddw	xmm2,xmm7
+	psrlw	xmm0,2				; divide by 4
+	psrlw	xmm2,2
+
+	packuswb xmm0,xmm2			; words back to bytes (xmm0 low half, xmm2 high half)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0	; full XMMWORD store, also on the r8 path
+
+	sub	ecx, byte SIZEOF_XMMWORD	; outcol
+	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jae	near .columnloop
+	test	ecx,ecx
+	jnz	near .columnloop_r8		; partial tail left over
+
+	pop	esi
+	pop	edi
+	pop	ecx
+
+	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data (two input rows per output row)
+	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
+	dec	eax				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+%endif ; JCSAMPLE_SSE2_SUPPORTED
diff --git a/jdcoefct.c b/jdcoefct.c
index 4938d20..1a515d3 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : December 18, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains the coefficient buffer controller for decompression.
  * This controller is the top level of the JPEG decompressor proper.
  * The coefficient buffer lies between entropy decoding and inverse-DCT steps.
@@ -133,6 +140,11 @@
 }
 
 
+#ifndef NEED_FAR_POINTERS
+#undef jzero_far
+#define jzero_far(target, bytestozero)  MEMZERO(target, bytestozero)
+#endif
+
 /*
  * Decompress and return some data in the single-pass case.
  * Always attempts to emit one fully interleaved MCU row ("iMCU" row).
@@ -150,15 +162,61 @@
   JDIMENSION MCU_col_num;	/* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
-  int blkn, ci, xindex, yindex, yoffset, useful_width;
+  int blkn, ci, ctr, xindex, yindex, yoffset;
   JSAMPARRAY output_ptr;
-  JDIMENSION start_col, output_col;
+  JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
+  JSAMPARRAY output_ptr_blk[D_MAX_BLOCKS_IN_MCU];
+  JDIMENSION output_col_off[D_MAX_BLOCKS_IN_MCU];
+  jpeg_component_info *compptr_blk[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr inverse_DCT_blk_1[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr inverse_DCT_blk_2[D_MAX_BLOCKS_IN_MCU];
+  inverse_DCT_method_ptr *inverse_DCT_blk;
 
   /* Loop to process as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
+    /* Determine where data should go in output_buf and do the IDCT thing.
+     * We skip dummy blocks at the right and bottom edges (but blkn gets
+     * incremented past them!).  Note the inner loop relies on having
+     * allocated the MCU_buffer[] blocks sequentially.
+     */
+    blkn = 0;			/* index of current DCT block within MCU */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      /* Don't bother to IDCT an uninteresting component. */
+      if (! compptr->component_needed) {
+	for (ctr = compptr->MCU_blocks; ctr > 0; ctr--) {
+	  inverse_DCT_blk_1[blkn] = inverse_DCT_blk_2[blkn] = NULL;
+	  blkn++;
+	}
+	continue;
+      }
+      inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+      output_ptr = output_buf[compptr->component_index] +
+	yoffset * compptr->DCT_scaled_size;
+      for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+	if (cinfo->input_iMCU_row < last_iMCU_row ||
+	    yoffset+yindex < compptr->last_row_height) {
+	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+	    compptr_blk[blkn] = compptr;
+	    output_ptr_blk[blkn] = output_ptr;
+	    output_col_off[blkn] = xindex * compptr->DCT_scaled_size;
+	    inverse_DCT_blk_1[blkn] = inverse_DCT;
+	    inverse_DCT_blk_2[blkn] = (xindex < compptr->last_col_width) ?
+				      inverse_DCT : NULL;
+	    blkn++;
+	  }
+	} else {
+	  for (ctr = compptr->MCU_width; ctr > 0; ctr--) {
+	    inverse_DCT_blk_1[blkn] = inverse_DCT_blk_2[blkn] = NULL;
+	    blkn++;
+	  }
+	}
+	output_ptr += compptr->DCT_scaled_size;
+      }
+    }
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
@@ -170,39 +228,17 @@
 	coef->MCU_ctr = MCU_col_num;
 	return JPEG_SUSPENDED;
       }
-      /* Determine where data should go in output_buf and do the IDCT thing.
-       * We skip dummy blocks at the right and bottom edges (but blkn gets
-       * incremented past them!).  Note the inner loop relies on having
-       * allocated the MCU_buffer[] blocks sequentially.
-       */
-      blkn = 0;			/* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	/* Don't bother to IDCT an uninteresting component. */
-	if (! compptr->component_needed) {
-	  blkn += compptr->MCU_blocks;
+      inverse_DCT_blk = (MCU_col_num < last_MCU_col) ? inverse_DCT_blk_1
+						     : inverse_DCT_blk_2;
+      for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+	inverse_DCT = inverse_DCT_blk[blkn];
+	if (inverse_DCT == NULL)
 	  continue;
-	}
-	inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-	useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-						    : compptr->last_col_width;
-	output_ptr = output_buf[compptr->component_index] +
-	  yoffset * compptr->DCT_scaled_size;
-	start_col = MCU_col_num * compptr->MCU_sample_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  if (cinfo->input_iMCU_row < last_iMCU_row ||
-	      yoffset+yindex < compptr->last_row_height) {
-	    output_col = start_col;
-	    for (xindex = 0; xindex < useful_width; xindex++) {
-	      (*inverse_DCT) (cinfo, compptr,
-			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-			      output_ptr, output_col);
-	      output_col += compptr->DCT_scaled_size;
-	    }
-	  }
-	  blkn += compptr->MCU_width;
-	  output_ptr += compptr->DCT_scaled_size;
-	}
+	compptr = compptr_blk[blkn];
+	output_col = MCU_col_num * compptr->MCU_sample_width +
+		     output_col_off[blkn];
+	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) coef->MCU_buffer[blkn],
+			output_ptr_blk[blkn], output_col);
       }
     }
     /* Completed an MCU row, but perhaps not an iMCU row */
@@ -250,6 +286,8 @@
   JBLOCKARRAY buffer[MAX_COMPS_IN_SCAN];
   JBLOCKROW buffer_ptr;
   jpeg_component_info *compptr;
+  int MCU_width[D_MAX_BLOCKS_IN_MCU];
+  JBLOCKROW MCU_buffer_base[D_MAX_BLOCKS_IN_MCU];
 
   /* Align the virtual buffers for the components used in this scan. */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
@@ -267,19 +305,24 @@
   /* Loop to process one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
+    /* Construct list of pointers to DCT blocks belonging to this MCU */
+    blkn = 0;			/* index of current DCT block within MCU */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+	buffer_ptr = buffer[ci][yindex+yoffset];
+	for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
+	  MCU_width[blkn] = compptr->MCU_width;
+	  MCU_buffer_base[blkn] = buffer_ptr++;
+	  blkn++;
+	}
+      }
+    }
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num < cinfo->MCUs_per_row;
 	 MCU_col_num++) {
-      /* Construct list of pointers to DCT blocks belonging to this MCU */
-      blkn = 0;			/* index of current DCT block within MCU */
-      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-	compptr = cinfo->cur_comp_info[ci];
-	start_col = MCU_col_num * compptr->MCU_width;
-	for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-	  buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
-	  for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
-	    coef->MCU_buffer[blkn++] = buffer_ptr++;
-	  }
-	}
+      for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+	start_col = MCU_col_num * MCU_width[blkn];
+	coef->MCU_buffer[blkn] = MCU_buffer_base[blkn] + start_col;
       }
       /* Try to fetch the MCU. */
       if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
@@ -454,6 +497,15 @@
 
 
 /*
+ * SIMD Ext: Most SSE/SSE2 instructions require their memory operands to
+ * be aligned to a 16-byte boundary; otherwise a general-protection
+ * exception (#GP) is generated.
+ */
+
+#define ALIGN_SIZE	16		/* sizeof SSE/SSE2 register */
+#define ALIGN_MEM(p,a)	((void *) (((size_t) (p) + (a) - 1) & -(a)))
+
+/*
  * Variant of decompress_data for use when doing block smoothing.
  */
 
@@ -471,7 +523,8 @@
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
   boolean first_row, last_row;
-  JBLOCK workspace;
+  JCOEF workspace[DCTSIZE2 + ALIGN_SIZE/sizeof(JCOEF)];
+  JCOEF * workptr = (JCOEF *) ALIGN_MEM(workspace, ALIGN_SIZE);
   int *coef_bits;
   JQUANT_TBL *quanttbl;
   INT32 Q00,Q01,Q02,Q10,Q11,Q20, num;
@@ -560,7 +613,7 @@
       last_block_column = compptr->width_in_blocks - 1;
       for (block_num = 0; block_num <= last_block_column; block_num++) {
 	/* Fetch current DCT block into workspace so we can modify it. */
-	jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+	jcopy_block_row(buffer_ptr, (JBLOCKROW) workptr, (JDIMENSION) 1);
 	/* Update DC values */
 	if (block_num < last_block_column) {
 	  DC3 = (int) prev_block_row[1][0];
@@ -572,7 +625,7 @@
 	 * and is not known to be fully accurate.
 	 */
 	/* AC01 */
-	if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
+	if ((Al=coef_bits[1]) != 0 && workptr[1] == 0) {
 	  num = 36 * Q00 * (DC4 - DC6);
 	  if (num >= 0) {
 	    pred = (int) (((Q01<<7) + num) / (Q01<<8));
@@ -584,10 +637,10 @@
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[1] = (JCOEF) pred;
+	  workptr[1] = (JCOEF) pred;
 	}
 	/* AC10 */
-	if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
+	if ((Al=coef_bits[2]) != 0 && workptr[8] == 0) {
 	  num = 36 * Q00 * (DC2 - DC8);
 	  if (num >= 0) {
 	    pred = (int) (((Q10<<7) + num) / (Q10<<8));
@@ -599,10 +652,10 @@
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[8] = (JCOEF) pred;
+	  workptr[8] = (JCOEF) pred;
 	}
 	/* AC20 */
-	if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
+	if ((Al=coef_bits[3]) != 0 && workptr[16] == 0) {
 	  num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
 	  if (num >= 0) {
 	    pred = (int) (((Q20<<7) + num) / (Q20<<8));
@@ -614,10 +667,10 @@
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[16] = (JCOEF) pred;
+	  workptr[16] = (JCOEF) pred;
 	}
 	/* AC11 */
-	if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
+	if ((Al=coef_bits[4]) != 0 && workptr[9] == 0) {
 	  num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
 	  if (num >= 0) {
 	    pred = (int) (((Q11<<7) + num) / (Q11<<8));
@@ -629,10 +682,10 @@
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[9] = (JCOEF) pred;
+	  workptr[9] = (JCOEF) pred;
 	}
 	/* AC02 */
-	if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
+	if ((Al=coef_bits[5]) != 0 && workptr[2] == 0) {
 	  num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
 	  if (num >= 0) {
 	    pred = (int) (((Q02<<7) + num) / (Q02<<8));
@@ -644,10 +697,10 @@
 	      pred = (1<<Al)-1;
 	    pred = -pred;
 	  }
-	  workspace[2] = (JCOEF) pred;
+	  workptr[2] = (JCOEF) pred;
 	}
 	/* OK, do the IDCT */
-	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
+	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workptr,
 			output_ptr, output_col);
 	/* Advance for next column */
 	DC1 = DC2; DC2 = DC3;
diff --git a/jdcolmmx.asm b/jdcolmmx.asm
new file mode 100644
index 0000000..e46622c
--- /dev/null
+++ b/jdcolmmx.asm
@@ -0,0 +1,438 @@
+;
+; jdcolmmx.asm - colorspace conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402	times 4 dw  F_0_402
+PW_MF0228	times 4 dw -F_0_228
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285
+PW_ONE		times 4 dw  1
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_ycc_rgb_convert_mmx (j_decompress_ptr cinfo,
+;                           JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                           JSAMPARRAY output_buf, int num_rows)
+; Each column-loop pass converts 8 Y/Cb/Cr samples using the formulas below.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_ycc_rgb_convert_mmx)
+
+EXTN(jpeg_ycc_rgb_convert_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved frame pointer
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame pointer
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
+	pushpic	eax		; make room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx				; keep num_cols while ecx holds input_row
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
+
+	pop	ecx				; ecx = num_cols again
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return
+	alignx	16,7
+.rowloop:
+	push	eax
+	push	edi
+	push	edx
+	push	ebx
+	push	esi
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:
+
+	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
+	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
+
+	pcmpeqw	mm4,mm4
+	pcmpeqw	mm7,mm7
+	psrlw	mm4,BYTE_BIT
+	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	mm4,mm5			; mm4=Cb(0246)=CbE
+	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
+	pand	mm0,mm1			; mm0=Cr(0246)=CrE
+	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
+
+	paddw	mm4,mm7			; adding 0xFF80 subtracts 128 from each word
+	paddw	mm5,mm7
+	paddw	mm0,mm7
+	paddw	mm1,mm7
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm2,mm4			; mm2=CbE
+	movq	mm3,mm5			; mm3=CbO
+	paddw	mm4,mm4			; mm4=2*CbE
+	paddw	mm5,mm5			; mm5=2*CbO
+	movq	mm6,mm0			; mm6=CrE
+	movq	mm7,mm1			; mm7=CrO
+	paddw	mm0,mm0			; mm0=2*CrE
+	paddw	mm1,mm1			; mm1=2*CrO
+
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
+	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
+	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
+
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]	; +1 before the >>1 below (rounding)
+	paddw	mm5,[GOTOFF(eax,PW_ONE)]
+	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
+	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	paddw	mm1,[GOTOFF(eax,PW_ONE)]
+	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
+	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
+
+	paddw	mm4,mm2
+	paddw	mm5,mm3
+	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
+	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
+
+	movq      mm4,mm2
+	movq      mm5,mm3
+	punpcklwd mm2,mm6		; interleave Cb,Cr word pairs for pmaddwd
+	punpckhwd mm4,mm6
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm3,mm7
+	punpckhwd mm5,mm7
+	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]	; round before the SCALEBITS shift
+	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm4,SCALEBITS
+	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm3,SCALEBITS
+	psrad     mm5,SCALEBITS
+
+	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
+
+	pcmpeqw   mm4,mm4
+	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      mm4,mm5		; mm4=Y(0246)=YE
+	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
+
+	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16		; fewer than SIZEOF_MMWORD columns left
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_MMWORD	; ecx now counts output bytes left
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC				; remaining bytes now start in mmA
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE				; remaining bytes now start in mmA
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA				; eax becomes scratch; reloaded via movpic at .rowloop
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .nextrow
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st16		; fewer than SIZEOF_MMWORD columns left
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .nextrow
+
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2	; ecx still counts pixels here
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC				; shift the unsent pixels down
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .nextrow
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA	; last single pixel
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW
+	add	ebx, byte SIZEOF_JSAMPROW
+	add	edx, byte SIZEOF_JSAMPROW
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JDCOLOR_YCCRGB_MMX_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jdcolor.c b/jdcolor.c
index 6c04dfe..9a8c7ea 100644
--- a/jdcolor.c
+++ b/jdcolor.c
@@ -5,12 +5,20 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file contains output colorspace conversion routines.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */
 
 
 /* Private subobject */
@@ -105,6 +113,17 @@
 }
 
 
+#if RGB_PIXELSIZE == 4
+/* filler offset = 6 - R - G - B: the slot in 0..3 left free (if distinct) */
+#define RGB_FILLER  (6 - (RGB_RED) - (RGB_GREEN) - (RGB_BLUE))
+/* byte pattern to fill with: 0xFF if RGBX_FILLER_0XFF is defined, else 0 */
+#ifdef RGBX_FILLER_0XFF
+#define RGB_FILLER_BYTE 0xFF
+#else
+#define RGB_FILLER_BYTE 0x00
+#endif
+#endif /* RGB_PIXELSIZE == 4 */
+
 /*
  * Convert some rows of samples to the output colorspace.
  *
@@ -151,6 +170,9 @@
 			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
 						 SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
+#if RGB_PIXELSIZE == 4
+      outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
       outptr += RGB_PIXELSIZE;
     }
   }
@@ -228,6 +250,9 @@
     for (col = 0; col < num_cols; col++) {
       /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
+#if RGB_PIXELSIZE == 4
+      outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
       outptr += RGB_PIXELSIZE;
     }
   }
@@ -305,6 +330,7 @@
 {
   my_cconvert_ptr cconvert;
   int ci;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   cconvert = (my_cconvert_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -358,8 +384,23 @@
   case JCS_RGB:
     cinfo->out_color_components = RGB_PIXELSIZE;
     if (cinfo->jpeg_color_space == JCS_YCbCr) {
-      cconvert->pub.color_convert = ycc_rgb_convert;
-      build_ycc_rgb_table(cinfo);
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_ycc_rgb_convert_sse2)) {
+        cconvert->pub.color_convert = jpeg_ycc_rgb_convert_sse2;
+      } else
+#endif
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+      if (simd & JSIMD_MMX) {
+        cconvert->pub.color_convert = jpeg_ycc_rgb_convert_mmx;
+      } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+      {
+        cconvert->pub.color_convert = ycc_rgb_convert;
+        build_ycc_rgb_table(cinfo);
+      }
     } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
       cconvert->pub.color_convert = gray_rgb_convert;
     } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
@@ -394,3 +435,28 @@
   else
     cinfo->output_components = cinfo->out_color_components;
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_color_deconverter (j_decompress_ptr cinfo)
+{
+  /* Flag word from jpeg_simd_support(); tested against JSIMD_* bits below. */
+  unsigned int caps = jpeg_simd_support((j_common_ptr) cinfo);
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+  /* SSE2 is tried first; its constant table must pass the alignment test. */
+  if ((caps & JSIMD_SSE2) != 0 &&
+      IS_CONST_ALIGNED_16(jconst_ycc_rgb_convert_sse2))
+    return JSIMD_SSE2;
+#endif
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+  if ((caps & JSIMD_MMX) != 0)
+    return JSIMD_MMX;
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+  return JSIMD_NONE;		/* no SIMD YCC->RGB converter applies */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jdcolss2.asm b/jdcolss2.asm
new file mode 100644
index 0000000..fd6f04d
--- /dev/null
+++ b/jdcolss2.asm
@@ -0,0 +1,536 @@
+;
+; jdcolss2.asm - colorspace conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits: FIX(x) = x * 2^16, rounded
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)       (= 26345, fits a signed word)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)       (= 18734, fits a signed word)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)       (= 14942, fits a signed word)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402	times 8 dw  F_0_402		; pmulhw multiplier applied to 2*Cr for (R-Y)
+PW_MF0228	times 8 dw -F_0_228		; pmulhw multiplier applied to 2*Cb for (B-Y)
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; pmaddwd pairs for interleaved (Cb,Cr) words
+PW_ONE		times 8 dw  1			; rounding term added before "psraw 1"
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; rounding term added before "psrad SCALEBITS"
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jpeg_ycc_rgb_convert_sse2 (j_decompress_ptr cinfo,
+;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                            JSAMPARRAY output_buf, int num_rows)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
+%define input_row(b)	(b)+16		; JDIMENSION input_row
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define num_rows(b)	(b)+24		; int num_rows
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_ycc_rgb_convert_sse2)
+
+EXTN(jpeg_ycc_rgb_convert_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (unaligned frame base; args at eax+8..)
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; stored at [original_ebp]; reloaded by "pop esp" on exit
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve the wk[WK_NUM] scratch area below ebp
+	pushpic	eax		; make room for the GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; num_cols
+	test	ecx,ecx
+	jz	near .return			; zero output width: nothing to convert
+
+	push	ecx				; park num_cols; ecx is reused for input_row
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [input_row(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; input_buf[0] (read as Y below)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; input_buf[1] (read as Cb below)
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; input_buf[2] (read as Cr below)
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; esi = &input_buf[0][input_row]
+	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]	; ebx = &input_buf[1][input_row]
+	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]	; edx = &input_buf[2][input_row]
+
+	pop	ecx				; ecx = num_cols again
+
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, INT [num_rows(eax)]
+	test	eax,eax
+	jle	near .return			; num_rows <= 0: nothing to convert
+	alignx	16,7
+.rowloop:
+	push	eax			; num_rows counter
+	push	edi			; output_buf row-pointer cursor
+	push	edx			; input_buf[2] row-pointer cursor
+	push	ebx			; input_buf[1] row-pointer cursor
+	push	esi			; input_buf[0] row-pointer cursor
+	push	ecx			; col
+
+	mov	esi, JSAMPROW [esi]	; inptr0
+	mov	ebx, JSAMPROW [ebx]	; inptr1
+	mov	edx, JSAMPROW [edx]	; inptr2
+	mov	edi, JSAMPROW [edi]	; outptr
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+	alignx	16,7
+.columnloop:				; each pass consumes one XMMWORD of Y, Cb and Cr samples
+
+	movdqa	xmm5, XMMWORD [ebx]	; xmm5=Cb(0123456789ABCDEF)
+	movdqa	xmm1, XMMWORD [edx]	; xmm1=Cr(0123456789ABCDEF)
+
+	pcmpeqw	xmm4,xmm4
+	pcmpeqw	xmm7,xmm7
+	psrlw	xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..} (low-byte mask)
+	psllw	xmm7,7			; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+	movdqa	xmm0,xmm4		; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+	pand	xmm4,xmm5		; xmm4=Cb(02468ACE)=CbE
+	psrlw	xmm5,BYTE_BIT		; xmm5=Cb(13579BDF)=CbO
+	pand	xmm0,xmm1		; xmm0=Cr(02468ACE)=CrE
+	psrlw	xmm1,BYTE_BIT		; xmm1=Cr(13579BDF)=CrO
+
+	paddw	xmm4,xmm7		; CbE -= 128 (0xFF80 is -128 as a signed word)
+	paddw	xmm5,xmm7		; CbO -= 128
+	paddw	xmm0,xmm7		; CrE -= 128
+	paddw	xmm1,xmm7		; CrO -= 128
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm2,xmm4		; xmm2=CbE
+	movdqa	xmm3,xmm5		; xmm3=CbO
+	paddw	xmm4,xmm4		; xmm4=2*CbE
+	paddw	xmm5,xmm5		; xmm5=2*CbO
+	movdqa	xmm6,xmm0		; xmm6=CrE
+	movdqa	xmm7,xmm1		; xmm7=CrO
+	paddw	xmm0,xmm0		; xmm0=2*CrE
+	paddw	xmm1,xmm1		; xmm1=2*CrO
+
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbE * -FIX(0.22800))
+	pmulhw	xmm5,[GOTOFF(eax,PW_MF0228)]	; xmm5=(2*CbO * -FIX(0.22800))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrE * FIX(0.40200))
+	pmulhw	xmm1,[GOTOFF(eax,PW_F0402)]	; xmm1=(2*CrO * FIX(0.40200))
+
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]	; +1, then >>1 below: round the doubled product
+	paddw	xmm5,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm4,1			; xmm4=(CbE * -FIX(0.22800))
+	psraw	xmm5,1			; xmm5=(CbO * -FIX(0.22800))
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm1,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm0,1			; xmm0=(CrE * FIX(0.40200))
+	psraw	xmm1,1			; xmm1=(CrO * FIX(0.40200))
+
+	paddw	xmm4,xmm2
+	paddw	xmm5,xmm3
+	paddw	xmm4,xmm2		; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+	paddw	xmm5,xmm3		; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+	paddw	xmm0,xmm6		; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+	paddw	xmm1,xmm7		; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=(B-Y)E
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(B-Y)O
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm5,xmm3
+	punpcklwd xmm2,xmm6		; interleave (CbE,CrE) words, low half
+	punpckhwd xmm4,xmm6		; interleave (CbE,CrE) words, high half
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm3,xmm7		; interleave (CbO,CrO) words, low half
+	punpckhwd xmm5,xmm7		; interleave (CbO,CrO) words, high half
+	pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]	; add 0.5 (in SCALEBITS fixed point) before the shift
+	paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm4,SCALEBITS
+	paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm3,SCALEBITS
+	psrad     xmm5,SCALEBITS
+
+	packssdw  xmm2,xmm4	; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+	packssdw  xmm3,xmm5	; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+	psubw     xmm2,xmm6	; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+	psubw     xmm3,xmm7	; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+	movdqa    xmm5, XMMWORD [esi]	; xmm5=Y(0123456789ABCDEF)
+
+	pcmpeqw   xmm4,xmm4
+	psrlw     xmm4,BYTE_BIT		; xmm4={0xFF 0x00 0xFF 0x00 ..}
+	pand      xmm4,xmm5		; xmm4=Y(02468ACE)=YE
+	psrlw     xmm5,BYTE_BIT		; xmm5=Y(13579BDF)=YO
+
+	paddw     xmm0,xmm4		; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm5		; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm4		; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm5		; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4, XMMWORD [wk(0)]	; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+	paddw     xmm5, XMMWORD [wk(1)]	; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32		; fewer columns left than one full chunk
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1			; outptr is not XMMWORD-aligned
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow			; row finished exactly on a chunk boundary
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:					; partial final chunk: ecx = columns left
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF			; remaining bytes now live in xmmA
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; remaining bytes now live in xmmA
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:					; build a byte mask for the last ecx output bytes
+	mov	eax,ecx				; eax = bytes left (GOT pointer in eax is reloaded per row)
+	xor	ecx, byte 0x0F
+	shl	ecx, 2				; ecx = (15 - bytes left) * 4 bits
+	movd	xmmB,ecx
+	psrlq	xmmH,4				; each qword of xmmH = 15 nibbles of 1's
+	pcmpeqb	xmmE,xmmE			; each qword of xmmE = 16 nibbles of 1's
+	psrlq	xmmH,xmmB			; xmmH keeps (bytes left) nibbles
+	psrlq	xmmE,xmmB			; xmmE keeps (bytes left + 1) nibbles
+	punpcklbw xmmE,xmmH			; xmmE = mask: MSB set in the first (bytes left) bytes
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr
+	jz	short .adj0
+	add	eax,ecx				; eax = misalignment + bytes left
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail spills past the aligned XMMWORD: store as is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx			; xmmD = shift count in bits
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than half an XMMWORD
+	movd	xmmF,ecx
+	psllq	xmmA,xmmF			; finish the shift inside the high qword
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD			; per-qword left shift of the original value
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG			; combine: data shifted left across both qwords
+	por	xmmE,xmmC			; combine: mask shifted left across both qwords
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st32		; fewer columns left than one full chunk
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out1			; outptr is not XMMWORD-aligned
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .nextrow			; row finished exactly on a chunk boundary
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:					; partial final chunk: ecx = columns left
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC			; shift the unsent pixels down into xmmA/xmmD
+	movdqa	xmmD,xmmH
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; remaining pixels now live in xmmA
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:					; build a byte mask for the last ecx pixels
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	short .nextrow			; no pixels left
+	mov	eax,ecx				; eax = pixels left (GOT pointer in eax is reloaded per row)
+	xor	ecx, byte 0x03
+	inc	ecx				; ecx = 4 - pixels left
+	shl	ecx, 4				; ... times 16 bits: one mask word per pixel
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF			; low qword keeps (pixels left) words of 1's
+	punpcklbw xmmE,xmmE			; double every byte: 4 mask bytes per pixel
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx = misalignment of outptr
+	jz	short .adj0
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail spills past the aligned XMMWORD: store as is
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx			; xmmC = shift count in bits
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1			; shift is less than half an XMMWORD
+	movd	xmmH,ecx
+	psllq	xmmA,xmmH			; finish the shift inside the high qword
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH			; bits carried from the low into the high qword
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC			; per-qword left shift of the original value
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB			; combine: data shifted left across both qwords
+	por	xmmE,xmmG			; combine: mask shifted left across both qwords
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+	alignx	16,7
+
+.nextrow:				; restore what .rowloop pushed, in reverse order
+	pop	ecx
+	pop	esi
+	pop	ebx
+	pop	edx
+	pop	edi
+	pop	eax
+
+	add	esi, byte SIZEOF_JSAMPROW	; next input_buf[0] row pointer
+	add	ebx, byte SIZEOF_JSAMPROW	; next input_buf[1] row pointer
+	add	edx, byte SIZEOF_JSAMPROW	; next input_buf[2] row pointer
+	add	edi, byte SIZEOF_JSAMPROW	; output_buf
+	dec	eax				; num_rows
+	jg	near .rowloop
+
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JDCOLOR_YCCRGB_SSE2_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jdct.h b/jdct.h
index 04192a2..678a3d1 100644
--- a/jdct.h
+++ b/jdct.h
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This include file contains common declarations for the forward and
  * inverse DCT modules.  These declarations are private to the DCT managers
  * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms.
@@ -13,6 +20,13 @@
  */
 
 
+/* SIMD Ext: configuration check */
+
+#if BITS_IN_JSAMPLE != 8
+#error "Sorry, this SIMD code only copes with 8-bit sample values."
+#endif
+
+
 /*
  * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
  * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
@@ -26,14 +40,25 @@
  * Quantization of the output coefficients is done by jcdctmgr.c.
  */
 
-#if BITS_IN_JSAMPLE == 8
-typedef int DCTELEM;		/* 16 or 32 bits is fine */
-#else
-typedef INT32 DCTELEM;		/* must have 32 bits */
-#endif
+/* SIMD Ext: To maximize parallelism, Type DCTELEM is changed to short
+ * (originally, int).
+ */
+typedef short DCTELEM;		/* SIMD Ext: must be short */
 
 typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
 typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, convsamp_int_method_ptr,
+		(JSAMPARRAY sample_data, JDIMENSION start_col,
+		 DCTELEM * workspace));	/* e.g. jpeg_convsamp_int() */
+typedef JMETHOD(void, convsamp_float_method_ptr,
+		(JSAMPARRAY sample_data, JDIMENSION start_col,
+		 FAST_FLOAT *workspace));	/* e.g. jpeg_convsamp_float() */
+typedef JMETHOD(void, quantize_int_method_ptr,
+		(JCOEFPTR coef_block, DCTELEM * divisors,
+		 DCTELEM * workspace));	/* e.g. jpeg_quantize_int() */
+typedef JMETHOD(void, quantize_float_method_ptr,
+		(JCOEFPTR coef_block, FAST_FLOAT * divisors,
+		 FAST_FLOAT * workspace));	/* e.g. jpeg_quantize_float() */
 
 
 /*
@@ -49,19 +74,22 @@
 
 /* typedef inverse_DCT_method_ptr is declared in jpegint.h */
 
+/* SIMD Ext: To maximize parallelism, Type MULTIPLIER is changed to short.
+ * Macro definitions of MULTIPLIER and FAST_FLOAT in jmorecfg.h are ignored.
+ */
+#undef MULTIPLIER
+#define MULTIPLIER  short	/* SIMD Ext: must be short */
+#undef FAST_FLOAT
+#define FAST_FLOAT  float	/* SIMD Ext: must be float */
+
 /*
  * Each IDCT routine has its own ideas about the best dct_table element type.
  */
 
-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
-#if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE;	/* SIMD Ext: must be short */
+typedef MULTIPLIER IFAST_MULT_TYPE;	/* SIMD Ext: must be short */
 #define IFAST_SCALE_BITS  2	/* fractional bits in scale factors */
-#else
-typedef INT32 IFAST_MULT_TYPE;	/* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS  13	/* fractional bits in scale factors */
-#endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE;	/* SIMD Ext: must be float */
 
 
 /*
@@ -81,15 +109,64 @@
 /* Short forms of external names for systems with brain-damaged linkers. */
 
 #ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_fdct_islow		jFDislow
-#define jpeg_fdct_ifast		jFDifast
-#define jpeg_fdct_float		jFDfloat
-#define jpeg_idct_islow		jRDislow
-#define jpeg_idct_ifast		jRDifast
-#define jpeg_idct_float		jRDfloat
-#define jpeg_idct_4x4		jRD4x4
-#define jpeg_idct_2x2		jRD2x2
-#define jpeg_idct_1x1		jRD1x1
+#define jpeg_fdct_islow		jFDislow		/* jfdctint.asm */
+#define jpeg_fdct_ifast		jFDifast		/* jfdctfst.asm */
+#define jpeg_fdct_float		jFDfloat		/* jfdctflt.asm */
+#define jpeg_fdct_islow_mmx	jFDMislow		/* jfmmxint.asm */
+#define jpeg_fdct_ifast_mmx	jFDMifast		/* jfmmxfst.asm */
+#define jpeg_fdct_float_3dnow	jFD3float		/* jf3dnflt.asm */
+#define jpeg_fdct_islow_sse2	jFDSislow		/* jfss2int.asm */
+#define jpeg_fdct_ifast_sse2	jFDSifast		/* jfss2fst.asm */
+#define jpeg_fdct_float_sse	jFDSfloat		/* jfsseflt.asm */
+#define jpeg_convsamp_int	jCnvInt			/* jcqntint.asm */
+#define jpeg_quantize_int	jQntInt			/* jcqntint.asm */
+#define jpeg_quantize_idiv	jQntIDiv		/* jcqntint.asm */
+#define jpeg_convsamp_float	jCnvFloat		/* jcqntflt.asm */
+#define jpeg_quantize_float	jQntFloat		/* jcqntflt.asm */
+#define jpeg_convsamp_int_mmx	jCnvMmx			/* jcqntmmx.asm */
+#define jpeg_quantize_int_mmx	jQntMmx			/* jcqntmmx.asm */
+#define jpeg_convsamp_flt_3dnow	jCnv3dnow		/* jcqnt3dn.asm */
+#define jpeg_quantize_flt_3dnow	jQnt3dnow		/* jcqnt3dn.asm */
+#define jpeg_convsamp_int_sse2	jCnvISse2		/* jcqnts2i.asm */
+#define jpeg_quantize_int_sse2	jQntISse2		/* jcqnts2i.asm */
+#define jpeg_convsamp_flt_sse	jCnvSse			/* jcqntsse.asm */
+#define jpeg_quantize_flt_sse	jQntSse			/* jcqntsse.asm */
+#define jpeg_convsamp_flt_sse2	jCnvFSse2		/* jcqnts2f.asm */
+#define jpeg_quantize_flt_sse2	jQntFSse2		/* jcqnts2f.asm */
+#define jpeg_idct_islow		jRDislow		/* jidctint.asm */
+#define jpeg_idct_ifast		jRDifast		/* jidctfst.asm */
+#define jpeg_idct_float		jRDfloat		/* jidctflt.asm */
+#define jpeg_idct_4x4		jRD4x4			/* jidctred.asm */
+#define jpeg_idct_2x2		jRD2x2			/* jidctred.asm */
+#define jpeg_idct_1x1		jRD1x1			/* jidctred.asm */
+#define jpeg_idct_islow_mmx	jRDMislow		/* jimmxint.asm */
+#define jpeg_idct_ifast_mmx	jRDMifast		/* jimmxfst.asm */
+#define jpeg_idct_float_3dnow	jRD3float		/* ji3dnflt.asm */
+#define jpeg_idct_4x4_mmx	jRDM4x4			/* jimmxred.asm */
+#define jpeg_idct_2x2_mmx	jRDM2x2			/* jimmxred.asm */
+#define jpeg_idct_islow_sse2	jRDSislow		/* jiss2int.asm */
+#define jpeg_idct_ifast_sse2	jRDSifast		/* jiss2fst.asm */
+#define jpeg_idct_float_sse	jRDSfloat		/* jisseflt.asm */
+#define jpeg_idct_float_sse2	jRD2float		/* jiss2flt.asm */
+#define jpeg_idct_4x4_sse2	jRDS4x4			/* jiss2red.asm */
+#define jpeg_idct_2x2_sse2	jRDS2x2			/* jiss2red.asm */
+#define jconst_fdct_float	jFCfloat		/* jfdctflt.asm */
+#define jconst_fdct_islow_mmx	jFCMislow		/* jfmmxint.asm */
+#define jconst_fdct_ifast_mmx	jFCMifast		/* jfmmxfst.asm */
+#define jconst_fdct_float_3dnow	jFC3float		/* jf3dnflt.asm */
+#define jconst_fdct_islow_sse2	jFCSislow		/* jfss2int.asm */
+#define jconst_fdct_ifast_sse2	jFCSifast		/* jfss2fst.asm */
+#define jconst_fdct_float_sse	jFCSfloat		/* jfsseflt.asm */
+#define jconst_idct_float	jRCfloat		/* jidctflt.asm */
+#define jconst_idct_islow_mmx	jRCMislow		/* jimmxint.asm */
+#define jconst_idct_ifast_mmx	jRCMifast		/* jimmxfst.asm */
+#define jconst_idct_float_3dnow	jRC3float		/* ji3dnflt.asm */
+#define jconst_idct_red_mmx	jRCMred			/* jimmxred.asm */
+#define jconst_idct_islow_sse2	jRCSislow		/* jiss2int.asm */
+#define jconst_idct_ifast_sse2	jRCSifast		/* jiss2fst.asm */
+#define jconst_idct_float_sse	jRCSfloat		/* jisseflt.asm */
+#define jconst_idct_float_sse2	jRC2float		/* jiss2flt.asm */
+#define jconst_idct_red_sse2	jRCSred			/* jiss2red.asm */
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 /* Extern declarations for the forward and inverse DCT routines. */
@@ -98,6 +175,47 @@
 EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
 EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
 
+EXTERN(void) jpeg_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_ifast_mmx JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+EXTERN(void) jpeg_fdct_islow_sse2 JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_ifast_sse2 JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_float_sse JPP((FAST_FLOAT * data));
+
+EXTERN(void) jpeg_convsamp_int
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_idiv
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_float
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_float
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
+EXTERN(void) jpeg_convsamp_int_mmx
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int_mmx
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_flt_3dnow
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_3dnow
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
+EXTERN(void) jpeg_convsamp_int_sse2
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace));
+EXTERN(void) jpeg_quantize_int_sse2
+    JPP((JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace));
+EXTERN(void) jpeg_convsamp_flt_sse
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_sse
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+EXTERN(void) jpeg_convsamp_flt_sse2
+    JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace));
+EXTERN(void) jpeg_quantize_flt_sse2
+    JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace));
+
 EXTERN(void) jpeg_idct_islow
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
@@ -117,6 +235,60 @@
     JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
 	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 
+EXTERN(void) jpeg_idct_islow_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_ifast_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x4_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x2_mmx
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+EXTERN(void) jpeg_idct_float_3dnow
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_float_sse
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_float_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+EXTERN(void) jpeg_idct_islow_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_ifast_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x4_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x2_sse2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+extern const int jconst_fdct_float[];
+extern const int jconst_fdct_islow_mmx[];
+extern const int jconst_fdct_ifast_mmx[];
+extern const int jconst_fdct_float_3dnow[];
+extern const int jconst_fdct_islow_sse2[];
+extern const int jconst_fdct_ifast_sse2[];
+extern const int jconst_fdct_float_sse[];
+extern const int jconst_idct_float[];
+extern const int jconst_idct_islow_mmx[];
+extern const int jconst_idct_ifast_mmx[];
+extern const int jconst_idct_float_3dnow[];
+extern const int jconst_idct_red_mmx[];
+extern const int jconst_idct_islow_sse2[];
+extern const int jconst_idct_ifast_sse2[];
+extern const int jconst_idct_float_sse[];
+extern const int jconst_idct_float_sse2[];
+extern const int jconst_idct_red_sse2[];
+
 
 /*
  * Macros for handling fixed-point arithmetic; these are used by many
diff --git a/jdct.inc b/jdct.inc
new file mode 100644
index 0000000..a6fb0ed
--- /dev/null
+++ b/jdct.inc
@@ -0,0 +1,125 @@
+;
+; jdct.inc - private declarations for forward & reverse DCT subsystems
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; Last Modified : January 5, 2006
+;
+; [TAB8]
+
+; ---- jdct.h --------------------------------------------------------------
+;
+; configuration check: BITS_IN_JSAMPLE==8 (8-bit sample values) is the only
+; valid setting on this SIMD extension.
+;
+%if BITS_IN_JSAMPLE != 8
+%error "Sorry, this SIMD code only copes with 8-bit sample values."
+%endif
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, type DCTELEM is changed to short (originally int).
+;
+%define DCTELEM			word		; short
+%define SIZEOF_DCTELEM		SIZEOF_WORD	; sizeof(DCTELEM)
+
+; To maximize parallelism, type MULTIPLIER is changed to short.
+;
+%define MULTIPLIER		word		; short
+%define SIZEOF_MULTIPLIER	SIZEOF_WORD	; sizeof(MULTIPLIER)
+%define FAST_FLOAT		FP32		; float
+%define SIZEOF_FAST_FLOAT	SIZEOF_FP32	; sizeof(FAST_FLOAT)
+
+; Each IDCT routine has its own ideas about the best dct_table element type.
+;
+%define ISLOW_MULT_TYPE 	MULTIPLIER          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE	SIZEOF_MULTIPLIER   ; sizeof(ISLOW_MULT_TYPE)
+%define IFAST_MULT_TYPE 	MULTIPLIER          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE	SIZEOF_MULTIPLIER   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS	2	; fractional bits in scale factors
+%define FLOAT_MULT_TYPE 	FAST_FLOAT          ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE	SIZEOF_FAST_FLOAT   ; sizeof(FLOAT_MULT_TYPE)
+
+; Each IDCT routine is responsible for range-limiting its results and
+; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+; be quite far out of range if the input data is corrupt, so a bulletproof
+; range-limiting step is required.  We use a mask-and-table-lookup method
+; to do the combined operations quickly.
+;
+%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_fdct_islow		jFDislow	; jfdctint.asm
+%define jpeg_fdct_ifast		jFDifast	; jfdctfst.asm
+%define jpeg_fdct_float		jFDfloat	; jfdctflt.asm
+%define jpeg_fdct_islow_mmx	jFDMislow	; jfmmxint.asm
+%define jpeg_fdct_ifast_mmx	jFDMifast	; jfmmxfst.asm
+%define jpeg_fdct_float_3dnow	jFD3float	; jf3dnflt.asm
+%define jpeg_fdct_islow_sse2	jFDSislow	; jfss2int.asm
+%define jpeg_fdct_ifast_sse2	jFDSifast	; jfss2fst.asm
+%define jpeg_fdct_float_sse	jFDSfloat	; jfsseflt.asm
+%define jpeg_convsamp_int	jCnvInt		; jcqntint.asm
+%define jpeg_quantize_int	jQntInt		; jcqntint.asm
+%define jpeg_quantize_idiv	jQntIDiv	; jcqntint.asm
+%define jpeg_convsamp_float	jCnvFloat	; jcqntflt.asm
+%define jpeg_quantize_float	jQntFloat	; jcqntflt.asm
+%define jpeg_convsamp_int_mmx	jCnvMmx		; jcqntmmx.asm
+%define jpeg_quantize_int_mmx	jQntMmx		; jcqntmmx.asm
+%define jpeg_convsamp_flt_3dnow	jCnv3dnow	; jcqnt3dn.asm
+%define jpeg_quantize_flt_3dnow	jQnt3dnow	; jcqnt3dn.asm
+%define jpeg_convsamp_int_sse2	jCnvISse2	; jcqnts2i.asm
+%define jpeg_quantize_int_sse2	jQntISse2	; jcqnts2i.asm
+%define jpeg_convsamp_flt_sse	jCnvSse		; jcqntsse.asm
+%define jpeg_quantize_flt_sse	jQntSse		; jcqntsse.asm
+%define jpeg_convsamp_flt_sse2	jCnvFSse2	; jcqnts2f.asm
+%define jpeg_quantize_flt_sse2	jQntFSse2	; jcqnts2f.asm
+%define jpeg_idct_islow		jRDislow	; jidctint.asm
+%define jpeg_idct_ifast		jRDifast	; jidctfst.asm
+%define jpeg_idct_float		jRDfloat	; jidctflt.asm
+%define jpeg_idct_4x4		jRD4x4		; jidctred.asm
+%define jpeg_idct_2x2		jRD2x2		; jidctred.asm
+%define jpeg_idct_1x1		jRD1x1		; jidctred.asm
+%define jpeg_idct_islow_mmx	jRDMislow	; jimmxint.asm
+%define jpeg_idct_ifast_mmx	jRDMifast	; jimmxfst.asm
+%define jpeg_idct_float_3dnow	jRD3float	; ji3dnflt.asm
+%define jpeg_idct_4x4_mmx	jRDM4x4		; jimmxred.asm
+%define jpeg_idct_2x2_mmx	jRDM2x2		; jimmxred.asm
+%define jpeg_idct_islow_sse2	jRDSislow	; jiss2int.asm
+%define jpeg_idct_ifast_sse2	jRDSifast	; jiss2fst.asm
+%define jpeg_idct_float_sse	jRDSfloat	; jisseflt.asm
+%define jpeg_idct_float_sse2	jRD2float	; jiss2flt.asm
+%define jpeg_idct_4x4_sse2	jRDS4x4		; jiss2red.asm
+%define jpeg_idct_2x2_sse2	jRDS2x2		; jiss2red.asm
+%define jconst_fdct_float	jFCfloat	; jfdctflt.asm
+%define jconst_fdct_islow_mmx	jFCMislow	; jfmmxint.asm
+%define jconst_fdct_ifast_mmx	jFCMifast	; jfmmxfst.asm
+%define jconst_fdct_float_3dnow	jFC3float	; jf3dnflt.asm
+%define jconst_fdct_islow_sse2	jFCSislow	; jfss2int.asm
+%define jconst_fdct_ifast_sse2	jFCSifast	; jfss2fst.asm
+%define jconst_fdct_float_sse	jFCSfloat	; jfsseflt.asm
+%define jconst_idct_float	jRCfloat	; jidctflt.asm
+%define jconst_idct_islow_mmx	jRCMislow	; jimmxint.asm
+%define jconst_idct_ifast_mmx	jRCMifast	; jimmxfst.asm
+%define jconst_idct_float_3dnow	jRC3float	; ji3dnflt.asm
+%define jconst_idct_red_mmx	jRCMred		; jimmxred.asm
+%define jconst_idct_islow_sse2	jRCSislow	; jiss2int.asm
+%define jconst_idct_ifast_sse2	jRCSifast	; jiss2fst.asm
+%define jconst_idct_float_sse	jRCSfloat	; jisseflt.asm
+%define jconst_idct_float_sse2	jRC2float	; jiss2flt.asm
+%define jconst_idct_red_sse2	jRCSred		; jiss2red.asm
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; --------------------------------------------------------------------------
+
+%define ROW(n,b,s)		((b)+(n)*(s))
+%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
+
+%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
+%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
+%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+
+; --------------------------------------------------------------------------
diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..de6df8d 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : December 24, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains the inverse-DCT management logic.
  * This code selects a particular IDCT implementation to be used,
  * and it performs related housekeeping chores.  No code in this file
@@ -94,6 +101,7 @@
   int method = 0;
   inverse_DCT_method_ptr method_ptr = NULL;
   JQUANT_TBL * qtbl;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
@@ -105,34 +113,95 @@
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 2:
-      method_ptr = jpeg_idct_2x2;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+	method_ptr = jpeg_idct_2x2_sse2;
+      else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	method_ptr = jpeg_idct_2x2_mmx;
+      else
+#endif
+	method_ptr = jpeg_idct_2x2;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
     case 4:
-      method_ptr = jpeg_idct_4x4;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+      if (simd & JSIMD_SSE2 &&
+          IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+	method_ptr = jpeg_idct_4x4_sse2;
+      else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+      if (simd & JSIMD_MMX)
+	method_ptr = jpeg_idct_4x4_mmx;
+      else
+#endif
+	method_ptr = jpeg_idct_4x4;
       method = JDCT_ISLOW;	/* jidctred uses islow-style table */
       break;
-#endif
+#endif /* IDCT_SCALING_SUPPORTED */
     case DCTSIZE:
       switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
       case JDCT_ISLOW:
-	method_ptr = jpeg_idct_islow;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_islow_sse2))
+	  method_ptr = jpeg_idct_islow_sse2;
+	else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  method_ptr = jpeg_idct_islow_mmx;
+	else
+#endif
+	  method_ptr = jpeg_idct_islow;
 	method = JDCT_ISLOW;
 	break;
-#endif
+#endif /* DCT_ISLOW_SUPPORTED */
 #ifdef DCT_IFAST_SUPPORTED
       case JDCT_IFAST:
-	method_ptr = jpeg_idct_ifast;
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_ifast_sse2))
+	  method_ptr = jpeg_idct_ifast_sse2;
+	else
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  method_ptr = jpeg_idct_ifast_mmx;
+	else
+#endif
+	  method_ptr = jpeg_idct_ifast;
 	method = JDCT_IFAST;
 	break;
-#endif
+#endif /* DCT_IFAST_SUPPORTED */
 #ifdef DCT_FLOAT_SUPPORTED
       case JDCT_FLOAT:
-	method_ptr = jpeg_idct_float;
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_idct_float_sse2))
+	  method_ptr = jpeg_idct_float_sse2;
+	else
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+	if (simd & JSIMD_SSE &&
+	    IS_CONST_ALIGNED_16(jconst_idct_float_sse))
+	  method_ptr = jpeg_idct_float_sse;
+	else
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+	if (simd & JSIMD_3DNOW)
+	  method_ptr = jpeg_idct_float_3dnow;
+	else
+#endif
+	  method_ptr = jpeg_idct_float;
 	method = JDCT_FLOAT;
 	break;
-#endif
+#endif /* DCT_FLOAT_SUPPORTED */
       default:
 	ERREXIT(cinfo, JERR_NOT_COMPILED);
 	break;
@@ -267,3 +336,78 @@
     idct->cur_method[ci] = -1;
   }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+/* Return the JSIMD_* flag of the IDCT variant matching `method', or NONE. */
+GLOBAL(unsigned int)
+jpeg_simd_inverse_dct (j_decompress_ptr cinfo, int method)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
+
+  switch (method) {	/* same test order as the IDCT selection earlier in this file */
+#ifdef DCT_ISLOW_SUPPORTED
+  case JDCT_ISLOW:
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_islow_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_ISLOW_SUPPORTED */
+#ifdef DCT_IFAST_SUPPORTED
+  case JDCT_IFAST:
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_ifast_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_IFAST_SUPPORTED */
+#ifdef DCT_FLOAT_SUPPORTED
+  case JDCT_FLOAT:
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE && simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_float_sse2))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_SSE2); */
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+    if (simd & JSIMD_SSE &&
+        IS_CONST_ALIGNED_16(jconst_idct_float_sse))
+      return JSIMD_SSE;		/* (JSIMD_SSE | JSIMD_MMX); */
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+    if (simd & JSIMD_3DNOW)
+      return JSIMD_3DNOW;	/* (JSIMD_3DNOW | JSIMD_MMX); */
+#endif
+    return JSIMD_NONE;
+#endif /* DCT_FLOAT_SUPPORTED */
+#ifdef IDCT_SCALING_SUPPORTED
+  case JDCT_FLOAT + 1:	/* pseudo-method: the reduced-size (4x4, 2x2) IDCTs */
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_idct_red_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+    if (simd & JSIMD_MMX)
+      return JSIMD_MMX;
+#endif
+    return JSIMD_NONE;
+#endif /* IDCT_SCALING_SUPPORTED */
+  default:
+    ;
+  }
+
+  return JSIMD_NONE;	/* not compiled */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jdhuff.c b/jdhuff.c
index b5ba39f..4f75ebe 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains Huffman entropy decoding routines.
  *
  * Much of the complexity here has to do with supporting input suspension.
@@ -151,8 +158,8 @@
 {
   JHUFF_TBL *htbl;
   d_derived_tbl *dtbl;
-  int p, i, l, si, numsymbols;
-  int lookbits, ctr;
+  int p, i, l, la, lx, si, numsymbols;
+  int lookbits, look_end, sym, val, ctr;
   char huffsize[257];
   unsigned int huffcode[257];
   unsigned int code;
@@ -234,18 +241,34 @@
    * with that code.
    */
 
-  MEMZERO(dtbl->look_nbits, SIZEOF(dtbl->look_nbits));
+  MEMZERO(dtbl->lookx_nbits, SIZEOF(dtbl->lookx_nbits));
 
   p = 0;
-  for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
+  for (l = 1; l <= HUFFX_LOOKAHEAD-1; l++) {
     for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
       /* l = current code's length, p = its index in huffcode[] & huffval[]. */
       /* Generate left-justified code followed by all possible bit sequences */
-      lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
-      for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
-	dtbl->look_nbits[lookbits] = l;
-	dtbl->look_sym[lookbits] = htbl->huffval[p];
-	lookbits++;
+      sym = htbl->huffval[p];		/* current symbol */
+      la = sym & 15;			/* length of additional bits field */
+      lx = HUFFX_LOOKAHEAD - l;
+      lookbits = huffcode[p] << lx;
+      look_end = lookbits + (1 << lx);
+      lx -= la;
+      while (lookbits < look_end) {
+	if (lx >= 0) {
+	  val = (lookbits >>  lx) & ((1 << la) - 1);
+	  ctr = 1 << lx;
+	} else {
+	  val = (lookbits << -lx) & ((1 << la) - 1);
+	  ctr = 1;
+	}
+	val = HUFF_EXTEND(val, la);
+	for (; ctr > 0; ctr--) {
+	  dtbl->lookx_nbits[lookbits] = l + la;
+	  dtbl->lookx_val[lookbits] = val;
+	  dtbl->lookx_sym[lookbits] = sym;
+	  lookbits++;
+	}
       }
     }
   }
@@ -271,23 +294,8 @@
  * See jdhuff.h for info about usage.
  * Note: current values of get_buffer and bits_left are passed as parameters,
  * but are returned in the corresponding fields of the state struct.
- *
- * On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
- * of get_buffer to be used.  (On machines with wider words, an even larger
- * buffer could be used.)  However, on some machines 32-bit shifts are
- * quite slow and take time proportional to the number of places shifted.
- * (This is true with most PC compilers, for instance.)  In this case it may
- * be a win to set MIN_GET_BITS to the minimum value of 15.  This reduces the
- * average shift distance at the cost of more calls to jpeg_fill_bit_buffer.
  */
 
-#ifdef SLOW_SHIFT_32
-#define MIN_GET_BITS  15	/* minimum allowable value */
-#else
-#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
-#endif
-
-
 GLOBAL(boolean)
 jpeg_fill_bit_buffer (bitread_working_state * state,
 		      register bit_buf_type get_buffer, register int bits_left,
@@ -434,32 +442,6 @@
 
 
 /*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
  * Check for a restart marker & resynchronize decoder.
  * Returns FALSE if must suspend.
  */
@@ -548,13 +530,59 @@
       /* Decode a single block's worth of coefficients */
 
       /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, dctbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
+      {		/* HUFFX_DECODE */
+	register int nb, look, t;
+	if (bits_left < HUFFX_LOOKAHEAD) {
+	  register const JOCTET * next_input_byte = br_state.next_input_byte;
+	  register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	  if (cinfo->unread_marker == 0) {
+	    while (bits_left < MIN_GET_BITS) {
+	      register int c;
+	      if (bytes_in_buffer == 0 ||
+		  (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		goto label11; }
+	      bytes_in_buffer--; next_input_byte++;
+	      get_buffer = (get_buffer << 8) | c;
+	      bits_left += 8;
+	    }
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	  } else {
+	label11:
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	    if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+	      return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      nb = 1; goto label1;
+	    }
+	  }
+	}
+	look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	if ((nb = dctbl->lookx_nbits[look]) != 0) {
+	  s = dctbl->lookx_val[look];
+	  if (nb <= HUFFX_LOOKAHEAD) {
+	    DROP_BITS(nb);
+	  } else {
+	    DROP_BITS(HUFFX_LOOKAHEAD);
+	    nb -= HUFFX_LOOKAHEAD;
+	    CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	    s += GET_BITS(nb);
+	  }
+	} else {
+	  nb = HUFFX_LOOKAHEAD;
+      label1:
+	  if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,dctbl,nb))
+	       < 0) { return FALSE; }
+	  get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	  if (s) {
+	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	    t = GET_BITS(s);
+	    s = HUFF_EXTEND(t, s);
+	  }
+	}
       }
-
       if (entropy->dc_needed[blkn]) {
 	/* Convert DC difference to actual value, update last_dc_val */
 	int ci = cinfo->MCU_membership[blkn];
@@ -569,16 +597,65 @@
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* Since zeroes are skipped, output area must be cleared beforehand */
 	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label2);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
+	  {	/* HUFFX_DECODE */
+	    register int nb, look, t;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      register const JOCTET * next_input_byte
+					      = br_state.next_input_byte;
+	      register size_t bytes_in_buffer = br_state.bytes_in_buffer;
+	      if (cinfo->unread_marker == 0) {
+		while (bits_left < MIN_GET_BITS) {
+		  register int c;
+		  if (bytes_in_buffer == 0 ||
+		      (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		    goto label21; }
+		  bytes_in_buffer--; next_input_byte++;
+		  get_buffer = (get_buffer << 8) | c;
+		  bits_left += 8;
+		}
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+	      } else {
+	    label21:
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+		if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left,0)) {
+		  return FALSE; }
+		get_buffer = br_state.get_buffer;
+		bits_left  = br_state.bits_left;
+		if (bits_left < HUFFX_LOOKAHEAD) {
+		  nb = 1; goto label2;
+		}
+	      }
+	    }
+	    look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	    if ((nb = actbl->lookx_nbits[look]) != 0) {
+	      s = actbl->lookx_val[look];
+	      r = actbl->lookx_sym[look] >> 4;
+	      if (nb <= HUFFX_LOOKAHEAD) {
+		DROP_BITS(nb);
+	      } else {
+		DROP_BITS(HUFFX_LOOKAHEAD);
+		nb -= HUFFX_LOOKAHEAD;
+		CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+		s += GET_BITS(nb);
+	      }
+	    } else {
+	      nb = HUFFX_LOOKAHEAD;
+	  label2:
+	      if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,actbl,nb))
+		   < 0) { return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      r = s >> 4; s &= 15;
+	      if (s) {
+		CHECK_BIT_BUFFER(br_state, s, return FALSE);
+		t = GET_BITS(s);
+		s = HUFF_EXTEND(t, s);
+	      }
+	    }
+	  }
 	  if (s) {
 	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    r = GET_BITS(s);
-	    s = HUFF_EXTEND(r, s);
 	    /* Output coefficient in natural (dezigzagged) order.
 	     * Note: the extra entries in jpeg_natural_order[] will save us
 	     * if k >= DCTSIZE2, which could happen if the data is corrupted.
@@ -596,15 +673,64 @@
 	/* Section F.2.2.2: decode the AC coefficients */
 	/* In this path we just discard the values */
 	for (k = 1; k < DCTSIZE2; k++) {
-	  HUFF_DECODE(s, br_state, actbl, return FALSE, label3);
-      
-	  r = s >> 4;
-	  s &= 15;
-      
+	  {	/* HUFFX_DECODE */
+	    register int nb, look;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      register const JOCTET * next_input_byte
+					      = br_state.next_input_byte;
+	      register size_t bytes_in_buffer = br_state.bytes_in_buffer;
+	      if (cinfo->unread_marker == 0) {
+		while (bits_left < MIN_GET_BITS) {
+		  register int c;
+		  if (bytes_in_buffer == 0 ||
+		      (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		    goto label31; }
+		  bytes_in_buffer--; next_input_byte++;
+		  get_buffer = (get_buffer << 8) | c;
+		  bits_left += 8;
+		}
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+	      } else {
+	    label31:
+		br_state.next_input_byte = next_input_byte;
+		br_state.bytes_in_buffer = bytes_in_buffer;
+		if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left,0)) {
+		  return FALSE; }
+		get_buffer = br_state.get_buffer;
+		bits_left  = br_state.bits_left;
+		if (bits_left < HUFFX_LOOKAHEAD) {
+		  nb = 1; goto label3;
+		}
+	      }
+	    }
+	    look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	    if ((nb = actbl->lookx_nbits[look]) != 0) {
+	      s = actbl->lookx_sym[look];
+	      r = s >> 4; s &= 15;
+	      if (nb <= HUFFX_LOOKAHEAD) {
+		DROP_BITS(nb);
+	      } else {
+		DROP_BITS(HUFFX_LOOKAHEAD);
+		nb -= HUFFX_LOOKAHEAD;
+		CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+		DROP_BITS(nb);
+	      }
+	    } else {
+	      nb = HUFFX_LOOKAHEAD;
+	  label3:
+	      if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,actbl,nb))
+		   < 0) { return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      r = s >> 4; s &= 15;
+	      if (s) {
+		CHECK_BIT_BUFFER(br_state, s, return FALSE);
+		DROP_BITS(s);
+	      }
+	    }
+	  }
 	  if (s) {
 	    k += r;
-	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	    DROP_BITS(s);
 	  } else {
 	    if (r != 15)
 	      break;
diff --git a/jdhuff.h b/jdhuff.h
index ae19b6c..b5e193e 100644
--- a/jdhuff.h
+++ b/jdhuff.h
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains declarations for Huffman entropy decoding routines
  * that are shared between the sequential decoder (jdhuff.c) and the
  * progressive decoder (jdphuff.c).  No other modules need to see these.
@@ -21,7 +28,7 @@
 
 /* Derived data constructed for each Huffman table */
 
-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+#define HUFFX_LOOKAHEAD	9	/* # of bits of lookahead */
 
 typedef struct {
   /* Basic tables: (element [0] of each array is unused) */
@@ -36,13 +43,15 @@
   /* Link to public Huffman table (needed only in jpeg_huff_decode) */
   JHUFF_TBL *pub;
 
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
+  /* Lookahead tables: indexed by the next HUFFX_LOOKAHEAD bits of
    * the input data stream.  If the next Huffman code is no more
-   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
+   * than HUFFX_LOOKAHEAD-1 bits long, we can obtain its length,
+   * the corresponding symbol, and the encoded coefficient value
+   * directly from these tables.
    */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
+  UINT8 lookx_nbits[1<<HUFFX_LOOKAHEAD];  /* code + extra bits, or 0 if too long */
+  INT16 lookx_val[1<<HUFFX_LOOKAHEAD];  /* value (unseen low bits 0), or unused */
+  UINT8 lookx_sym[1<<HUFFX_LOOKAHEAD];  /* symbol, or unused */
 } d_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
@@ -79,6 +88,21 @@
  * because not all machines measure sizeof in 8-bit bytes.
  */
 
+#ifdef SLOW_SHIFT_32
+#define MIN_GET_BITS  15	/* minimum allowable value */
+#else
+#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
+#endif
+
+/* On most machines MIN_GET_BITS should be 25 to allow the full 32-bit width
+ * of get_buffer to be used.  (On machines with wider words, an even larger
+ * buffer could be used.)  However, on some machines 32-bit shifts are
+ * quite slow and take time proportional to the number of places shifted.
+ * (This is true with most PC compilers, for instance.)  In this case it may
+ * be a win to set MIN_GET_BITS to the minimum value of 15.  This reduces the
+ * average shift distance at the cost of more calls to jpeg_fill_bit_buffer.
+ */
+
 typedef struct {		/* Bitreading state saved across MCUs */
   bit_buf_type get_buffer;	/* current bit-extraction buffer */
   int bits_left;		/* # of unused bits in it */
@@ -109,7 +133,7 @@
 	br_state.next_input_byte = cinfop->src->next_input_byte; \
 	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
 	get_buffer = permstate.get_buffer; \
-	bits_left = permstate.bits_left;
+	bits_left = permstate.bits_left
 
 #define BITREAD_SAVE_STATE(cinfop,permstate)  \
 	cinfop->src->next_input_byte = br_state.next_input_byte; \
@@ -155,47 +179,14 @@
 	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
 	     register int bits_left, int nbits));
 
-
-/*
- * Code for extracting next Huffman-coded symbol from input bit stream.
- * Again, this is time-critical and we make the main paths be macros.
- *
- * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
- * without looping.  Usually, more than 95% of the Huffman codes will be 8
- * or fewer bits long.  The few overlength codes are handled with a loop,
- * which need not be inline code.
- *
- * Notes about the HUFF_DECODE macro:
- * 1. Near the end of the data segment, we may fail to get enough bits
- *    for a lookahead.  In that case, we do it the hard way.
- * 2. If the lookahead table contains no entry, the next code must be
- *    more than HUFF_LOOKAHEAD bits long.
- * 3. jpeg_huff_decode returns -1 if forced to suspend.
- */
-
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
-  if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-    if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
-    } \
-  } \
-  look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
-    DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
-  } else { \
-    nb = HUFF_LOOKAHEAD+1; \
-slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-	{ failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-  } \
-}
-
 /* Out-of-line case for Huffman code fetching */
 EXTERN(int) jpeg_huff_decode
 	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
 	     register int bits_left, d_derived_tbl * htbl, int min_bits));
+
+
+/*
+ * Figure F.12: extend sign bit.  s == 0 yields x unchanged; no negative
+ * value or shift count is ever shifted (undefined behavior in C).
+ */
+#define HUFF_EXTEND(x,s)  ((s) > 0 && (x) < (1<<((s)-1)) ? (x) - ((1<<(s)) - 1) : (x))
diff --git a/jdmerge.c b/jdmerge.c
index 3744446..f440d40 100644
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file contains code for merged upsampling/color conversion.
  *
  * This file combines functions from jdsample.c and jdcolor.c;
@@ -35,6 +42,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
@@ -218,6 +226,17 @@
  */
 
 
+#if RGB_PIXELSIZE == 4
+/* offset of filler byte: 0+1+2+3 == 6, so this is the offset left over (assumes R/G/B use distinct offsets in 0..3) */
+#define RGB_FILLER  (6 - (RGB_RED) - (RGB_GREEN) - (RGB_BLUE))
+/* byte pattern to fill with: 0xFF when RGBX_FILLER_0XFF is defined, 0x00 otherwise */
+#ifdef RGBX_FILLER_0XFF
+#define RGB_FILLER_BYTE 0xFF
+#else
+#define RGB_FILLER_BYTE 0x00
+#endif
+#endif /* RGB_PIXELSIZE == 4 */
+
 /*
  * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
  */
@@ -258,11 +277,17 @@
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr0++);
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -276,6 +301,9 @@
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
   }
 }
 
@@ -322,21 +350,33 @@
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr00++);
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr0 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr1 += RGB_PIXELSIZE;
     y  = GETJSAMPLE(*inptr01++);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     outptr1 += RGB_PIXELSIZE;
   }
   /* If image width is odd, do the last output column separately */
@@ -350,10 +390,16 @@
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr0[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
     y  = GETJSAMPLE(*inptr01);
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
+#if RGB_PIXELSIZE == 4
+    outptr1[RGB_FILLER] = RGB_FILLER_BYTE;
+#endif
   }
 }
 
@@ -370,6 +416,7 @@
 jinit_merged_upsampler (j_decompress_ptr cinfo)
 {
   my_upsample_ptr upsample;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   upsample = (my_upsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -382,19 +429,73 @@
 
   if (cinfo->max_v_samp_factor == 2) {
     upsample->pub.upsample = merged_2v_upsample;
-    upsample->upmethod = h2v2_merged_upsample;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2)) {
+      upsample->upmethod = jpeg_h2v2_merged_upsample_sse2;
+    } else
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      upsample->upmethod = jpeg_h2v2_merged_upsample_mmx;
+    } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+    {
+      upsample->upmethod = h2v2_merged_upsample;
+      build_ycc_rgb_table(cinfo);
+    }
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
       (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 		(size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
-    upsample->upmethod = h2v1_merged_upsample;
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+    if (simd & JSIMD_SSE2 &&
+        IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2)) {
+      upsample->upmethod = jpeg_h2v1_merged_upsample_sse2;
+    } else
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+    if (simd & JSIMD_MMX) {
+      upsample->upmethod = jpeg_h2v1_merged_upsample_mmx;
+    } else
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+    {
+      upsample->upmethod = h2v1_merged_upsample;
+      build_ycc_rgb_table(cinfo);
+    }
     /* No spare row needed */
     upsample->spare_row = NULL;
   }
-
-  build_ycc_rgb_table(cinfo);
 }
 
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+/* Report which SIMD flavor (JSIMD_SSE2, JSIMD_MMX or JSIMD_NONE) the tests below select; jinit_merged_upsampler() applies the same tests in the same order. */
+GLOBAL(unsigned int)
+jpeg_simd_merged_upsampler (j_decompress_ptr cinfo)
+{
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);	/* tested below as a mask of JSIMD_* bits */
+
+#if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+#ifdef JDMERGE_SSE2_SUPPORTED
+  if (simd & JSIMD_SSE2 &&
+      IS_CONST_ALIGNED_16(jconst_merged_upsample_sse2))
+    return JSIMD_SSE2;	/* SSE2 bit set and the SSE2 constant table passes the alignment check */
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+  if (simd & JSIMD_MMX)
+    return JSIMD_MMX;	/* MMX bit set; no alignment check is made for MMX */
+#endif
+#endif /* RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 */
+
+  return JSIMD_NONE;	/* no test above matched, or none was compiled in */
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 #endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/jdmermmx.asm b/jdmermmx.asm
new file mode 100644
index 0000000..4c88515
--- /dev/null
+++ b/jdmermmx.asm
@@ -0,0 +1,981 @@
+;
+; jdmermmx.asm - merged upsampling/color conversion (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef UPSAMPLE_MERGING_SUPPORTED
+%ifdef JDMERGE_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; FIX(x) below is x * 2^SCALEBITS, rounded
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)  (= 26345)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)  (= 18734)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)  (= 14942)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402	times 4 dw  F_0_402		; pmulhw multiplier applied to 2*Cr
+PW_MF0228	times 4 dw -F_0_228		; pmulhw multiplier applied to 2*Cb
+PW_MF0344_F0285	times 2 dw -F_0_344, F_0_285	; pmaddwd pairs for interleaved (Cb,Cr) words
+PW_ONE		times 4 dw  1			; rounding term added before "psraw x,1"
+PD_ONEHALF	times 2 dd  1 << (SCALEBITS-1)	; rounding term added before "psrad x,SCALEBITS"
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v1_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		3		; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_merged_upsample_mmx)
+
+EXTN(jpeg_h2v1_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; leave room below the saved ebp before aligning
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; kept at [original_ebp]; the epilogue pops it into esp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below ebp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]	; arguments are addressed through eax, not the realigned ebp
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return			; output_width == 0: nothing to convert
+
+	push	ecx				; park col while ecx holds in_row_group_ctr
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; input_buf[0] (rows read as Y below)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; input_buf[1] (rows read as Cb below)
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; input_buf[2] (rows read as Cr below)
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3		; mm6=CbH-128 (0xFF80 == -128 as a signed word)
+	paddw     mm4,mm3		; mm4=CbL-128
+	paddw     mm7,mm3		; mm7=CrH-128
+	paddw     mm0,mm3		; mm0=CrL-128
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]	; +1 so the halving shift below rounds
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]	; +1 so the halving shift below rounds
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5			; + CbH (first of the two "+ Cb" terms)
+	paddw	mm4,mm2			; + CbL (first of the two "+ Cb" terms)
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5		; mm6=CbH
+	movq      mm7,mm2		; mm7=CbL
+	punpcklwd mm5,mm1		; mm5=(Cb4 Cr4 Cb5 Cr5)
+	punpckhwd mm6,mm1		; mm6=(Cb6 Cr6 Cb7 Cr7)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3		; mm2=(Cb0 Cr0 Cb1 Cr1)
+	punpckhwd mm7,mm3		; mm7=(Cb2 Cr2 Cb3 Cr3)
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]	; add 1/2 (in SCALEBITS fixed point) for rounding
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS		; descale
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr
+	jmp	short .Yloop_1st	; 1st pass: the L halves are still in mm0/mm2/mm4
+	alignx	16,7
+
+.Yloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
+
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)  [xy = byte x of pixel y]
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	cmp	ecx, byte SIZEOF_MMWORD	; ecx = output columns still to write
+	jb	short .column_st16	; partial group: store only what is left
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn	; row finished exactly on a group boundary
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd		; 2nd pass reuses this chroma load via the H halves in wk()
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE  (ecx: pixels -> bytes)
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmA,mmC			; the bytes still to store are in the third qword
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmE			; the bytes still to store are in the second qword
+	sub	ecx, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmA			; overwrites Yctr (al); it is not read again on this path
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [edi+0*SIZEOF_DWORD], eax
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [edi+0*SIZEOF_WORD], ax
+	shr	eax,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)  [xy = byte x of pixel y]
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	cmp	ecx, byte SIZEOF_MMWORD	; ecx = output columns still to write
+	jb	short .column_st16	; partial group: store only what is left
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn	; row finished exactly on a group boundary
+
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
+	add	esi, byte SIZEOF_MMWORD			; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd		; 2nd pass reuses this chroma load via the H halves in wk()
+
+	add	ebx, byte SIZEOF_MMWORD			; inptr1
+	add	edx, byte SIZEOF_MMWORD			; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2	; ecx stays in pixels here; each qword holds 2 pixels
+	jb	short .column_st8
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmA,mmC			; shift the upper two qwords down
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA	; last single pixel (low dword)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:		; also entered directly when output_width is 0
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifndef USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_mmx)
+
+EXTN(jpeg_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, POINTER [cinfo(ebp)]
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; &input_buf[0][ctr]; the callee indexes by ctr again -> row 2*ctr
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp			; ebx = 3-entry substitute input_buf[] built on the stack
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; cinfo
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_mmx)	; row 2*ctr of input_buf[0] -> output_buf[0]
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi	; entry 0 of the substitute array now yields row 2*ctr+1
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi	; overwrite the pushed output_buf argument
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_mmx)	; row 2*ctr+1 -> output_buf[1]; other argument slots reused
+
+	add	esp, byte 7*SIZEOF_DWORD	; drop 4 arguments + 3 array entries
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%else  ; USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_mmx (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		10		; wk(0-2): H-half chroma terms, wk(3-5): current terms, wk(6-9): 1st-row pixels
+%define inptr1		wk(0)-SIZEOF_JSAMPROW	; JSAMPROW inptr1
+%define inptr2		inptr1-SIZEOF_JSAMPROW	; JSAMPROW inptr2
+%define gotptr		inptr2-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_mmx)
+
+EXTN(jpeg_h2v2_merged_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; leave room below the saved ebp before aligning
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; kept at [original_ebp]; the epilogue pops it into esp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [inptr2]			; reserve wk[] plus the inptr1/inptr2 slots
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]	; arguments are addressed through eax, not the realigned ebp
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return			; output_width == 0: nothing to convert
+
+	push	ecx				; park col while ecx holds in_row_group_ctr
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; last use of eax as the argument base
+	mov	eax, JSAMPROW [esi+(ecx*2+0)*SIZEOF_JSAMPROW]	; inptr00
+	mov	esi, JSAMPROW [esi+(ecx*2+1)*SIZEOF_JSAMPROW]	; inptr01
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+
+	pop	ecx		; col
+	push	eax		; inptr00
+	push	esi		; inptr01
+
+	mov	esi, JSAMPROW [edi+0*SIZEOF_JSAMPROW]		; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]		; outptr1
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movq	mm6, MMWORD [ebx]	; mm6=Cb(01234567)
+	movq	mm7, MMWORD [edx]	; mm7=Cr(01234567)
+
+	mov	JSAMPROW [inptr1], ebx	; inptr1
+	mov	JSAMPROW [inptr2], edx	; inptr2
+	pop	edx			; edx=inptr01
+	pop	ebx			; ebx=inptr00
+
+	pxor      mm1,mm1		; mm1=(all 0's)
+	pcmpeqw   mm3,mm3
+	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+	movq      mm4,mm6
+	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
+	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
+	movq      mm0,mm7
+	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
+	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
+
+	paddw     mm6,mm3		; mm6=CbH-128 (0xFF80 == -128 as a signed word)
+	paddw     mm4,mm3		; mm4=CbL-128
+	paddw     mm7,mm3		; mm7=CrH-128
+	paddw     mm0,mm3		; mm0=CrL-128
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movq	mm5,mm6			; mm5=CbH
+	movq	mm2,mm4			; mm2=CbL
+	paddw	mm6,mm6			; mm6=2*CbH
+	paddw	mm4,mm4			; mm4=2*CbL
+	movq	mm1,mm7			; mm1=CrH
+	movq	mm3,mm0			; mm3=CrL
+	paddw	mm7,mm7			; mm7=2*CrH
+	paddw	mm0,mm0			; mm0=2*CrL
+
+	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
+	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
+	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
+	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
+
+	paddw	mm6,[GOTOFF(eax,PW_ONE)]	; +1 so the halving shift below rounds
+	paddw	mm4,[GOTOFF(eax,PW_ONE)]
+	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
+	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
+	paddw	mm7,[GOTOFF(eax,PW_ONE)]	; +1 so the halving shift below rounds
+	paddw	mm0,[GOTOFF(eax,PW_ONE)]
+	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
+	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
+
+	paddw	mm6,mm5			; + CbH (first of the two "+ Cb" terms)
+	paddw	mm4,mm2			; + CbL (first of the two "+ Cb" terms)
+	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
+	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
+
+	movq      mm6,mm5		; mm6=CbH
+	movq      mm7,mm2		; mm7=CbL
+	punpcklwd mm5,mm1		; mm5=(Cb4 Cr4 Cb5 Cr5)
+	punpckhwd mm6,mm1		; mm6=(Cb6 Cr6 Cb7 Cr7)
+	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd mm2,mm3		; mm2=(Cb0 Cr0 Cb1 Cr1)
+	punpckhwd mm7,mm3		; mm7=(Cb2 Cr2 Cb3 Cr3)
+	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]	; add 1/2 (in SCALEBITS fixed point) for rounding
+	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm5,SCALEBITS		; descale
+	psrad     mm6,SCALEBITS
+	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     mm2,SCALEBITS
+	psrad     mm7,SCALEBITS
+
+	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
+
+	mov	ah,2			; YHctr
+	jmp	short .YHloop_1st	; 1st pass: the L halves are still in mm0/mm2/mm4
+	alignx	16,7
+
+.YHloop_2nd:
+	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
+	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
+	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
+	alignx	16,7
+
+.YHloop_1st:
+	movq	MMWORD [wk(3)], mm0	; wk(3)=(R-Y)(L/H)
+	movq	MMWORD [wk(4)], mm2	; wk(4)=(G-Y)(L/H)
+	movq	MMWORD [wk(5)], mm4	; wk(5)=(B-Y)(L/H)
+
+	movq	mm7, MMWORD [ebx]	; mm7=Y(01234567)
+
+	mov	al,2			; YVctr
+	jmp	short .YVloop_1st	; 1st pass: Y from inptr00 (ebx), chroma terms still in registers
+	alignx	16,7
+
+.YVloop_2nd:
+	movq	mm0, MMWORD [wk(3)]	; mm0=(R-Y)(L/H)
+	movq	mm2, MMWORD [wk(4)]	; mm2=(G-Y)(L/H)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(B-Y)(L/H)
+
+	movq	mm7, MMWORD [edx]	; mm7=Y(01234567)
+	alignx	16,7
+
+.YVloop_1st:
+	pcmpeqw	mm6,mm6
+	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	mm6,mm7			; mm6=Y(0246)=YE
+	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
+
+	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
+	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
+	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
+
+	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
+	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
+	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
+	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)  [xy = byte x of pixel y]
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
+	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
+
+	movq      mmG,mmA
+	movq      mmH,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
+	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
+
+	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
+	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
+
+	movq      mmC,mmD
+	movq      mmB,mmD
+	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
+	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
+
+	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
+
+	movq      mmF,mmE
+	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
+	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
+
+	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
+	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
+	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break	; both rows converted
+
+	movq	MMWORD [wk(6)], mmA	; park the inptr00-row pixels while the inptr01 row is converted
+	movq	MMWORD [wk(7)], mmE
+	movq	MMWORD [wk(8)], mmC
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movq	mmH, MMWORD [wk(6)]	; mmH/mmB/mmD = parked inptr00-row pixels (go to outptr0)
+	movq	mmB, MMWORD [wk(7)]
+	movq	mmD, MMWORD [wk(8)]	; mmA/mmE/mmC still hold the inptr01-row pixels (go to outptr1)
+
+	cmp	ecx, byte SIZEOF_MMWORD	; ecx = output columns still to write
+	jb	short .column_st16	; partial group: store only what is left
+
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmB
+	movq	MMWORD [esi+2*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	near .endcolumn		; rows finished exactly on a group boundary
+
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr0
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr1
+	add	ebx, byte SIZEOF_MMWORD			; inptr00
+	add	edx, byte SIZEOF_MMWORD			; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd	; 2nd pass reuses this chroma load via the H halves in wk(0-2)
+
+	push	ebx			; inptr00
+	push	edx			; inptr01
+	mov	ebx, JSAMPROW [inptr1]	; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]	; edx=inptr2
+	add	ebx, byte SIZEOF_MMWORD	; inptr1
+	add	edx, byte SIZEOF_MMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE  (ecx: pixels -> bytes)
+	cmp	ecx, byte 2*SIZEOF_MMWORD
+	jb	short .column_st8
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmB
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
+	movq	mmH,mmD			; the bytes still to store are in the third qword of each row
+	movq	mmA,mmC
+	sub	ecx, byte 2*SIZEOF_MMWORD
+	add	esi, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+	jmp	short .column_st4
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD
+	jb	short .column_st4
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmH
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmH,mmB			; the bytes still to store are in the second qword of each row
+	movq	mmA,mmE
+	sub	ecx, byte SIZEOF_MMWORD
+	add	esi, byte SIZEOF_MMWORD
+	add	edi, byte SIZEOF_MMWORD
+.column_st4:
+	movd	eax,mmH			; overwrites YVctr/YHctr (al/ah); not read again on this path
+	movd	edx,mmA			; overwrites inptr01 (edx); not read again on this path
+	cmp	ecx, byte SIZEOF_DWORD
+	jb	short .column_st2
+	mov	DWORD [esi+0*SIZEOF_DWORD], eax
+	mov	DWORD [edi+0*SIZEOF_DWORD], edx
+	psrlq	mmH,DWORD_BIT
+	psrlq	mmA,DWORD_BIT
+	movd	eax,mmH
+	movd	edx,mmA
+	sub	ecx, byte SIZEOF_DWORD
+	add	esi, byte SIZEOF_DWORD
+	add	edi, byte SIZEOF_DWORD
+.column_st2:
+	cmp	ecx, byte SIZEOF_WORD
+	jb	short .column_st1
+	mov	WORD [esi+0*SIZEOF_WORD], ax
+	mov	WORD [edi+0*SIZEOF_WORD], dx
+	shr	eax,WORD_BIT
+	shr	edx,WORD_BIT
+	sub	ecx, byte SIZEOF_WORD
+	add	esi, byte SIZEOF_WORD
+	add	edi, byte SIZEOF_WORD
+.column_st1:
+	cmp	ecx, byte SIZEOF_BYTE
+	jb	short .endcolumn
+	mov	BYTE [esi+0*SIZEOF_BYTE], al
+	mov	BYTE [edi+0*SIZEOF_BYTE], dl
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
+	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)  [xy = byte x of pixel y]
+
+	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
+	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
+	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
+	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
+
+	movq      mmC,mmA
+	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
+	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
+	movq      mmG,mmB
+	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
+	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
+
+	movq      mmD,mmA
+	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
+	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
+	movq      mmH,mmC
+	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
+	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break	; both rows converted
+
+	movq	MMWORD [wk(6)], mmA	; park the inptr00-row pixels while the inptr01 row is converted
+	movq	MMWORD [wk(7)], mmD
+	movq	MMWORD [wk(8)], mmC
+	movq	MMWORD [wk(9)], mmH
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movq	mmE, MMWORD [wk(6)]	; mmE/mmF/mmB/mmG = parked inptr00-row pixels (go to outptr0)
+	movq	mmF, MMWORD [wk(7)]
+	movq	mmB, MMWORD [wk(8)]
+	movq	mmG, MMWORD [wk(9)]	; mmA/mmD/mmC/mmH still hold the inptr01-row pixels (go to outptr1)
+
+	cmp	ecx, byte SIZEOF_MMWORD	; ecx = output columns still to write
+	jb	short .column_st16	; partial group: store only what is left
+
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmF
+	movq	MMWORD [esi+2*SIZEOF_MMWORD], mmB
+	movq	MMWORD [esi+3*SIZEOF_MMWORD], mmG
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+	sub	ecx, byte SIZEOF_MMWORD
+	jz	short .endcolumn	; rows finished exactly on a group boundary
+
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr0
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr1
+	add	ebx, byte SIZEOF_MMWORD			; inptr00
+	add	edx, byte SIZEOF_MMWORD			; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd	; 2nd pass reuses this chroma load via the H halves in wk(0-2)
+
+	push	ebx			; inptr00
+	push	edx			; inptr01
+	mov	ebx, JSAMPROW [inptr1]	; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]	; edx=inptr2
+	add	ebx, byte SIZEOF_MMWORD	; inptr1
+	add	edx, byte SIZEOF_MMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st16:
+	cmp	ecx, byte SIZEOF_MMWORD/2	; ecx stays in pixels here; each qword holds 2 pixels
+	jb	short .column_st8
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [esi+1*SIZEOF_MMWORD], mmF
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
+	movq	mmE,mmB			; shift the upper two qwords of each row down
+	movq	mmF,mmG
+	movq	mmA,mmC
+	movq	mmD,mmH
+	sub	ecx, byte SIZEOF_MMWORD/2
+	add	esi, byte 2*SIZEOF_MMWORD
+	add	edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+	cmp	ecx, byte SIZEOF_MMWORD/4
+	jb	short .column_st4
+	movq	MMWORD [esi+0*SIZEOF_MMWORD], mmE
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
+	movq	mmE,mmF
+	movq	mmA,mmD
+	sub	ecx, byte SIZEOF_MMWORD/4
+	add	esi, byte 1*SIZEOF_MMWORD
+	add	edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+	cmp	ecx, byte SIZEOF_MMWORD/8
+	jb	short .endcolumn
+	movd	DWORD [esi+0*SIZEOF_DWORD], mmE	; last single pixel of each row (low dword)
+	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	emms		; empty MMX state
+
+.return:		; also entered directly when output_width is 0
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; !USE_DEDICATED_H2V2_MERGED_UPSAMPLE_MMX
+
+%endif ; JDMERGE_MMX_SUPPORTED
+%endif ; UPSAMPLE_MERGING_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jdmerss2.asm b/jdmerss2.asm
new file mode 100644
index 0000000..b6f51c8
--- /dev/null
+++ b/jdmerss2.asm
@@ -0,0 +1,1272 @@
+;
+; jdmerss2.asm - merged upsampling/color conversion (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
+%ifdef UPSAMPLE_MERGING_SUPPORTED
+%ifdef JDMERGE_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS	16		; fraction bits: FIX(x) = round(x * 2^SCALEBITS)
+
+F_0_344	equ	 22554			; FIX(0.34414)
+F_0_714	equ	 46802			; FIX(0.71414)
+F_1_402	equ	 91881			; FIX(1.40200)
+F_1_772	equ	116130			; FIX(1.77200)
+F_0_402	equ	(F_1_402 - 65536)	; FIX(1.40200) - FIX(1)
+F_0_285	equ	( 65536 - F_0_714)	; FIX(1) - FIX(0.71414)
+F_0_228	equ	(131072 - F_1_772)	; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16		; table entries are used as 16-byte XMM memory operands
+	global	EXTN(jconst_merged_upsample_sse2)	; export the table's start label
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402	times 8 dw  F_0_402	; 8 words: pmulhw factor for the R-Y term
+PW_MF0228	times 8 dw -F_0_228	; 8 words: pmulhw factor for the B-Y term
+PW_MF0344_F0285	times 4 dw -F_0_344, F_0_285	; 4 word pairs: pmaddwd factors for interleaved (Cb,Cr)
+PW_ONE		times 8 dw  1	; 8 words: rounding term added before "psraw x,1"
+PD_ONEHALF	times 4 dd  1 << (SCALEBITS-1)	; 4 dwords: rounding term added before "psrad x,SCALEBITS"
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+; (Each Cb/Cr sample is applied to two horizontally adjacent Y samples.)
+; GLOBAL(void)
+; jpeg_h2v1_merged_upsample_sse2 (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Converts row in_row_group_ctr of the three input_buf planes into output_buf[0].
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0		; slot holding the pre-alignment frame pointer
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		3	; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_merged_upsample_sse2)
+
+EXTN(jpeg_h2v1_merged_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; keep room for the original_ebp slot
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
+	pushpic	eax		; make room for the GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]	; ecx = cinfo (args are addressed via eax)
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return		; col == 0: nothing to convert
+
+	push	ecx			; save col; ecx is reused as the row index
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+	mov	edi, JSAMPROW [edi]				; outptr
+
+	pop	ecx			; col
+
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movdqa    xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa    xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3		; xmm3=(all 1's)
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3		; xmm6=CbH-128 (0xFF80 is -128 as a word)
+	paddw     xmm4,xmm3		; xmm4=CbL-128
+	paddw     xmm7,xmm3		; xmm7=CrH-128
+	paddw     xmm0,xmm3		; xmm0=CrL-128
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[GOTOFF(eax,PW_ONE)]	; +1 so the >>1 below rounds
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]	; +1 so the >>1 below rounds
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[GOTOFF(eax,PW_ONE)]	; +1 so the >>1 below rounds
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]	; +1 so the >>1 below rounds
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5		; first of the two "+ Cb" terms (H)
+	paddw	xmm4,xmm2		; first of the two "+ Cb" terms (L)
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5		; xmm6=CbH
+	movdqa    xmm7,xmm2		; xmm7=CbL
+	punpcklwd xmm5,xmm1		; xmm5=(Cb8 Cr8 Cb9 Cr9 CbA CrA CbB CrB)
+	punpckhwd xmm6,xmm1		; xmm6=(CbC CrC CbD CrD CbE CrE CbF CrF)
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]	; dwords: Cb*-FIX(0.344)+Cr*FIX(0.285)
+	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm2,xmm3		; xmm2=(Cb0 Cr0 Cb1 Cr1 Cb2 Cr2 Cb3 Cr3)
+	punpckhwd xmm7,xmm3		; xmm7=(Cb4 Cr4 Cb5 Cr5 Cb6 Cr6 Cb7 Cr7)
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]	; dwords: Cb*-FIX(0.344)+Cr*FIX(0.285)
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]	; add 1/2 (in fixed point) to round
+	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm5,SCALEBITS		; drop the fraction bits
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]	; add 1/2 (in fixed point) to round
+	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS		; drop the fraction bits
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	al,2			; Yctr: 2 groups of 16 Y per 16 Cb/Cr (eax is reloaded at .columnloop)
+	jmp	short .Yloop_1st	; 1st pass uses the L halves still in xmm0/xmm2/xmm4
+	alignx	16,7
+
+.Yloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+	alignx	16,7
+
+.Yloop_1st:
+	movdqa	xmm7, XMMWORD [esi]	; xmm7=Y(0123456789ABCDEF)
+
+	pcmpeqw	xmm6,xmm6		; xmm6=(all 1's)
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+	; ("cp" = output byte c of pixel p; xmmA..xmmH are register aliases defined outside this file)
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA	; xmmG=xmmA (copy for the high-word unpack)
+	movdqa    xmmH,xmmA	; xmmH=xmmA (copy to be byte-shifted below)
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD	; xmmC=xmmD (copy for the high-word unpack)
+	movdqa    xmmB,xmmD	; xmmB=xmmD (copy to be byte-shifted below)
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE	; xmmF=xmmE (copy for the high-word unpack)
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE	; xmmB=xmmE (kept because xmmE is overwritten next)
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF	; xmmB=xmmF (kept because xmmF is overwritten next)
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than 16 pixels left?
+	jb	short .column_st32		; yes: store only the partial tail
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmH,xmmH			; xmmH=(all 1's)
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmF,xmmH			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; col -= 16 pixels just written
+	jz	near .endcolumn
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd		; 2nd Y group reuses the H halves saved in wk[]
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmH,xmmH			; xmmH=(all 1's)
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD	; at least 32 bytes left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmH			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmF			; xmmA=the remaining (partial) 16 bytes
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD	; at least 16 bytes left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmH			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; xmmA=the remaining (partial) 16 bytes
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	eax,ecx			; eax=remaining bytes (less than 16)
+	xor	ecx, byte 0x0F		; ecx=15-remaining
+	shl	ecx, 2			; ... as a bit count, 4 bits per unit
+	movd	xmmB,ecx
+	psrlq	xmmH,4			; xmmH qwords=0x0FFFFFFFFFFFFFFF
+	pcmpeqb	xmmE,xmmE		; xmmE=(all 1's)
+	psrlq	xmmH,xmmB
+	psrlq	xmmE,xmmB
+	punpcklbw xmmE,xmmH		; xmmE=byte mask: MSB set in the first eax bytes only
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx=misalignment of outptr
+	jz	short .adj0			; already aligned: store as is
+	add	eax,ecx
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail crosses the 16-byte boundary: store unaligned
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmG,xmmA		; (the variable byte shift is emulated with qword shifts)
+	movdqa	xmmC,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2	; move the low qword into the high qword
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmD,ecx		; xmmD=shift count in bits
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1		; shift < 64 bits: combine two partial shifts
+	movd	xmmF,ecx		; shift >= 64 bits: shift the moved qword by (count-64)
+	psllq	xmmA,xmmF
+	psllq	xmmE,xmmF
+	jmp	short .adj0
+.adj1:	neg	ecx			; ecx=64-count
+	movd	xmmF,ecx
+	psrlq	xmmA,xmmF		; bits carried from the low qword into the high qword
+	psrlq	xmmE,xmmF
+	psllq	xmmG,xmmD		; both qwords shifted left in place
+	psllq	xmmC,xmmD
+	por	xmmA,xmmG		; xmmA=data shifted left by the misalignment
+	por	xmmE,xmmC		; xmmE=mask shifted left by the misalignment
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA	; xmmC=xmmA (copy for the high-word unpack)
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB	; xmmG=xmmB (copy for the high-word unpack)
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA	; xmmD=xmmA (copy for the high-dword unpack)
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC	; xmmH=xmmC (copy for the high-dword unpack)
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	cmp	ecx, byte SIZEOF_XMMWORD	; fewer than 16 pixels left?
+	jb	short .column_st32		; yes: store only the partial tail
+
+	test	edi, SIZEOF_XMMWORD-1		; is outptr 16-byte aligned?
+	jnz	short .out1
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr
+	jmp	short .out0
+.out1:	; --(unaligned)-----------------
+	pcmpeqb    xmmE,xmmE			; xmmE=(all 1's)
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmC,xmmE			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmH,xmmE			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+.out0:
+	sub	ecx, byte SIZEOF_XMMWORD	; col -= 16 pixels just written
+	jz	near .endcolumn
+
+	add	esi, byte SIZEOF_XMMWORD	; inptr0
+	dec	al			; Yctr
+	jnz	near .Yloop_2nd		; 2nd Y group reuses the H halves saved in wk[]
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	pcmpeqb	xmmE,xmmE			; xmmE=(all 1's)
+	cmp	ecx, byte SIZEOF_XMMWORD/2	; at least 8 pixels (32 bytes) left?
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	maskmovdqu xmmD,xmmE			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmC			; shift the pending data down: A<-C
+	movdqa	xmmD,xmmH			; ... and D<-H
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4	; at least 4 pixels (16 bytes) left?
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr
+	movdqa	xmmA,xmmD			; xmmA=the remaining (partial) 16 bytes
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16	; at least 1 pixel left?
+	jb	short .endcolumn
+	mov	eax,ecx			; eax=remaining pixels (1..3)
+	xor	ecx, byte 0x03		; ecx=3-remaining
+	inc	ecx			; ecx=4-remaining
+	shl	ecx, 4			; ... times 16 bits
+	movd	xmmF,ecx
+	psrlq	xmmE,xmmF		; low qword keeps 2*eax bytes of 0xFF
+	punpcklbw xmmE,xmmE		; xmmE=byte mask: 0xFF in the first 4*eax bytes
+	; ----------------
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1	; ecx=misalignment of outptr
+	jz	short .adj0			; already aligned: store as is
+	lea	eax, [ecx+eax*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0			; tail crosses the 16-byte boundary: store unaligned
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmE,ecx
+	movdqa	xmmB,xmmA		; (the variable byte shift is emulated with qword shifts)
+	movdqa	xmmG,xmmE
+	pslldq	xmmA, SIZEOF_XMMWORD/2	; move the low qword into the high qword
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx		; xmmC=shift count in bits
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1		; shift < 64 bits: combine two partial shifts
+	movd	xmmH,ecx		; shift >= 64 bits: shift the moved qword by (count-64)
+	psllq	xmmA,xmmH
+	psllq	xmmE,xmmH
+	jmp	short .adj0
+.adj1:	neg	ecx			; ecx=64-count
+	movd	xmmH,ecx
+	psrlq	xmmA,xmmH		; bits carried from the low qword into the high qword
+	psrlq	xmmE,xmmH
+	psllq	xmmB,xmmC		; both qwords shifted left in place
+	psllq	xmmG,xmmC
+	por	xmmA,xmmB		; xmmA=data shifted left by the misalignment
+	por	xmmE,xmmG		; xmmE=mask shifted left by the misalignment
+.adj0:	; ----------------
+	maskmovdqu xmmA,xmmE			; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer (orders the non-temporal stores above)
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifndef USE_DEDICATED_H2V2_MERGED_UPSAMPLE_SSE2
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+; Generic version: calls jpeg_h2v1_merged_upsample_sse2 once per output row.
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_sse2 (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+; Both calls use the same Cb/Cr row; only the Y row and the output row advance.
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_sse2)
+
+EXTN(jpeg_h2v2_merged_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	eax, POINTER [cinfo(ebp)]	; eax = cinfo (passed through unchanged)
+
+	mov	edi, JSAMPIMAGE [input_buf(ebp)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]	; esi = input_buf[0] (Y row table)
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]	; ebx = input_buf[1]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]	; edx = input_buf[2]
+	mov	edi, JSAMPARRAY [output_buf(ebp)]
+	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]	; pre-offset by ctr: the callee adds ctr again -> Y row 2*ctr
+
+	push	edx			; inptr2
+	push	ebx			; inptr1
+	push	esi			; inptr00
+	mov	ebx,esp			; ebx = temporary 3-entry input_buf[] built on the stack
+
+	push	edi			; output_buf (outptr0)
+	push	ecx			; in_row_group_ctr
+	push	ebx			; input_buf
+	push	eax			; cinfo
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_sse2)	; first output row (callee preserves ebx/esi/edi)
+
+	add	esi, byte SIZEOF_JSAMPROW	; inptr01
+	add	edi, byte SIZEOF_JSAMPROW	; outptr1
+	mov	POINTER [ebx+0*SIZEOF_POINTER], esi	; patch entry 0 of the temporary input_buf[]
+	mov	POINTER [ebx-1*SIZEOF_POINTER], edi	; patch the output_buf argument slot
+
+	call	near EXTN(jpeg_h2v1_merged_upsample_sse2)	; second output row
+
+	add	esp, byte 7*SIZEOF_DWORD	; drop 4 arguments + 3 input_buf[] entries
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%else  ; USE_DEDICATED_H2V2_MERGED_UPSAMPLE_SSE2
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jpeg_h2v2_merged_upsample_sse2 (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
+%define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		10
+%define inptr1		wk(0)-SIZEOF_JSAMPROW	; JSAMPROW inptr1
+%define inptr2		inptr1-SIZEOF_JSAMPROW	; JSAMPROW inptr2
+%define gotptr		inptr2-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_merged_upsample_sse2)
+
+EXTN(jpeg_h2v2_merged_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [inptr2]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	ecx, POINTER [cinfo(eax)]
+	mov	ecx, JDIMENSION [jdstruct_output_width(ecx)]	; col
+	test	ecx,ecx
+	jz	near .return
+
+	push	ecx
+
+	mov	edi, JSAMPIMAGE [input_buf(eax)]
+	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
+	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+	mov	edi, JSAMPARRAY [output_buf(eax)]
+	mov	eax, JSAMPROW [esi+(ecx*2+0)*SIZEOF_JSAMPROW]	; inptr00
+	mov	esi, JSAMPROW [esi+(ecx*2+1)*SIZEOF_JSAMPROW]	; inptr01
+	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
+	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
+
+	pop	ecx		; col
+	push	eax		; inptr00
+	push	esi		; inptr01
+
+	mov	esi, JSAMPROW [edi+0*SIZEOF_JSAMPROW]		; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]		; outptr1
+	alignx	16,7
+.columnloop:
+	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
+
+	movdqa	xmm6, XMMWORD [ebx]	; xmm6=Cb(0123456789ABCDEF)
+	movdqa	xmm7, XMMWORD [edx]	; xmm7=Cr(0123456789ABCDEF)
+
+	mov	JSAMPROW [inptr1], ebx	; inptr1
+	mov	JSAMPROW [inptr2], edx	; inptr2
+	pop	edx			; edx=inptr01
+	pop	ebx			; ebx=inptr00
+
+	pxor      xmm1,xmm1		; xmm1=(all 0's)
+	pcmpeqw   xmm3,xmm3
+	psllw     xmm3,7		; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+	movdqa    xmm4,xmm6
+	punpckhbw xmm6,xmm1		; xmm6=Cb(89ABCDEF)=CbH
+	punpcklbw xmm4,xmm1		; xmm4=Cb(01234567)=CbL
+	movdqa    xmm0,xmm7
+	punpckhbw xmm7,xmm1		; xmm7=Cr(89ABCDEF)=CrH
+	punpcklbw xmm0,xmm1		; xmm0=Cr(01234567)=CrL
+
+	paddw     xmm6,xmm3
+	paddw     xmm4,xmm3
+	paddw     xmm7,xmm3
+	paddw     xmm0,xmm3
+
+	; (Original)
+	; R = Y                + 1.40200 * Cr
+	; G = Y - 0.34414 * Cb - 0.71414 * Cr
+	; B = Y + 1.77200 * Cb
+	;
+	; (This implementation)
+	; R = Y                + 0.40200 * Cr + Cr
+	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+	; B = Y - 0.22800 * Cb + Cb + Cb
+
+	movdqa	xmm5,xmm6		; xmm5=CbH
+	movdqa	xmm2,xmm4		; xmm2=CbL
+	paddw	xmm6,xmm6		; xmm6=2*CbH
+	paddw	xmm4,xmm4		; xmm4=2*CbL
+	movdqa	xmm1,xmm7		; xmm1=CrH
+	movdqa	xmm3,xmm0		; xmm3=CrL
+	paddw	xmm7,xmm7		; xmm7=2*CrH
+	paddw	xmm0,xmm0		; xmm0=2*CrL
+
+	pmulhw	xmm6,[GOTOFF(eax,PW_MF0228)]	; xmm6=(2*CbH * -FIX(0.22800))
+	pmulhw	xmm4,[GOTOFF(eax,PW_MF0228)]	; xmm4=(2*CbL * -FIX(0.22800))
+	pmulhw	xmm7,[GOTOFF(eax,PW_F0402)]	; xmm7=(2*CrH * FIX(0.40200))
+	pmulhw	xmm0,[GOTOFF(eax,PW_F0402)]	; xmm0=(2*CrL * FIX(0.40200))
+
+	paddw	xmm6,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm4,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm6,1			; xmm6=(CbH * -FIX(0.22800))
+	psraw	xmm4,1			; xmm4=(CbL * -FIX(0.22800))
+	paddw	xmm7,[GOTOFF(eax,PW_ONE)]
+	paddw	xmm0,[GOTOFF(eax,PW_ONE)]
+	psraw	xmm7,1			; xmm7=(CrH * FIX(0.40200))
+	psraw	xmm0,1			; xmm0=(CrL * FIX(0.40200))
+
+	paddw	xmm6,xmm5
+	paddw	xmm4,xmm2
+	paddw	xmm6,xmm5		; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+	paddw	xmm4,xmm2		; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+	paddw	xmm7,xmm1		; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+	paddw	xmm0,xmm3		; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=(B-Y)H
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(R-Y)H
+
+	movdqa    xmm6,xmm5
+	movdqa    xmm7,xmm2
+	punpcklwd xmm5,xmm1
+	punpckhwd xmm6,xmm1
+	pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+	punpcklwd xmm2,xmm3
+	punpckhwd xmm7,xmm3
+	pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+	pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+	paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm5,SCALEBITS
+	psrad     xmm6,SCALEBITS
+	paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+	paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+	psrad     xmm2,SCALEBITS
+	psrad     xmm7,SCALEBITS
+
+	packssdw  xmm5,xmm6	; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+	packssdw  xmm2,xmm7	; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+	psubw     xmm5,xmm1	; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+	psubw     xmm2,xmm3	; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+	movdqa	XMMWORD [wk(2)], xmm5	; wk(2)=(G-Y)H
+
+	mov	ah,2			; YHctr
+	jmp	short .YHloop_1st
+	alignx	16,7
+
+.YHloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(R-Y)H
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(G-Y)H
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(B-Y)H
+	alignx	16,7
+
+.YHloop_1st:
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(R-Y)(L/H)
+	movdqa	XMMWORD [wk(4)], xmm2	; wk(4)=(G-Y)(L/H)
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=(B-Y)(L/H)
+
+	movdqa	xmm7, XMMWORD [ebx]	; xmm7=Y(0123456789ABCDEF)
+
+	mov	al,2			; YVctr
+	jmp	short .YVloop_1st
+	alignx	16,7
+
+.YVloop_2nd:
+	movdqa	xmm0, XMMWORD [wk(3)]	; xmm0=(R-Y)(L/H)
+	movdqa	xmm2, XMMWORD [wk(4)]	; xmm2=(G-Y)(L/H)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(B-Y)(L/H)
+
+	movdqa	xmm7, XMMWORD [edx]	; xmm7=Y(0123456789ABCDEF)
+	alignx	16,7
+
+.YVloop_1st:
+	pcmpeqw	xmm6,xmm6
+	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
+	pand	xmm6,xmm7		; xmm6=Y(02468ACE)=YE
+	psrlw	xmm7,BYTE_BIT		; xmm7=Y(13579BDF)=YO
+
+	movdqa	xmm1,xmm0		; xmm1=xmm0=(R-Y)(L/H)
+	movdqa	xmm3,xmm2		; xmm3=xmm2=(G-Y)(L/H)
+	movdqa	xmm5,xmm4		; xmm5=xmm4=(B-Y)(L/H)
+
+	paddw     xmm0,xmm6		; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+	paddw     xmm1,xmm7		; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+	packuswb  xmm0,xmm0		; xmm0=R(02468ACE********)
+	packuswb  xmm1,xmm1		; xmm1=R(13579BDF********)
+
+	paddw     xmm2,xmm6		; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+	paddw     xmm3,xmm7		; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+	packuswb  xmm2,xmm2		; xmm2=G(02468ACE********)
+	packuswb  xmm3,xmm3		; xmm3=G(13579BDF********)
+
+	paddw     xmm4,xmm6		; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+	paddw     xmm5,xmm7		; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+	packuswb  xmm4,xmm4		; xmm4=B(02468ACE********)
+	packuswb  xmm5,xmm5		; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmB	; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+	punpcklbw xmmD,xmmF	; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+	movdqa    xmmG,xmmA
+	movdqa    xmmH,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+	punpckhwd xmmG,xmmE	; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+	psrldq    xmmH,2	; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+	psrldq    xmmE,2	; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+	movdqa    xmmC,xmmD
+	movdqa    xmmB,xmmD
+	punpcklwd xmmD,xmmH	; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+	punpckhwd xmmC,xmmH	; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+	psrldq    xmmB,2	; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+	movdqa    xmmF,xmmE
+	punpcklwd xmmE,xmmB	; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+	punpckhwd xmmF,xmmB	; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+	pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+	movdqa    xmmB,xmmE
+	punpckldq xmmA,xmmD	; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+	punpckldq xmmE,xmmH	; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+	punpckhdq xmmD,xmmB	; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+	pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+	movdqa    xmmB,xmmF
+	punpckldq xmmG,xmmC	; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+	punpckldq xmmF,xmmH	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+	punpckhdq xmmC,xmmB	; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+	punpcklqdq xmmA,xmmE	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+	punpcklqdq xmmD,xmmG	; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+	punpcklqdq xmmF,xmmC	; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break
+
+	movdqa	XMMWORD [wk(6)], xmmA
+	movdqa	XMMWORD [wk(7)], xmmD
+	movdqa	XMMWORD [wk(8)], xmmF
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movdqa	xmmH, XMMWORD [wk(6)]
+	movdqa	xmmB, XMMWORD [wk(7)]
+	movdqa	xmmE, XMMWORD [wk(8)]
+
+	pcmpeqb	xmmG,xmmG	; xmmG=(all 1's)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	near .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out01
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr1
+	jmp	short .out00
+.out01:	; --(unaligned)-----------------
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmD,xmmG			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmF,xmmG			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+.out00:
+	test	esi, SIZEOF_XMMWORD-1
+	jnz	short .out11
+	; --(aligned)-------------------
+	movntdq	XMMWORD [esi+0*SIZEOF_XMMWORD], xmmH
+	movntdq	XMMWORD [esi+1*SIZEOF_XMMWORD], xmmB
+	movntdq	XMMWORD [esi+2*SIZEOF_XMMWORD], xmmE
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr0
+	jmp	short .out10
+.out11:	; --(unaligned)-----------------
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmH,xmmG			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmB,xmmG			; movntdqu XMMWORD [edi], xmmB
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmE,xmmG			; movntdqu XMMWORD [edi], xmmE
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+.out10:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr00
+	add	edx, byte SIZEOF_XMMWORD	; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd
+
+	push	ebx				; inptr00
+	push	edx				; inptr01
+	mov	ebx, JSAMPROW [inptr1]		; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]		; edx=inptr2
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	lea	ecx, [ecx+ecx*2]		; imul ecx, RGB_PIXELSIZE
+	cmp	ecx, byte 2*SIZEOF_XMMWORD
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmD,xmmG			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmH,xmmG			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmB,xmmG			; movntdqu XMMWORD [edi], xmmB
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+	movdqa	xmmA,xmmF
+	movdqa	xmmH,xmmE
+	sub	ecx, byte 2*SIZEOF_XMMWORD
+	jmp	short .column_st15
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmH,xmmG			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+	movdqa	xmmA,xmmD
+	movdqa	xmmH,xmmB
+	sub	ecx, byte SIZEOF_XMMWORD
+.column_st15:
+	mov	edx,ecx
+	xor	ecx, byte 0x0F
+	shl	ecx, 2
+	movd	xmmC,ecx
+	psrlq	xmmG,4
+	pcmpeqb	xmmD,xmmD
+	psrlq	xmmG,xmmC
+	psrlq	xmmD,xmmC
+	punpcklbw xmmD,xmmG
+	movdqa    xmmB,xmmD
+	; ================
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0a
+	lea	eax, [ecx+edx]
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0a
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmD,ecx
+	movdqa	xmmF,xmmA
+	movdqa	xmmE,xmmD
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmD, SIZEOF_XMMWORD/2
+	movd	xmmC,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1a
+	movd	xmmG,ecx
+	psllq	xmmA,xmmG
+	psllq	xmmD,xmmG
+	jmp	short .adj0a
+.adj1a:	neg	ecx
+	movd	xmmG,ecx
+	psrlq	xmmA,xmmG
+	psrlq	xmmD,xmmG
+	psllq	xmmF,xmmC
+	psllq	xmmE,xmmC
+	por	xmmA,xmmF
+	por	xmmD,xmmE
+.adj0a:	; ----------------
+	maskmovdqu xmmA,xmmD			; movntdqu XMMWORD [edi], xmmA
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	; ================
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0b
+	lea	eax, [ecx+edx]
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0b
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmH,ecx & pslldq xmmB,ecx
+	movdqa	xmmG,xmmH
+	movdqa	xmmC,xmmB
+	pslldq	xmmH, SIZEOF_XMMWORD/2
+	pslldq	xmmB, SIZEOF_XMMWORD/2
+	movd	xmmF,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1b
+	movd	xmmE,ecx
+	psllq	xmmH,xmmE
+	psllq	xmmB,xmmE
+	jmp	short .adj0b
+.adj1b:	neg	ecx
+	movd	xmmE,ecx
+	psrlq	xmmH,xmmE
+	psrlq	xmmB,xmmE
+	psllq	xmmG,xmmF
+	psllq	xmmC,xmmF
+	por	xmmH,xmmG
+	por	xmmB,xmmC
+.adj0b:	; ----------------
+	maskmovdqu xmmH,xmmB			; movntdqu XMMWORD [edi], xmmH
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+	pcmpeqb   xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pcmpeqb   xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%else
+	pxor      xmm6,xmm6		; xmm6=XE=X(02468ACE********)
+	pxor      xmm7,xmm7		; xmm7=XO=X(13579BDF********)
+%endif
+	; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+	; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+	; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+	; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+	punpcklbw xmmA,xmmC	; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+	punpcklbw xmmE,xmmG	; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+	punpcklbw xmmB,xmmD	; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+	punpcklbw xmmF,xmmH	; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+	movdqa    xmmC,xmmA
+	punpcklwd xmmA,xmmE	; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+	punpckhwd xmmC,xmmE	; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+	movdqa    xmmG,xmmB
+	punpcklwd xmmB,xmmF	; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+	punpckhwd xmmG,xmmF	; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+	movdqa    xmmD,xmmA
+	punpckldq xmmA,xmmB	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+	punpckhdq xmmD,xmmB	; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+	movdqa    xmmH,xmmC
+	punpckldq xmmC,xmmG	; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+	punpckhdq xmmH,xmmG	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+	dec	al			; YVctr
+	jz	short .YVloop_break
+
+	movdqa	XMMWORD [wk(6)], xmmA
+	movdqa	XMMWORD [wk(7)], xmmD
+	movdqa	XMMWORD [wk(8)], xmmC
+	movdqa	XMMWORD [wk(9)], xmmH
+
+	jmp	near .YVloop_2nd
+	alignx	16,7
+
+.YVloop_break:
+	movdqa	xmmE, XMMWORD [wk(6)]
+	movdqa	xmmF, XMMWORD [wk(7)]
+	movdqa	xmmB, XMMWORD [wk(8)]
+
+	pcmpeqb	xmmG,xmmG	; xmmG=(all 1's)
+
+	cmp	ecx, byte SIZEOF_XMMWORD
+	jb	near .column_st32
+
+	test	edi, SIZEOF_XMMWORD-1
+	jnz	short .out01
+	; --(aligned)-------------------
+	movntdq	XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+	movntdq	XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+	movntdq	XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+	movntdq	XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+	add	edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr1
+	jmp	short .out00
+.out01:	; --(unaligned)-----------------
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmD,xmmG			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmC,xmmG			; movntdqu XMMWORD [edi], xmmC
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmH,xmmG			; movntdqu XMMWORD [edi], xmmH
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+.out00:
+	movdqa	xmmA, XMMWORD [wk(9)]
+
+	test	esi, SIZEOF_XMMWORD-1
+	jnz	short .out11
+	; --(aligned)-------------------
+	movntdq	XMMWORD [esi+0*SIZEOF_XMMWORD], xmmE
+	movntdq	XMMWORD [esi+1*SIZEOF_XMMWORD], xmmF
+	movntdq	XMMWORD [esi+2*SIZEOF_XMMWORD], xmmB
+	movntdq	XMMWORD [esi+3*SIZEOF_XMMWORD], xmmA
+	add	esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; outptr0
+	jmp	short .out10
+.out11:	; --(unaligned)-----------------
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmE,xmmG			; movntdqu XMMWORD [edi], xmmE
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmF,xmmG			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmB,xmmG			; movntdqu XMMWORD [edi], xmmB
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+.out10:
+	sub	ecx, byte SIZEOF_XMMWORD
+	jz	near .endcolumn
+
+	add	ebx, byte SIZEOF_XMMWORD	; inptr00
+	add	edx, byte SIZEOF_XMMWORD	; inptr01
+	dec	ah			; YHctr
+	jnz	near .YHloop_2nd
+
+	push	ebx				; inptr00
+	push	edx				; inptr01
+	mov	ebx, JSAMPROW [inptr1]		; ebx=inptr1
+	mov	edx, JSAMPROW [inptr2]		; edx=inptr2
+	add	ebx, byte SIZEOF_XMMWORD	; inptr1
+	add	edx, byte SIZEOF_XMMWORD	; inptr2
+	jmp	near .columnloop
+	alignx	16,7
+
+.column_st32:
+	cmp	ecx, byte SIZEOF_XMMWORD/2
+	jb	short .column_st16
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	maskmovdqu xmmD,xmmG			; movntdqu XMMWORD [edi], xmmD
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmE,xmmG			; movntdqu XMMWORD [edi], xmmE
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	maskmovdqu xmmF,xmmG			; movntdqu XMMWORD [edi], xmmF
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+	movdqa	xmmA,xmmC
+	movdqa	xmmD,xmmH
+	movdqa	xmmE,xmmB
+	movdqa	xmmF, XMMWORD [wk(9)]
+	sub	ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+	cmp	ecx, byte SIZEOF_XMMWORD/4
+	jb	short .column_st15
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	add	edi, byte SIZEOF_XMMWORD	; outptr1
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	maskmovdqu xmmE,xmmG			; movntdqu XMMWORD [edi], xmmE
+	add	edi, byte SIZEOF_XMMWORD	; outptr0
+	xchg	edi,esi				; edi=outptr1, esi=outptr0
+	movdqa	xmmA,xmmD
+	movdqa	xmmE,xmmF
+	sub	ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+	cmp	ecx, byte SIZEOF_XMMWORD/16
+	jb	near .endcolumn
+	mov	edx,ecx
+	xor	ecx, byte 0x03
+	inc	ecx
+	shl	ecx, 4
+	movd	xmmC,ecx
+	psrlq	xmmG,xmmC
+	punpcklbw xmmG,xmmG
+	movdqa    xmmH,xmmG
+	; ================
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0a
+	lea	eax, [ecx+edx*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0a
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmA,ecx & pslldq xmmG,ecx
+	movdqa	xmmB,xmmA
+	movdqa	xmmD,xmmG
+	pslldq	xmmA, SIZEOF_XMMWORD/2
+	pslldq	xmmG, SIZEOF_XMMWORD/2
+	movd	xmmF,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1a
+	movd	xmmC,ecx
+	psllq	xmmA,xmmC
+	psllq	xmmG,xmmC
+	jmp	short .adj0a
+.adj1a:	neg	ecx
+	movd	xmmC,ecx
+	psrlq	xmmA,xmmC
+	psrlq	xmmG,xmmC
+	psllq	xmmB,xmmF
+	psllq	xmmD,xmmF
+	por	xmmA,xmmB
+	por	xmmG,xmmD
+.adj0a:	; ----------------
+	maskmovdqu xmmA,xmmG			; movntdqu XMMWORD [edi], xmmA
+	xchg	edi,esi				; edi=outptr0, esi=outptr1
+	; ================
+	mov	ecx,edi
+	and	ecx, byte SIZEOF_XMMWORD-1
+	jz	short .adj0b
+	lea	eax, [ecx+edx*4]	; RGB_PIXELSIZE
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .adj0b
+	and	edi, byte (-SIZEOF_XMMWORD)	; align to 16-byte boundary
+	shl	ecx, 3			; pslldq xmmE,ecx & pslldq xmmH,ecx
+	movdqa	xmmC,xmmE
+	movdqa	xmmF,xmmH
+	pslldq	xmmE, SIZEOF_XMMWORD/2
+	pslldq	xmmH, SIZEOF_XMMWORD/2
+	movd	xmmB,ecx
+	sub	ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+	jb	short .adj1b
+	movd	xmmD,ecx
+	psllq	xmmE,xmmD
+	psllq	xmmH,xmmD
+	jmp	short .adj0b
+.adj1b:	neg	ecx
+	movd	xmmD,ecx
+	psrlq	xmmE,xmmD
+	psrlq	xmmH,xmmD
+	psllq	xmmC,xmmB
+	psllq	xmmF,xmmB
+	por	xmmE,xmmC
+	por	xmmH,xmmF
+.adj0b:	; ----------------
+	maskmovdqu xmmE,xmmH			; movntdqu XMMWORD [edi], xmmE
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+	sfence		; flush the write buffer
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; !USE_DEDICATED_H2V2_MERGED_UPSAMPLE_SSE2
+
+%endif ; JDMERGE_SSE2_SUPPORTED
+%endif ; UPSAMPLE_MERGING_SUPPORTED
+%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4
diff --git a/jdphuff.c b/jdphuff.c
index 2267809..a1d92b7 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 31, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains Huffman entropy decoding routines for progressive JPEG.
  *
  * Much of the complexity here has to do with supporting input suspension.
@@ -69,6 +76,7 @@
   d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
 
   d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
+  d_derived_tbl * dc_derived_tbls[MAX_COMPS_IN_SCAN];
 } phuff_entropy_decoder;
 
 typedef phuff_entropy_decoder * phuff_entropy_ptr;
@@ -168,6 +176,7 @@
 	tbl = compptr->dc_tbl_no;
 	jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
 				& entropy->derived_tbls[tbl]);
+	entropy->dc_derived_tbls[ci] = entropy->derived_tbls[tbl];
       }
     } else {
       tbl = compptr->ac_tbl_no;
@@ -194,32 +203,6 @@
 
 
 /*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
  * Check for a restart marker & resynchronize decoder.
  * Returns FALSE if must suspend.
  */
@@ -284,16 +267,12 @@
 
 METHODDEF(boolean)
 decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Al = cinfo->Al;
-  register int s, r;
-  int blkn, ci;
-  JBLOCKROW block;
+  int blkn;
   BITREAD_STATE_VARS;
   savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -314,21 +293,67 @@
     /* Outer loop handles each block in the MCU */
 
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      block = MCU_data[blkn];
-      ci = cinfo->MCU_membership[blkn];
-      compptr = cinfo->cur_comp_info[ci];
-      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
+      JBLOCKROW block = MCU_data[blkn];
+      int ci = cinfo->MCU_membership[blkn];
+      d_derived_tbl * tbl = entropy->dc_derived_tbls[ci];
+      register int s;
 
       /* Decode a single block's worth of coefficients */
 
       /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
-      if (s) {
-	CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	r = GET_BITS(s);
-	s = HUFF_EXTEND(r, s);
+      {		/* HUFFX_DECODE */
+	register int nb, look, t;
+	if (bits_left < HUFFX_LOOKAHEAD) {
+	  register const JOCTET * next_input_byte = br_state.next_input_byte;
+	  register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	  if (cinfo->unread_marker == 0) {
+	    while (bits_left < MIN_GET_BITS) {
+	      register int c;
+	      if (bytes_in_buffer == 0 ||
+		  (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		goto label11; }
+	      bytes_in_buffer--; next_input_byte++;
+	      get_buffer = (get_buffer << 8) | c;
+	      bits_left += 8;
+	    }
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	  } else {
+	label11:
+	    br_state.next_input_byte = next_input_byte;
+	    br_state.bytes_in_buffer = bytes_in_buffer;
+	    if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+	      return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    if (bits_left < HUFFX_LOOKAHEAD) {
+	      nb = 1; goto label1;
+	    }
+	  }
+	}
+	look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	if ((nb = tbl->lookx_nbits[look]) != 0) {
+	  s = tbl->lookx_val[look];
+	  if (nb <= HUFFX_LOOKAHEAD) {
+	    DROP_BITS(nb);
+	  } else {
+	    DROP_BITS(HUFFX_LOOKAHEAD);
+	    nb -= HUFFX_LOOKAHEAD;
+	    CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	    s += GET_BITS(nb);
+	  }
+	} else {
+	  nb = HUFFX_LOOKAHEAD;
+      label1:
+	  if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+	       < 0) { return FALSE; }
+	  get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	  if (s) {
+	    CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	    t = GET_BITS(s);
+	    s = HUFF_EXTEND(t, s);
+	  }
+	}
       }
-
       /* Convert DC difference to actual value, update last_dc_val */
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
@@ -355,15 +380,12 @@
 
 METHODDEF(boolean)
 decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Se = cinfo->Se;
   int Al = cinfo->Al;
-  register int s, k, r;
   unsigned int EOBRUN;
-  JBLOCKROW block;
   BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
@@ -384,22 +406,74 @@
 
     /* There is always only one block per MCU */
 
-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
+    if (EOBRUN > 0) {		/* if it's a band of zeroes... */
       EOBRUN--;			/* ...process it now (we do nothing) */
-    else {
+    } else {
+      JBLOCKROW block = MCU_data[0];
+      d_derived_tbl * tbl = entropy->ac_derived_tbl;
+      register int s, k, r;
+
+      /* Load up working state */
       BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-      block = MCU_data[0];
-      tbl = entropy->ac_derived_tbl;
 
       for (k = cinfo->Ss; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-	r = s >> 4;
-	s &= 15;
+	{	/* HUFFX_DECODE */
+	  register int nb, look, t;
+	  if (bits_left < HUFFX_LOOKAHEAD) {
+	    register const JOCTET * next_input_byte = br_state.next_input_byte;
+	    register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	    if (cinfo->unread_marker == 0) {
+	      while (bits_left < MIN_GET_BITS) {
+		register int c;
+		if (bytes_in_buffer == 0 ||
+		    (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		  goto label21; }
+		bytes_in_buffer--; next_input_byte++;
+		get_buffer = (get_buffer << 8) | c;
+		bits_left += 8;
+	      }
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	    } else {
+	  label21:
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	      if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+		return FALSE; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      if (bits_left < HUFFX_LOOKAHEAD) {
+		nb = 1; goto label2;
+	      }
+	    }
+	  }
+	  look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	  if ((nb = tbl->lookx_nbits[look]) != 0) {
+	    s = tbl->lookx_val[look];
+	    r = tbl->lookx_sym[look] >> 4;
+	    if (nb <= HUFFX_LOOKAHEAD) {
+	      DROP_BITS(nb);
+	    } else {
+	      DROP_BITS(HUFFX_LOOKAHEAD);
+	      nb -= HUFFX_LOOKAHEAD;
+	      CHECK_BIT_BUFFER(br_state, nb, return FALSE);
+	      s += GET_BITS(nb);
+	    }
+	  } else {
+	    nb = HUFFX_LOOKAHEAD;
+	label2:
+	    if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+		 < 0) { return FALSE; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    r = s >> 4; s &= 15;
+	    if (s) {
+	      CHECK_BIT_BUFFER(br_state, s, return FALSE);
+	      t = GET_BITS(s);
+	      s = HUFF_EXTEND(t, s);
+	    }
+	  }
+	}
 	if (s) {
 	  k += r;
-	  CHECK_BIT_BUFFER(br_state, s, return FALSE);
-	  r = GET_BITS(s);
-	  s = HUFF_EXTEND(r, s);
 	  /* Scale and output coefficient in natural (dezigzagged) order */
 	  (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
 	} else {
@@ -440,11 +514,10 @@
 
 METHODDEF(boolean)
 decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
   int blkn;
-  JBLOCKROW block;
   BITREAD_STATE_VARS;
 
   /* Process restart marker if needed; may have to suspend */
@@ -464,7 +537,7 @@
   /* Outer loop handles each block in the MCU */
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
+    JBLOCKROW block = MCU_data[blkn];
 
     /* Encoded data is simply the next bit of the two's-complement DC value */
     CHECK_BIT_BUFFER(br_state, 1, return FALSE);
@@ -489,17 +562,17 @@
 
 METHODDEF(boolean)
 decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{   
+{
   phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
   int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+  int Al = cinfo->Al;
   register int s, k, r;
   unsigned int EOBRUN;
   JBLOCKROW block;
   JCOEFPTR thiscoef;
   BITREAD_STATE_VARS;
   d_derived_tbl * tbl;
+  int pm1[2];
   int num_newnz;
   int newnz_pos[DCTSIZE2];
 
@@ -522,6 +595,13 @@
     block = MCU_data[0];
     tbl = entropy->ac_derived_tbl;
 
+    /* The pm1[] array is indexed by the result (0 or 1) of a relational
+     * operator.  This eliminates conditional branches that depend on random
+     * data; such branches result in lower performance on recent processors.
+     */
+    pm1[0] =   1  << cinfo->Al;	/* +1 in the bit position being coded */
+    pm1[1] = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+
     /* If we are forced to suspend, we must undo the assignments to any newly
      * nonzero coefficients in the block, because otherwise we'd get confused
      * next time about which coefficients were already nonzero.
@@ -535,18 +615,63 @@
 
     if (EOBRUN == 0) {
       for (; k <= Se; k++) {
-	HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-	r = s >> 4;
-	s &= 15;
-	if (s) {
-	  if (s != 1)		/* size of new coef should always be 1 */
-	    WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-	  if (GET_BITS(1))
-	    s = p1;		/* newly nonzero coef is positive */
-	  else
-	    s = m1;		/* newly nonzero coef is negative */
-	} else {
+	{	/* HUFFX_DECODE */
+	  register int nb, look, t;
+	  if (bits_left < HUFFX_LOOKAHEAD) {
+	    register const JOCTET * next_input_byte = br_state.next_input_byte;
+	    register size_t         bytes_in_buffer = br_state.bytes_in_buffer;
+	    if (cinfo->unread_marker == 0) {
+	      while (bits_left < MIN_GET_BITS) {
+		register int c;
+		if (bytes_in_buffer == 0 ||
+		    (c = GETJOCTET(*next_input_byte)) == 0xFF) {
+		  goto label31; }
+		bytes_in_buffer--; next_input_byte++;
+		get_buffer = (get_buffer << 8) | c;
+		bits_left += 8;
+	      }
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	    } else {
+	  label31:
+	      br_state.next_input_byte = next_input_byte;
+	      br_state.bytes_in_buffer = bytes_in_buffer;
+	      if (! jpeg_fill_bit_buffer(&br_state,get_buffer,bits_left, 0)) {
+		goto undoit; }
+	      get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	      if (bits_left < HUFFX_LOOKAHEAD) {
+		nb = 1; goto label3;
+	      }
+	    }
+	  }
+	  look = PEEK_BITS(HUFFX_LOOKAHEAD);
+	  if ((nb = tbl->lookx_nbits[look]) != 0) {
+	    t = tbl->lookx_sym[look];
+	    s = tbl->lookx_val[look];
+	    r = t >> 4; t &= 15;
+	    if (t <= 1) {
+	      DROP_BITS(nb);
+	    } else {		  /* size of new coef should always be 1 */
+	      WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	      DROP_BITS(nb - (t - 1));
+	      s = (s >= 0) ? 1 : -1;
+	    }
+	  } else {
+	    nb = HUFFX_LOOKAHEAD;
+	label3:
+	    if ((s=jpeg_huff_decode(&br_state,get_buffer,bits_left,tbl,nb))
+		 < 0) { goto undoit; }
+	    get_buffer = br_state.get_buffer; bits_left = br_state.bits_left;
+	    r = s >> 4; s &= 15;
+	    if (s) {
+	      if (s != 1)	    /* size of new coef should always be 1 */
+		WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
+	      CHECK_BIT_BUFFER(br_state, 1, goto undoit);
+	      s = GET_BITS(1) ? 1 : -1;
+	    }
+	  }
+	}
+	if (s == 0) {
 	  if (r != 15) {
 	    EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
 	    if (r) {
@@ -567,12 +692,8 @@
 	  if (*thiscoef != 0) {
 	    CHECK_BIT_BUFFER(br_state, 1, goto undoit);
 	    if (GET_BITS(1)) {
-	      if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-		if (*thiscoef >= 0)
-		  *thiscoef += p1;
-		else
-		  *thiscoef += m1;
-	      }
+	      if ((*thiscoef & pm1[0]) == 0) /* do nothing if already set it */
+		*thiscoef += pm1[(*thiscoef < 0)];
 	    }
 	  } else {
 	    if (--r < 0)
@@ -583,7 +704,7 @@
 	if (s) {
 	  int pos = jpeg_natural_order[k];
 	  /* Output newly nonzero coefficient */
-	  (*block)[pos] = (JCOEF) s;
+	  (*block)[pos] = (JCOEF) (s << Al);
 	  /* Remember its position in case we have to suspend */
 	  newnz_pos[num_newnz++] = pos;
 	}
@@ -601,12 +722,8 @@
 	if (*thiscoef != 0) {
 	  CHECK_BIT_BUFFER(br_state, 1, goto undoit);
 	  if (GET_BITS(1)) {
-	    if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-	      if (*thiscoef >= 0)
-		*thiscoef += p1;
-	      else
-		*thiscoef += m1;
-	    }
+	    if ((*thiscoef & pm1[0]) == 0)  /* do nothing if already set it */
+	      *thiscoef += pm1[(*thiscoef < 0)];
 	  }
 	}
       }
diff --git a/jdsammmx.asm b/jdsammmx.asm
new file mode 100644
index 0000000..bb17d37
--- /dev/null
+++ b/jdsammmx.asm
@@ -0,0 +1,893 @@
+;
+; jdsammmx.asm - upsampling (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):	; exported start of the constant table below
+
+PW_ONE		times 4 dw  1	; four 16-bit words of 1 (added to the left-neighbor term in h2v1)
+PW_TWO		times 4 dw  2	; four 16-bit words of 2 (added to the right-neighbor term in h2v1)
+PW_THREE	times 4 dw  3	; four 16-bit words of 3 (pmullw weight of the center sample)
+PW_SEVEN	times 4 dw  7	; four 16-bit words of 7 (not referenced by the h2v1 routine; presumably h2v2 -- confirm)
+PW_EIGHT	times 4 dw  8	; four 16-bit words of 8 (not referenced by the h2v1 routine; presumably h2v2 -- confirm)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jpeg_h2v1_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_fancy_upsample_mmx)
+
+EXTN(jpeg_h2v1_fancy_upsample_mmx):	; h2v1 fancy upsampling: every input block of 8 samples yields 16 output samples
+	push	ebp
+	mov	ebp,esp		; arguments are addressed as ebp+8.. through the cinfo()/compptr()/... defines
+	pushpic	ebx		; macro from jsimdext.inc (not shown here); paired with poppic at .return
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi		; restored at .return
+	push	edi		; restored at .return
+
+	get_GOT	ebx		; get GOT address (ebx is the base for the GOTOFF constant loads below)
+
+	mov	eax, POINTER [compptr(ebp)]	; eax=compptr
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; eax=colctr (downsampled width field of compptr)
+	test	eax,eax
+	jz	near .return	; zero width: nothing to do
+
+	mov	ecx, POINTER [cinfo(ebp)]	; ecx=cinfo
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; ecx=rowctr (max_v_samp_factor field of cinfo)
+	test	ecx,ecx
+	jz	near .return	; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; esi=input_data (walked as an array of row pointers)
+	mov	edi, POINTER [output_data_ptr(ebp)]	; edi=output_data_ptr
+	mov	edi, JSAMPARRAY [edi]			; edi=*output_data_ptr=output_data
+	alignx	16,7
+.rowloop:			; ---- one iteration per row; ecx=rows remaining
+	push	eax			; save colctr (eax is consumed by the column loop)
+	push	edi			; save current position in output_data
+	push	esi			; save current position in input_data
+
+	mov	esi, JSAMPROW [esi]	; esi=inptr (current input row)
+	mov	edi, JSAMPROW [edi]	; edi=outptr (current output row)
+
+	test	eax, SIZEOF_MMWORD-1	; is colctr a multiple of SIZEOF_MMWORD?
+	jz	short .skip		; yes: the last block is full, no padding sample needed
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; dl=last real sample of the row
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample one past the row end (NOTE(review): presumably the row buffer is padded -- confirm)
+.skip:
+	pxor	mm0,mm0			; mm0=(all 0's), used below to zero-extend bytes to words
+	pcmpeqb	mm7,mm7			; mm7=(all 1's)
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=mask keeping only the lowest byte
+	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]	; mm7=( 0 - - - - - - -): sample 0 serves as the "-1" neighbor of the first block
+
+	add	eax, byte SIZEOF_MMWORD-1	; round colctr up ...
+	and	eax, byte -SIZEOF_MMWORD	; ... to a multiple of SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop	; more than one block left: take the normal path
+	alignx	16,7
+
+.columnloop_last:		; last block of the row: there is no following block to read
+	pcmpeqb	mm6,mm6			; mm6=(all 1's)
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=mask keeping only the highest byte
+	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]	; mm6=( - - - - - - - 7): sample 7 serves as the "8" neighbor
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:			; not the last block: take the right-hand neighbor from the next block
+	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]	; mm6=next block
+	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm6=( - - - - - - - 8): its first sample moved to the top byte
+
+.upsample:
+	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]	; mm1=current block
+	movq	mm2,mm1
+	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
+	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
+	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
+
+	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
+	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
+
+	movq	mm7,mm1			; carry sample 7 over as the "-1" neighbor of the next block
+	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
+
+	movq      mm4,mm1		; unpack against mm0 (zero): bytes become 16-bit words
+	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
+	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
+	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]	; mm1=3*center (samples 0-3)
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=3*center (samples 4-7)
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]	; left neighbor + 1
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]	; left neighbor + 1
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]	; right neighbor + 2
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]	; right neighbor + 2
+
+	paddw	mm2,mm1			; 3*center + left + 1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
+	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
+	paddw	mm3,mm1			; 3*center + right + 2
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
+	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
+
+	psllw	mm3,BYTE_BIT		; move the odd outputs into the high byte of each word
+	psllw	mm6,BYTE_BIT
+	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2	; store output samples 0-7
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5	; store output samples 8-15
+
+	sub	eax, byte SIZEOF_MMWORD		; one input block consumed
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop	; more than one block left
+	test	eax,eax
+	jnz	near .columnloop_last	; exactly one block left
+
+	pop	esi			; restore position in input_data
+	pop	edi			; restore position in output_data
+	pop	eax			; restore colctr
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop		; loop while rows remain
+
+	emms		; empty MMX state
+
+.return:		; also the target of the zero-width / zero-rows early exits above
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx		; counterpart of the pushpic in the prologue
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h2v2_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_fancy_upsample_mmx)
+
+EXTN(jpeg_h2v2_fancy_upsample_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, POINTER [compptr(edx)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(edx)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_MMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
+	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
+	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	mm7,mm7
+	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+	pand	mm1,mm7			; mm1=( 0 - - -)
+	pand	mm2,mm7			; mm2=( 0 - - -)
+
+	movq	MMWORD [wk(0)], mm1
+	movq	MMWORD [wk(1)], mm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	mm1,mm1
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
+	movq	mm2,mm1
+
+	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
+	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
+	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      mm3,mm3		; mm3=(all 0's)
+	movq      mm4,mm0
+	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
+	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
+	movq      mm5,mm1
+	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
+	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
+	movq      mm6,mm2
+	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
+	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
+
+	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
+	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
+	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
+	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
+
+	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
+	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
+	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
+
+	movq	MMWORD [wk(2)], mm1
+	movq	MMWORD [wk(3)], mm2
+
+.upsample:
+	; -- process the upper row
+
+	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
+	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
+
+	movq	mm0,mm7
+	movq	mm4,mm3
+	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
+	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
+	movq	mm5,mm7
+	movq	mm6,mm3
+	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
+	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
+
+	por	mm0,mm4				; mm0=( 1 2 3 4)
+	por	mm5,mm6				; mm5=( 3 4 5 6)
+
+	movq	mm1,mm7
+	movq	mm2,mm3
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
+	movq	mm4,mm3
+	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
+
+	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
+	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
+
+	movq	MMWORD [wk(0)], mm4
+
+	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm7
+	paddw	mm5,mm3
+	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
+	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
+	paddw	mm0,mm7
+	paddw	mm2,mm3
+	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
+	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
+
+	psllw	mm0,BYTE_BIT
+	psllw	mm2,BYTE_BIT
+	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+	; -- process the lower row
+
+	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
+	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
+
+	movq	mm7,mm6
+	movq	mm3,mm4
+	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
+	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
+	movq	mm0,mm6
+	movq	mm2,mm4
+	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
+	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
+
+	por	mm7,mm3				; mm7=( 1 2 3 4)
+	por	mm0,mm2				; mm0=( 3 4 5 6)
+
+	movq	mm1,mm6
+	movq	mm5,mm4
+	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
+	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
+	movq	mm3,mm4
+	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
+
+	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
+	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
+
+	movq	MMWORD [wk(1)], mm3
+
+	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	mm1,mm6
+	paddw	mm0,mm4
+	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
+	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
+	paddw	mm7,mm6
+	paddw	mm5,mm4
+	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
+	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
+
+	psllw	mm7,BYTE_BIT
+	psllw	mm5,BYTE_BIT
+	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_MMWORD
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
+	cmp	eax, byte SIZEOF_MMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifdef UPSAMPLE_H1V2_SUPPORTED
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h1v2_fancy_upsample_mmx (j_decompress_ptr cinfo,
+;                               jpeg_component_info * compptr,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define gotptr		ebp-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h1v2_fancy_upsample_mmx)
+
+EXTN(jpeg_h1v2_fancy_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	pushpic	eax		; make room for the GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	add	eax, byte SIZEOF_MMWORD-1
+	and	eax, byte -SIZEOF_MMWORD	; round colctr up to whole MMWORDs
+	jz	near .return			; zero width: nothing to do
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return			; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx					; rowctr
+	push	edi					; output_data
+	push	esi					; input_data
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	pxor	mm0,mm0			; mm0=(all 0's), used to widen bytes to words
+	alignx	16,7
+
+.columnloop:
+	movq	mm1, MMWORD [ebx]	; mm1=row[ 0]( 0 1 2 3 4 5 6 7)
+	movq	mm2, MMWORD [ecx]	; mm2=row[-1]( 0 1 2 3 4 5 6 7)
+	movq	mm3, MMWORD [esi]	; mm3=row[+1]( 0 1 2 3 4 5 6 7)
+
+	pushpic	ebx			; ebx (inptr0) is reused for the GOT address
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	movq      mm4,mm1
+	punpcklbw mm1,mm0		; mm1=row[ 0]( 0 1 2 3)
+	punpckhbw mm4,mm0		; mm4=row[ 0]( 4 5 6 7)
+	movq      mm5,mm2
+	punpcklbw mm2,mm0		; mm2=row[-1]( 0 1 2 3)
+	punpckhbw mm5,mm0		; mm5=row[-1]( 4 5 6 7)
+	movq      mm6,mm3
+	punpcklbw mm3,mm0		; mm3=row[+1]( 0 1 2 3)
+	punpckhbw mm6,mm0		; mm6=row[+1]( 4 5 6 7)
+
+	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]	; mm1=row[ 0]*PW_THREE ( 0 1 2 3)
+	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]	; mm4=row[ 0]*PW_THREE ( 4 5 6 7)
+	paddw	mm2,[GOTOFF(ebx,PW_ONE)]	; mm2=row[-1]+PW_ONE ( 0 1 2 3)
+	paddw	mm5,[GOTOFF(ebx,PW_ONE)]	; mm5=row[-1]+PW_ONE ( 4 5 6 7)
+	paddw	mm3,[GOTOFF(ebx,PW_TWO)]	; mm3=row[+1]+PW_TWO ( 0 1 2 3)
+	paddw	mm6,[GOTOFF(ebx,PW_TWO)]	; mm6=row[+1]+PW_TWO ( 4 5 6 7)
+
+	paddw	mm2,mm1
+	paddw	mm5,mm4
+	psrlw	mm2,2			; mm2=Out0L=( 0 1 2 3)
+	psrlw	mm5,2			; mm5=Out0H=( 4 5 6 7)
+	paddw	mm3,mm1
+	paddw	mm6,mm4
+	psrlw	mm3,2			; mm3=Out1L=( 0 1 2 3)
+	psrlw	mm6,2			; mm6=Out1H=( 4 5 6 7)
+
+	packuswb  mm2,mm5		; mm2=Out0=( 0 1 2 3 4 5 6 7)
+	packuswb  mm3,mm6		; mm3=Out1=( 0 1 2 3 4 5 6 7)
+
+	movq	MMWORD [edx], mm2	; store 8 samples of the upper output row
+	movq	MMWORD [edi], mm3	; store 8 samples of the lower output row
+
+	poppic	ebx			; ebx = inptr0 again
+
+	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
+	add	edx, byte 1*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 1*SIZEOF_MMWORD	; outptr1
+	sub	eax, byte SIZEOF_MMWORD		; colctr -= one MMWORD of columns
+	jnz	near .columnloop
+
+	pop	esi				; input_data
+	pop	edi				; output_data
+	pop	ecx				; rowctr
+	pop	eax				; colctr
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	poppic	eax		; remove gotptr
+	pop	ebp
+	ret
+
+%endif ; UPSAMPLE_H1V2_SUPPORTED
+%endif ; JDSAMPLE_FANCY_MMX_SUPPORTED
+
+%ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+
+%ifndef JDSAMPLE_FANCY_MMX_SUPPORTED
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+%endif
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v1_upsample_mmx (j_decompress_ptr cinfo,
+;                         jpeg_component_info * compptr,
+;                         JSAMPARRAY input_data,
+;                         JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_upsample_mmx)
+
+EXTN(jpeg_h2v1_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]	; output width
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)	; round up to a multiple of 2 MMWORDs
+	jz	short .return			; zero width: nothing to do
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return			; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi				; output_data
+	push	esi				; input_data
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; 8 input samples
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0		; each of samples 0-3 doubled
+	punpckhbw mm1,mm1		; each of samples 4-7 doubled
+
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD	; 2 MMWORDs of output written
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; next 8 input samples
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2		; each of samples 0-3 doubled
+	punpckhbw mm3,mm3		; each of samples 4-7 doubled
+
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD	; 2 more MMWORDs of output written
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi				; input_data
+	pop	edi				; output_data
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v2_upsample_mmx (j_decompress_ptr cinfo,
+;                         jpeg_component_info * compptr,
+;                         JSAMPARRAY input_data,
+;                         JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_upsample_mmx)
+
+EXTN(jpeg_h2v2_upsample_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]	; output width
+	add	edx, byte (2*SIZEOF_MMWORD)-1
+	and	edx, byte -(2*SIZEOF_MMWORD)	; round up to a multiple of 2 MMWORDs
+	jz	near .return			; zero width: nothing to do
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return			; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi					; output_data
+	push	esi					; input_data
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; 8 input samples
+
+	movq      mm1,mm0
+	punpcklbw mm0,mm0		; each of samples 0-3 doubled
+	punpckhbw mm1,mm1		; each of samples 4-7 doubled
+
+	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0	; same data goes to
+	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1	; both output rows
+	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
+	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+	sub	eax, byte 2*SIZEOF_MMWORD	; 2 MMWORDs of output per row written
+	jz	short .nextrow
+
+	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; next 8 input samples
+
+	movq      mm3,mm2
+	punpcklbw mm2,mm2		; each of samples 0-3 doubled
+	punpckhbw mm3,mm3		; each of samples 4-7 doubled
+
+	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2	; same data goes to
+	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3	; both output rows
+	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
+	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+	sub	eax, byte 2*SIZEOF_MMWORD	; 2 more MMWORDs per row written
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_MMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi				; input_data
+	pop	edi				; output_data
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop
+
+	emms		; empty MMX state
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JDSAMPLE_SIMPLE_MMX_SUPPORTED
diff --git a/jdsample.c b/jdsample.c
index 80ffefb..37a6cee 100644
--- a/jdsample.c
+++ b/jdsample.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 5, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file contains upsampling routines.
  *
  * Upsampling input data is counted in "row groups".  A row group
@@ -21,6 +28,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jcolsamp.h"		/* Private declarations */
 
 
 /* Pointer to routine to upsample a single component */
@@ -285,6 +293,37 @@
 }
 
 
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+
+/*
+ * Fast processing for the common case of 1:1 horizontal and 2:1 vertical.
+ * It's still a box filter.
+ *
+ * SIMD Ext: This routine is for files that are rotated or transposed
+ *           by jpegtran.
+ */
+
+METHODDEF(void)
+h1v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	       JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  JSAMPARRAY dst_rows = *output_data_ptr;
+  int src_row = 0;		/* input row currently being duplicated */
+  int dst_row;			/* first of the two output rows it feeds */
+
+  /* Each input row is copied into two consecutive output rows. */
+  for (dst_row = 0; dst_row < cinfo->max_v_samp_factor; dst_row += 2) {
+    jcopy_sample_rows(input_data, src_row, dst_rows, dst_row,
+		      1, cinfo->output_width);
+    jcopy_sample_rows(input_data, src_row, dst_rows, dst_row + 1,
+		      1, cinfo->output_width);
+    src_row++;
+  }
+}
+
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
+
+
 /*
  * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  *
@@ -391,6 +430,52 @@
 }
 
 
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+
+/*
+ * Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+ * Again a triangle filter; see comments for h2v1 case, above.
+ *
+ * It is OK for us to reference the adjacent input rows because we demanded
+ * context from the main buffer controller (see initialization code).
+ *
+ * SIMD Ext: This routine is for files that are rotated or transposed
+ *           by jpegtran.
+ */
+
+METHODDEF(void)
+h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		     JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
+{
+  JSAMPARRAY out_rows = *output_data_ptr;
+  register JSAMPROW near_ptr, far_ptr, dst;
+  register int weighted;
+  register JDIMENSION col;
+  int src_row = 0, dst_row = 0, half;
+
+  /* Every input row yields two output rows: an upper and a lower half. */
+  while (dst_row < cinfo->max_v_samp_factor) {
+    for (half = 0; half < 2; half++) {
+      /* near_ptr: nearest input row (weight 3).  far_ptr: next nearest
+       * (weight 1), i.e. the row above for half 0, the row below for half 1.
+       */
+      near_ptr = input_data[src_row];
+      far_ptr = input_data[half == 0 ? src_row - 1 : src_row + 1];
+      dst = out_rows[dst_row++];
+
+      /* The rounding term is 1 for the upper half and 2 for the lower. */
+      for (col = compptr->downsampled_width; col > 0; col--) {
+	weighted = 3 * GETJSAMPLE(*near_ptr++) + GETJSAMPLE(*far_ptr++);
+	*dst++ = (JSAMPLE) ((weighted + half + 1) >> 2);
+      }
+    }
+    src_row++;
+  }
+}
+
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
+
+
 /*
  * Module initialization routine for upsampling.
  */
@@ -403,6 +488,7 @@
   jpeg_component_info * compptr;
   boolean need_buffer, do_fancy;
   int h_in_group, v_in_group, h_out_group, v_out_group;
+  unsigned int simd = jpeg_simd_support((j_common_ptr) cinfo);
 
   upsample = (my_upsample_ptr)
     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -447,18 +533,83 @@
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-	upsample->methods[ci] = h2v1_fancy_upsample;
-      else
-	upsample->methods[ci] = h2v1_upsample;
+      if (do_fancy && compptr->downsampled_width > 2) {
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h2v1_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v1_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v1_fancy_upsample;
+      } else {
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2)
+	  upsample->methods[ci] = jpeg_h2v1_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v1_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v1_upsample;
+      }
     } else if (h_in_group * 2 == h_out_group &&
 	       v_in_group * 2 == v_out_group) {
       /* Special cases for 2h2v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
-	upsample->methods[ci] = h2v2_fancy_upsample;
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h2v2_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v2_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v2_fancy_upsample;
+	upsample->pub.need_context_rows = TRUE;
+      } else {
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2)
+	  upsample->methods[ci] = jpeg_h2v2_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h2v2_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h2v2_upsample;
+      }
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+    } else if (h_in_group == h_out_group &&
+	       v_in_group * 2 == v_out_group) {
+      /* Special cases for 1h2v upsampling */
+      if (do_fancy) {
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+	if (simd & JSIMD_SSE2 &&
+	    IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+	  upsample->methods[ci] = jpeg_h1v2_fancy_upsample_sse2;
+	else
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+	if (simd & JSIMD_MMX)
+	  upsample->methods[ci] = jpeg_h1v2_fancy_upsample_mmx;
+	else
+#endif
+	  upsample->methods[ci] = h1v2_fancy_upsample;
 	upsample->pub.need_context_rows = TRUE;
       } else
-	upsample->methods[ci] = h2v2_upsample;
+	upsample->methods[ci] = h1v2_upsample;
+#endif /* UPSAMPLE_H1V2_SUPPORTED */
     } else if ((h_out_group % h_in_group) == 0 &&
 	       (v_out_group % v_in_group) == 0) {
       /* Generic integral-factors upsampling method */
@@ -468,11 +619,52 @@
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
     if (need_buffer) {
+      enum { SIZEOF_XMMWORD = 16 };	/* from jsimdext.inc */
       upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
 	((j_common_ptr) cinfo, JPOOL_IMAGE,
-	 (JDIMENSION) jround_up((long) cinfo->output_width,
-				(long) cinfo->max_h_samp_factor),
+	 (JDIMENSION) jround_up(jround_up((long) cinfo->output_width,
+					  (long) cinfo->max_h_samp_factor),
+				(long) (2 * SIZEOF_XMMWORD)),
 	 (JDIMENSION) cinfo->max_v_samp_factor);
     }
   }
 }
+
+
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+
+GLOBAL(unsigned int)
+jpeg_simd_upsampler (j_decompress_ptr cinfo, int do_fancy)
+{
+  unsigned int avail = jpeg_simd_support((j_common_ptr) cinfo);
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  if (do_fancy == 0)		/* non-fancy case: ask the merged upsampler */
+    return jpeg_simd_merged_upsampler(cinfo);
+#endif
+
+  if (do_fancy == 0) {		/* simple routines: SSE2 first, then MMX */
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+    if ((avail & JSIMD_SSE2) != 0)
+      return JSIMD_SSE2;
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+    if ((avail & JSIMD_MMX) != 0)
+      return JSIMD_MMX;
+#endif
+  } else {			/* fancy routines: SSE2 first, then MMX */
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+    if ((avail & JSIMD_SSE2) != 0 &&
+        IS_CONST_ALIGNED_16(jconst_fancy_upsample_sse2))
+      return JSIMD_SSE2;
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+    if ((avail & JSIMD_MMX) != 0)
+      return JSIMD_MMX;
+#endif
+  }
+
+  return JSIMD_NONE;
+}
+
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
diff --git a/jdsamss2.asm b/jdsamss2.asm
new file mode 100644
index 0000000..46fcf51
--- /dev/null
+++ b/jdsamss2.asm
@@ -0,0 +1,883 @@
+;
+; jdsamss2.asm - upsampling (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jcolsamp.inc"
+
+%ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE		times 8 dw  1
+PW_TWO		times 8 dw  2
+PW_THREE	times 8 dw  3
+PW_SEVEN	times 8 dw  7
+PW_EIGHT	times 8 dw  8
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jpeg_h2v1_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_fancy_upsample_sse2)
+
+EXTN(jpeg_h2v1_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return		; zero width: nothing to do
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return		; zero rows: nothing to do
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax			; colctr
+	push	edi			; output_data
+	push	esi			; input_data
+
+	mov	esi, JSAMPROW [esi]	; inptr
+	mov	edi, JSAMPROW [edi]	; outptr
+
+	test	eax, SIZEOF_XMMWORD-1	; is colctr a whole number of XMMWORDs?
+	jz	short .skip
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]	; last real sample
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+.skip:
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; keep only the lowest byte of the mask
+	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm7=( 0 -- -- ... -- --)
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD	; round colctr up to whole XMMWORDs
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop	; more than one block: take the general path
+	alignx	16,7
+
+.columnloop_last:
+	pcmpeqb	xmm6,xmm6		; xmm6=(all 1's)
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; keep only the highest byte of the mask
+	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm6=(-- -- ... -- 15)
+	jmp	short .upsample
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]	; next block (16 17 ...)
+	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; xmm6=(-- -- ... -- 16)
+
+.upsample:
+	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+	movdqa	xmm2,xmm1
+	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
+	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
+	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
+
+	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
+	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
+
+	movdqa	xmm7,xmm1		; carry sample 15 to the next block
+	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
+	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
+	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]	; current samples * 3 (low half)
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]	; current samples * 3 (high half)
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]	; left neighbours + 1 (low half)
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]	; left neighbours + 1 (high half)
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]	; right neighbours + 2 (low half)
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]	; right neighbours + 2 (high half)
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm3,BYTE_BIT		; move odd outputs into the high bytes
+	psllw	xmm6,BYTE_BIT		; move odd outputs into the high bytes
+	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+	sub	eax, byte SIZEOF_XMMWORD	; colctr -= one XMMWORD of columns
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop	; more than one block left
+	test	eax,eax
+	jnz	near .columnloop_last	; exactly one block left
+
+	pop	esi			; input_data
+	pop	edi			; output_data
+	pop	eax			; colctr
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h2v2_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+;
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		4
+%define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_fancy_upsample_sse2)
+
+EXTN(jpeg_h2v2_fancy_upsample_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	edx,eax				; edx = original ebp
+	mov	eax, POINTER [compptr(edx)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	test	eax,eax
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(edx)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
+	mov	edi, POINTER [output_data_ptr(edx)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx
+	push	edi
+	push	esi
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	test	eax, SIZEOF_XMMWORD-1
+	jz	short .skip
+	push	edx
+	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
+	pop	edx
+.skip:
+	; -- process the first column block
+
+	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
+	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
+	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	pcmpeqb	xmm7,xmm7
+	psrldq	xmm7,(SIZEOF_XMMWORD-2)
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
+	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
+
+	movdqa	XMMWORD [wk(0)], xmm1
+	movdqa	XMMWORD [wk(1)], xmm2
+
+	poppic	ebx
+
+	add	eax, byte SIZEOF_XMMWORD-1
+	and	eax, byte -SIZEOF_XMMWORD
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	short .columnloop
+	alignx	16,7
+
+.columnloop_last:
+	; -- process the last column block
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pcmpeqb	xmm1,xmm1
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)
+	movdqa	xmm2,xmm1
+
+	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
+	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
+
+	jmp	near .upsample
+	alignx	16,7
+
+.columnloop:
+	; -- process the next column block
+
+	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
+	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
+
+	pushpic	ebx
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	pxor      xmm3,xmm3		; xmm3=(all 0's)
+	movdqa    xmm4,xmm0
+	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm1
+	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm2
+	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+
+	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
+	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
+	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
+
+	movdqa	XMMWORD [wk(2)], xmm1
+	movdqa	XMMWORD [wk(3)], xmm2
+
+.upsample:
+	; -- process the upper row
+
+	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
+	movdqa	xmm5,xmm7
+	movdqa	xmm6,xmm3
+	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
+
+	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
+	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm2,xmm3
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm4,xmm3
+	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(0)], xmm4
+
+	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm7
+	paddw	xmm5,xmm3
+	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm0,xmm7
+	paddw	xmm2,xmm3
+	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm0,BYTE_BIT
+	psllw	xmm2,BYTE_BIT
+	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+	; -- process the lower row
+
+	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
+	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
+	movdqa	xmm0,xmm6
+	movdqa	xmm2,xmm4
+	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
+	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
+
+	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
+	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
+
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm4
+	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
+	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
+	movdqa	xmm3,xmm4
+	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
+
+	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
+	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
+
+	movdqa	XMMWORD [wk(1)], xmm3
+
+	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
+	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
+	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
+
+	paddw	xmm1,xmm6
+	paddw	xmm0,xmm4
+	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+	paddw	xmm7,xmm6
+	paddw	xmm5,xmm4
+	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+	psllw	xmm7,BYTE_BIT
+	psllw	xmm5,BYTE_BIT
+	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+	poppic	ebx
+
+	sub	eax, byte SIZEOF_XMMWORD
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
+	cmp	eax, byte SIZEOF_XMMWORD
+	ja	near .columnloop
+	test	eax,eax
+	jnz	near .columnloop_last
+
+	pop	esi
+	pop	edi
+	pop	ecx
+	pop	eax
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%ifdef UPSAMPLE_H1V2_SUPPORTED
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 1:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jpeg_h1v2_fancy_upsample_sse2 (j_decompress_ptr cinfo,
+;                                jpeg_component_info * compptr,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY * output_data_ptr);
+; Per sample: out0 = (row[0]*PW_THREE + row[-1] + PW_ONE) >> 2,
+;             out1 = (row[0]*PW_THREE + row[+1] + PW_TWO) >> 2.
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+%define gotptr		ebp-SIZEOF_POINTER	; void * gotptr
+
+	align	16
+	global	EXTN(jpeg_h1v2_fancy_upsample_sse2)
+
+EXTN(jpeg_h1v2_fancy_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	pushpic	eax		; make a room for GOT address
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	mov	eax, POINTER [compptr(ebp)]
+	mov	eax, JDIMENSION [jcompinfo_downsampled_width(eax)]  ; colctr
+	add	eax, byte SIZEOF_XMMWORD-1	; round colctr up to a multiple
+	and	eax, byte -SIZEOF_XMMWORD	; of SIZEOF_XMMWORD
+	jz	near .return			; rounded width is 0
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return			; no rows to produce
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	eax					; colctr
+	push	ecx					; rowctr
+	push	edi					; output_data
+	push	esi					; input_data
+
+	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
+	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
+	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+
+	pxor	xmm0,xmm0		; xmm0=(all 0's)
+	alignx	16,7
+
+.columnloop:
+	movdqa	xmm1, XMMWORD [ebx]	; xmm1=row[ 0]( 0  1  2 ... 13 14 15)
+	movdqa	xmm2, XMMWORD [ecx]	; xmm2=row[-1]( 0  1  2 ... 13 14 15)
+	movdqa	xmm3, XMMWORD [esi]	; xmm3=row[+1]( 0  1  2 ... 13 14 15)
+
+	pushpic	ebx			; ebx (inptr0) is reused as the GOT base below
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	movdqa    xmm4,xmm1
+	punpcklbw xmm1,xmm0		; xmm1=row[ 0]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm4,xmm0		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm5,xmm2
+	punpcklbw xmm2,xmm0		; xmm2=row[-1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm5,xmm0		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+	movdqa    xmm6,xmm3
+	punpcklbw xmm3,xmm0		; xmm3=row[+1]( 0  1  2  3  4  5  6  7)
+	punpckhbw xmm6,xmm0		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
+	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
+	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
+	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
+	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
+
+	paddw	xmm2,xmm1
+	paddw	xmm5,xmm4
+	psrlw	xmm2,2			; xmm2=Out0L=( 0  1  2  3  4  5  6  7)
+	psrlw	xmm5,2			; xmm5=Out0H=( 8  9 10 11 12 13 14 15)
+	paddw	xmm3,xmm1
+	paddw	xmm6,xmm4
+	psrlw	xmm3,2			; xmm3=Out1L=( 0  1  2  3  4  5  6  7)
+	psrlw	xmm6,2			; xmm6=Out1H=( 8  9 10 11 12 13 14 15)
+
+	packuswb  xmm2,xmm5		; xmm2=Out0=( 0  1  2 ... 13 14 15)
+	packuswb  xmm3,xmm6		; xmm3=Out1=( 0  1  2 ... 13 14 15)
+
+	movdqa	XMMWORD [edx], xmm2
+	movdqa	XMMWORD [edi], xmm3
+
+	poppic	ebx			; ebx = inptr0 again
+
+	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
+	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
+	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
+	add	edx, byte 1*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 1*SIZEOF_XMMWORD	; outptr1
+	sub	eax, byte SIZEOF_XMMWORD	; colctr
+	jnz	near .columnloop
+
+	pop	esi				; input_data
+	pop	edi				; output_data
+	pop	ecx				; rowctr
+	pop	eax				; colctr
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	near .rowloop			; each pass emits two output rows
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	poppic	eax		; remove gotptr
+	pop	ebp
+	ret
+
+%endif ; UPSAMPLE_H1V2_SUPPORTED
+%endif ; JDSAMPLE_FANCY_SSE2_SUPPORTED
+
+%ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+
+%ifndef JDSAMPLE_FANCY_SSE2_SUPPORTED
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+%endif
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v1_upsample_sse2 (j_decompress_ptr cinfo,
+;                          jpeg_component_info * compptr,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+; compptr is not referenced; the column count is read via jdstruct_output_width(cinfo).
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v1_upsample_sse2)
+
+EXTN(jpeg_h2v1_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1	; round the output width up to a
+	and	edx, byte -(2*SIZEOF_XMMWORD)	; multiple of 2*SIZEOF_XMMWORD
+	jz	short .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	short .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi				; output_data
+	push	esi				; input_data
+
+	mov	esi, JSAMPROW [esi]		; inptr
+	mov	edi, JSAMPROW [edi]		; outptr
+	mov	eax,edx				; colctr
+	alignx	16,7
+.columnloop:
+	; -- unrolled twice; each half expands one input XMMWORD into two
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0		; xmm0=( 0  0  1  1 ...  7  7)
+	punpckhbw xmm1,xmm1		; xmm1=( 8  8  9  9 ... 15 15)
+
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr (output bytes left)
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2		; xmm2=( 0  0  1  1 ...  7  7)
+	punpckhbw xmm3,xmm3		; xmm3=( 8  8  9  9 ... 15 15)
+
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr (output bytes left)
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi				; input_data
+	pop	edi				; output_data
+
+	add	esi, byte SIZEOF_JSAMPROW	; input_data
+	add	edi, byte SIZEOF_JSAMPROW	; output_data
+	dec	ecx				; rowctr
+	jg	short .rowloop
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jpeg_h2v2_upsample_sse2 (j_decompress_ptr cinfo,
+;                          jpeg_component_info * compptr,
+;                          JSAMPARRAY input_data,
+;                          JSAMPARRAY * output_data_ptr);
+; compptr is not referenced; every expanded block is stored to two output rows.
+
+%define cinfo(b)		(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)		(b)+12		; jpeg_component_info * compptr
+%define input_data(b)		(b)+16		; JSAMPARRAY input_data
+%define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
+
+	align	16
+	global	EXTN(jpeg_h2v2_upsample_sse2)
+
+EXTN(jpeg_h2v2_upsample_sse2):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, JDIMENSION [jdstruct_output_width(edx)]
+	add	edx, byte (2*SIZEOF_XMMWORD)-1	; round the output width up to a
+	and	edx, byte -(2*SIZEOF_XMMWORD)	; multiple of 2*SIZEOF_XMMWORD
+	jz	near .return
+
+	mov	ecx, POINTER [cinfo(ebp)]
+	mov	ecx, INT [jdstruct_max_v_samp_factor(ecx)]	; rowctr
+	test	ecx,ecx
+	jz	near .return
+
+	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
+	mov	edi, POINTER [output_data_ptr(ebp)]
+	mov	edi, JSAMPARRAY [edi]			; output_data
+	alignx	16,7
+.rowloop:
+	push	edi					; output_data
+	push	esi					; input_data
+
+	mov	esi, JSAMPROW [esi]			; inptr
+	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
+	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
+	mov	eax,edx					; colctr
+	alignx	16,7
+.columnloop:
+	; -- unrolled twice; each half expands one input XMMWORD into two
+	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+	movdqa    xmm1,xmm0
+	punpcklbw xmm0,xmm0		; xmm0=( 0  0  1  1 ...  7  7)
+	punpckhbw xmm1,xmm1		; xmm1=( 8  8  9  9 ... 15 15)
+
+	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr (output bytes left)
+	jz	short .nextrow
+
+	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+	movdqa    xmm3,xmm2
+	punpcklbw xmm2,xmm2		; xmm2=( 0  0  1  1 ...  7  7)
+	punpckhbw xmm3,xmm3		; xmm3=( 8  8  9  9 ... 15 15)
+
+	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+	sub	eax, byte 2*SIZEOF_XMMWORD	; colctr (output bytes left)
+	jz	short .nextrow
+
+	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
+	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
+	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
+	jmp	short .columnloop
+	alignx	16,7
+
+.nextrow:
+	pop	esi				; input_data
+	pop	edi				; output_data
+
+	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
+	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
+	sub	ecx, byte 2			; rowctr
+	jg	short .rowloop			; each pass emits two output rows
+
+.return:
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JDSAMPLE_SIMPLE_SSE2_SUPPORTED
diff --git a/jf3dnflt.asm b/jf3dnflt.asm
new file mode 100644
index 0000000..7117dd4
--- /dev/null
+++ b/jf3dnflt.asm
@@ -0,0 +1,327 @@
+;
+; jf3dnflt.asm - floating-point FDCT (3DNow!)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+; Each constant is stored twice so that one MMWORD operand scales both packed floats.
+PD_0_382	times 2 dd  0.382683432365089771728460	; cos(PI*3/8)
+PD_0_707	times 2 dd  0.707106781186547524400844	; cos(PI*1/4)
+PD_0_541	times 2 dd  0.541196100146196984399723	; cos(PI*1/8)-cos(PI*3/8)
+PD_1_306	times 2 dd  1.306562964876376527856643	; cos(PI*1/8)+cos(PI*3/8)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float_3dnow (FAST_FLOAT * data)
+; The block is transformed in place; each pass handles two rows/columns per iteration.
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_float_3dnow)
+
+EXTN(jpeg_fdct_float_3dnow):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; make room for the saved eax
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; reloaded by "pop esp" in the epilogue
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; two rows per iteration
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 10)=data0
+	punpckhdq mm4,mm1		; mm4=(01 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(06 16)=data6
+	punpckhdq mm5,mm3		; mm5=(07 17)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(02 12)=data2
+	punpckhdq mm4,mm3		; mm4=(03 13)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(04 14)=data4
+	punpckhdq mm0,mm5		; mm0=(05 15)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7			; mm6=tmp12+tmp13
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0			; mm3=tmp10-tmp12
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; next two rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/2			; two columns per iteration
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+	movq      mm4,mm0		; transpose coefficients
+	punpckldq mm0,mm1		; mm0=(00 01)=data0
+	punpckhdq mm4,mm1		; mm4=(10 11)=data1
+	movq      mm5,mm2		; transpose coefficients
+	punpckldq mm2,mm3		; mm2=(60 61)=data6
+	punpckhdq mm5,mm3		; mm5=(70 71)=data7
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
+	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
+	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
+	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
+
+	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
+
+	movq      mm4,mm1		; transpose coefficients
+	punpckldq mm1,mm3		; mm1=(20 21)=data2
+	punpckhdq mm4,mm3		; mm4=(30 31)=data3
+	movq      mm0,mm2		; transpose coefficients
+	punpckldq mm2,mm5		; mm2=(40 41)=data4
+	punpckhdq mm0,mm5		; mm0=(50 51)=data5
+
+	movq	mm3,mm4
+	movq	mm5,mm1
+	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
+	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
+	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
+	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm2,mm7
+	movq	mm0,mm6
+	pfsub	mm7,mm4			; mm7=tmp13
+	pfsub	mm6,mm1			; mm6=tmp12
+	pfadd	mm2,mm4			; mm2=tmp10
+	pfadd	mm0,mm1			; mm0=tmp11
+
+	pfadd	mm6,mm7			; mm6=tmp12+tmp13
+	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
+
+	movq	mm4,mm2
+	movq	mm1,mm7
+	pfsub	mm2,mm0			; mm2=data4
+	pfsub	mm7,mm6			; mm7=data6
+	pfadd	mm4,mm0			; mm4=data0
+	pfadd	mm1,mm6			; mm1=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
+
+	pfadd	mm3,mm5			; mm3=tmp10
+	pfadd	mm5,mm0			; mm5=tmp11
+	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
+
+	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
+
+	movq	mm2,mm3			; mm2=tmp10
+	pfsub	mm3,mm0			; mm3=tmp10-tmp12
+	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
+	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+	pfadd	mm2,mm3			; mm2=z2
+	pfadd	mm0,mm3			; mm0=z4
+
+	movq	mm7,mm6
+	pfsub	mm6,mm5			; mm6=z13
+	pfadd	mm7,mm5			; mm7=z11
+
+	movq	mm4,mm6
+	movq	mm1,mm7
+	pfsub	mm6,mm2			; mm6=data3
+	pfsub	mm7,mm0			; mm7=data7
+	pfadd	mm4,mm2			; mm4=data5
+	pfadd	mm1,mm0			; mm1=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+	add	edx, byte 2*SIZEOF_FAST_FLOAT	; next two columns
+	dec	ecx
+	jnz	near .columnloop
+
+	femms		; empty MMX/3DNow! state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jfdctflt.asm b/jfdctflt.asm
new file mode 100644
index 0000000..178e1f9
--- /dev/null
+++ b/jfdctflt.asm
@@ -0,0 +1,288 @@
+;
+; jfdctflt.asm - floating-point FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+%define ROTATOR_TYPE	FP32	; float
+
+	alignz	16
+	global	EXTN(jconst_fdct_float)
+
+EXTN(jconst_fdct_float):
+; Rotator constants, one dword each; read by fmul with ROTATOR_TYPE as the size keyword.
+F_0_382	dd	0.382683432365089771728460	; cos(PI*3/8)
+F_0_707	dd	0.707106781186547524400844	; cos(PI*1/4)
+F_0_541	dd	0.541196100146196984399723	; cos(PI*1/8)-cos(PI*3/8)
+F_1_306	dd	1.306562964876376527856643	; cos(PI*1/8)+cos(PI*3/8)
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float (FAST_FLOAT * data)
+; The block is transformed in place on the x87 stack, one row/column per iteration.
+
+%define data(b)	(b)+8		; FAST_FLOAT * data
+
+	align	16
+	global	EXTN(jpeg_fdct_float)
+
+EXTN(jpeg_fdct_float):
+	push	ebp
+	mov	ebp,esp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.rowloop:
+	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]	; tmp1=data1+data6
+	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]	; tmp0=data0+data7
+	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]	; tmp3=data3+data4
+	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]	; tmp2=data2+data5
+
+	; -- Even part
+
+	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fadd	st0,st1					; tmp12+tmp13
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z1
+
+	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
+	fsub	st0,st4
+	fxch	st0,st3
+	faddp	st4,st0
+	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st2
+	faddp	st1,st0
+
+	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]	; tmp7=data0-data7
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]	; tmp4=data3-data4
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]	; tmp6=data1-data6
+	fxch	st0,st4
+	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]	; tmp5=data2-data5
+	fxch	st0,st4
+
+	fstp	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]	; data2=tmp13+z1
+	fstp	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]	; data6=tmp13-z1
+	fstp	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]	; data4=tmp10-tmp11
+	fstp	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]	; data0=tmp10+tmp11
+
+	; -- Odd part
+
+	fadd	st2,st0					; tmp10=tmp4+tmp5
+	fadd	st0,st1					; tmp11=tmp5+tmp6
+	fxch	st0,st3
+	fadd	st1,st0					; tmp12=tmp6+tmp7
+	fxch	st0,st3
+
+	fld	st2
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z3=tmp11*0.707
+	fxch	st0,st1
+	fsub	st0,st2					; tmp10-tmp12
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]	; tmp10*0.541
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]	; z5=(tmp10-tmp12)*0.382
+	fxch	st0,st2
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]	; tmp12*1.306
+	fxch	st0,st2
+	fadd	st3,st0					; z2=tmp10*0.541+z5
+	faddp	st2,st0					; z4=tmp12*1.306+z5
+
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
+	fsubr	st0,st1
+	fxch	st0,st3
+	faddp	st1,st0
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+
+	fstp	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]	; data5=z13+z2
+	fstp	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]	; data7=z11-z4
+	fstp	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]	; data3=z13-z2
+	fstp	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]	; data1=z11+z4
+
+	add	edx, byte DCTSIZE*SIZEOF_FAST_FLOAT ; advance pointer to next row
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE
+	alignx	16,7
+.columnloop:
+	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]	; tmp1=data1+data6
+	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]	; tmp0=data0+data7
+	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]	; tmp3=data3+data4
+	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
+	fadd	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]	; tmp2=data2+data5
+
+	; -- Even part
+
+	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fadd	st0,st1					; tmp12+tmp13
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z1
+
+	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
+	fsub	st0,st4
+	fxch	st0,st3
+	faddp	st4,st0
+	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st2
+	faddp	st1,st0
+
+	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]	; tmp7=data0-data7
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]	; tmp4=data3-data4
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]	; tmp6=data1-data6
+	fxch	st0,st4
+	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
+	fsub	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]	; tmp5=data2-data5
+	fxch	st0,st4
+
+	fstp	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]	; data2=tmp13+z1
+	fstp	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]	; data6=tmp13-z1
+	fstp	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]	; data4=tmp10-tmp11
+	fstp	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]	; data0=tmp10+tmp11
+
+	; -- Odd part
+
+	fadd	st2,st0					; tmp10=tmp4+tmp5
+	fadd	st0,st1					; tmp11=tmp5+tmp6
+	fxch	st0,st3
+	fadd	st1,st0					; tmp12=tmp6+tmp7
+	fxch	st0,st3
+
+	fld	st2
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]	; z3=tmp11*0.707
+	fxch	st0,st1
+	fsub	st0,st2					; tmp10-tmp12
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]	; tmp10*0.541
+	fxch	st0,st3
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]	; z5=(tmp10-tmp12)*0.382
+	fxch	st0,st2
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]	; tmp12*1.306
+	fxch	st0,st2
+	fadd	st3,st0					; z2=tmp10*0.541+z5
+	faddp	st2,st0					; z4=tmp12*1.306+z5
+
+	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st4,st0
+
+	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
+	fsubr	st0,st1
+	fxch	st0,st3
+	faddp	st1,st0
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+
+	fstp	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]	; data5=z13+z2
+	fstp	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]	; data7=z11-z4
+	fstp	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]	; data3=z13-z2
+	fstp	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]	; data1=z11+z4
+
+	add	edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jfdctfst.asm b/jfdctfst.asm
new file mode 100644
index 0000000..c73c920
--- /dev/null
+++ b/jfdctfst.asm
@@ -0,0 +1,303 @@
+;
+; jfdctfst.asm - fast integer FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctfst.c; see the jfdctfst.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; descale reg32,nbits : arithmetic right shift of reg32 by nbits.
+; The rounding term 1<<(nbits-1) is added first only when
+; USE_ACCURATE_ROUNDING is defined; omitting the addition is a little
+; faster but yields an incorrectly rounded result half the time.
+%macro	descale 2
+%ifdef USE_ACCURATE_ROUNDING
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8 (rounding term is at most 1<<6)
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32 (term is 1<<7 or more)
+%endif
+%endif
+	sar	%1,%2	; signed shift: drop the nbits fraction bits
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fraction bits of the fixed-point F_* constants
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is written pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+; The block is transformed in place: pass 1 processes rows, pass 2 columns.
+; GLOBAL(void)
+; jpeg_fdct_ifast (DCTELEM * data)
+; ebx/esi/edi are saved and restored; eax/ecx/edx are clobbered.
+
+%define data(b)	(b)+8		; DCTELEM * data
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast)
+
+EXTN(jpeg_fdct_ifast):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process rows.
+
+	mov	ecx, DCTSIZE	; ecx = number of rows left
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	alignx	16,7
+.rowloop:
+	push	ecx		; ctr
+	push	edx		; dataptr
+
+	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	eax		; save tmp7 for the odd part
+
+	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx		; save tmp6
+
+	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	eax		; save tmp5
+
+	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx		; save tmp4 (now at [esp])
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	mov	edx, POINTER [esp+16]	; dataptr (pushed before the four tmps)
+
+	add	edi,esi		; edi=tmp12+tmp13
+	imul	edi,(F_0_707)	; edi=z1
+	descale	edi,CONST_BITS
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[esi+edi]	; ecx=data2
+	sub	esi,edi		; esi=data6
+	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], si
+
+	; -- Odd part
+
+	pop	eax	; eax=tmp4
+	pop	edx	; edx=tmp5
+	pop	ebx	; ebx=tmp6
+	pop	edi	; edi=tmp7
+
+	add	eax,edx		; eax=tmp10
+	add	edx,ebx		; edx=tmp11
+	add	ebx,edi		; ebx=tmp12, edi=tmp7
+
+	imul	edx,(F_0_707)	; edx=z3
+	descale	edx,CONST_BITS
+	lea	esi,[edi+edx]	; esi=z11
+	sub	edi,edx		; edi=z13
+
+	mov	ecx,eax		; ecx=tmp10
+	sub	eax,ebx		; eax=tmp10-tmp12
+	imul	eax,(F_0_382)	; eax=z5
+	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
+	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
+	descale	eax,CONST_BITS
+	descale	ecx,CONST_BITS
+	descale	ebx,CONST_BITS
+	add	ecx,eax		; ecx=z2
+	add	ebx,eax		; ebx=z4
+
+	pop	edx		; dataptr
+
+	lea	eax,[edi+ecx]	; eax=data5
+	sub	edi,ecx		; edi=data3
+	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], ax
+	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], di
+
+	lea	ecx,[esi+ebx]	; ecx=data1
+	sub	esi,ebx		; esi=data7
+	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], si
+
+	pop	ecx		; ctr
+
+	add	edx, byte DCTSIZE*SIZEOF_DCTELEM	; advance pointer to next row
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	ecx, DCTSIZE	; ecx = number of columns left
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	alignx	16,7
+.columnloop:
+	push	ecx		; ctr
+	push	edx		; dataptr
+
+	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	eax		; save tmp7 for the odd part
+
+	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx		; save tmp6
+
+	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	eax		; save tmp5
+
+	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx		; save tmp4 (now at [esp])
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	mov	edx, POINTER [esp+16]	; dataptr (pushed before the four tmps)
+
+	add	edi,esi		; edi=tmp12+tmp13
+	imul	edi,(F_0_707)	; edi=z1
+	descale	edi,CONST_BITS
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[esi+edi]	; ecx=data2
+	sub	esi,edi		; esi=data6
+	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], si
+
+	; -- Odd part
+
+	pop	eax	; eax=tmp4
+	pop	edx	; edx=tmp5
+	pop	ebx	; ebx=tmp6
+	pop	edi	; edi=tmp7
+
+	add	eax,edx		; eax=tmp10
+	add	edx,ebx		; edx=tmp11
+	add	ebx,edi		; ebx=tmp12, edi=tmp7
+
+	imul	edx,(F_0_707)	; edx=z3
+	descale	edx,CONST_BITS
+	lea	esi,[edi+edx]	; esi=z11
+	sub	edi,edx		; edi=z13
+
+	mov	ecx,eax		; ecx=tmp10
+	sub	eax,ebx		; eax=tmp10-tmp12
+	imul	eax,(F_0_382)	; eax=z5
+	imul	ecx,(F_0_541)	; ecx=MULTIPLY(tmp10,FIX_0_541196100)
+	imul	ebx,(F_1_306)	; ebx=MULTIPLY(tmp12,FIX_1_306562965)
+	descale	eax,CONST_BITS
+	descale	ecx,CONST_BITS
+	descale	ebx,CONST_BITS
+	add	ecx,eax		; ecx=z2
+	add	ebx,eax		; ebx=z4
+
+	pop	edx		; dataptr
+
+	lea	eax,[edi+ecx]	; eax=data5
+	sub	edi,ecx		; edi=data3
+	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], ax
+	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], di
+
+	lea	ecx,[esi+ebx]	; ecx=data1
+	sub	esi,ebx		; esi=data7
+	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], si
+
+	pop	ecx		; ctr
+
+	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jfdctint.asm b/jfdctint.asm
new file mode 100644
index 0000000..0f29725
--- /dev/null
+++ b/jfdctint.asm
@@ -0,0 +1,342 @@
+;
+; jfdctint.asm - accurate integer FDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; Descale and correctly round a DWORD value that's scaled by N bits:
+; descale reg32,N  =>  reg32 = (reg32 + (1<<(N-1))) >> N  (arithmetic shift)
+%macro	descale 2
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8 (rounding term is at most 1<<6)
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32 (term is 1<<7 or more)
+%endif
+	sar	%1,%2	; signed shift: drop the N fraction bits
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fraction bits of the fixed-point F_* constants
+%define PASS1_BITS	2	; extra scale bits left in pass 1 output, removed in pass 2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336) = round(0.298631336 * 2^13)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is written pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+; The block is transformed in place: pass 1 processes rows, pass 2 columns.
+; GLOBAL(void)
+; jpeg_fdct_islow (DCTELEM * data)
+; ebx/esi/edi are saved and restored; eax/ecx/edx are clobbered.
+
+%define data(b)	(b)+8		; DCTELEM * data
+
+	align	16
+	global	EXTN(jpeg_fdct_islow)
+
+EXTN(jpeg_fdct_islow):
+	push	ebp
+	mov	ebp,esp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE	; ecx = number of rows left
+	alignx	16,7
+.rowloop:
+	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	ecx		; ctr
+	push	eax		; save tmp7 (ends up at [esp+16])
+
+	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx		; save tmp6 (ends up at [esp+12])
+
+	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	edx		; dataptr
+	push	eax		; save tmp5 (ends up at [esp+4])
+
+	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx		; save tmp4 (at [esp])
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	edx, POINTER [esp+8]	; dataptr (saved between tmp5 and tmp6)
+	sal	ebx, PASS1_BITS	; data0 <<= PASS1_BITS
+	sal	eax, PASS1_BITS	; data4 <<= PASS1_BITS
+	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[edi+esi]	; ecx=tmp12+tmp13
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
+	add	esi,ecx		; esi=data2
+	add	edi,ecx		; edi=data6
+	descale	esi,(CONST_BITS-PASS1_BITS)
+	descale	edi,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], si
+	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], di
+
+	; -- Odd part
+
+	mov	eax, INT32 [esp]	; eax=tmp4
+	mov	ebx, INT32 [esp+4]	; ebx=tmp5
+	mov	ecx, INT32 [esp+12]	; ecx=tmp6
+	mov	esi, INT32 [esp+16]	; esi=tmp7
+
+	lea	edx,[eax+ecx]	; edx=z3
+	lea	edi,[ebx+esi]	; edi=z4
+	add	eax,esi		; eax=z1
+	add	ebx,ecx		; ebx=z2
+
+	lea	esi,[edx+edi]	; esi=z3+z4
+	imul	esi,(F_1_175)	; esi=z5
+
+	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	edx,esi		; edx=z3(=z3+z5)
+	add	edi,esi		; edi=z4(=z4+z5)
+
+	lea	ecx,[eax+edx]	; ecx=z1+z3
+	lea	esi,[ebx+edi]	; esi=z2+z4
+	add	eax,edi		; eax=z1+z4
+	add	ebx,edx		; ebx=z2+z3
+
+	pop	edx		; edx=tmp4
+	pop	edi		; edi=tmp5
+	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
+	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
+	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
+	add	esi,edi		; esi=data5(=tmp5+z2+z4)
+	pop	edx		; dataptr
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	esi,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], si
+
+	pop	edi		; edi=tmp6
+	pop	ecx		; ecx=tmp7
+	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
+	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
+	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
+	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
+	pop	ecx		; ctr
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], ax
+
+	add	edx, byte DCTSIZE*SIZEOF_DCTELEM	; advance pointer to next row
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE	; ecx = number of columns left
+	alignx	16,7
+.columnloop:
+	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
+	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
+	lea	esi,[eax+edi]	; esi=tmp0
+	sub	eax,edi		; eax=tmp7
+	push	ecx		; ctr
+	push	eax		; save tmp7 (ends up at [esp+16])
+
+	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
+	lea	edi,[ebx+ecx]	; edi=tmp1
+	sub	ebx,ecx		; ebx=tmp6
+	push	ebx		; save tmp6 (ends up at [esp+12])
+
+	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
+	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
+	lea	ebx,[eax+ecx]	; ebx=tmp2
+	sub	eax,ecx		; eax=tmp5
+	push	edx		; dataptr
+	push	eax		; save tmp5 (ends up at [esp+4])
+
+	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
+	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
+	lea	edx,[ecx+eax]	; edx=tmp3
+	sub	ecx,eax		; ecx=tmp4
+	push	ecx		; save tmp4 (at [esp])
+
+	; -- Even part
+
+	lea	eax,[esi+edx]	; eax=tmp10
+	lea	ecx,[edi+ebx]	; ecx=tmp11
+	sub	esi,edx		; esi=tmp13
+	sub	edi,ebx		; edi=tmp12
+
+	lea	ebx,[eax+ecx]	; ebx=data0
+	sub	eax,ecx		; eax=data4
+	mov	edx, POINTER [esp+8]	; dataptr (saved between tmp5 and tmp6)
+	descale	ebx, PASS1_BITS	; remove the PASS1_BITS scaling applied in pass 1
+	descale	eax, PASS1_BITS	; remove the PASS1_BITS scaling applied in pass 1
+	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax
+
+	lea	ecx,[edi+esi]	; ecx=tmp12+tmp13
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
+	add	esi,ecx		; esi=data2
+	add	edi,ecx		; edi=data6
+	descale	esi,(CONST_BITS+PASS1_BITS)
+	descale	edi,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], si
+	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], di
+
+	; -- Odd part
+
+	mov	eax, INT32 [esp]	; eax=tmp4
+	mov	ebx, INT32 [esp+4]	; ebx=tmp5
+	mov	ecx, INT32 [esp+12]	; ecx=tmp6
+	mov	esi, INT32 [esp+16]	; esi=tmp7
+
+	lea	edx,[eax+ecx]	; edx=z3
+	lea	edi,[ebx+esi]	; edi=z4
+	add	eax,esi		; eax=z1
+	add	ebx,ecx		; ebx=z2
+
+	lea	esi,[edx+edi]	; esi=z3+z4
+	imul	esi,(F_1_175)	; esi=z5
+
+	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	edx,esi		; edx=z3(=z3+z5)
+	add	edi,esi		; edi=z4(=z4+z5)
+
+	lea	ecx,[eax+edx]	; ecx=z1+z3
+	lea	esi,[ebx+edi]	; esi=z2+z4
+	add	eax,edi		; eax=z1+z4
+	add	ebx,edx		; ebx=z2+z3
+
+	pop	edx		; edx=tmp4
+	pop	edi		; edi=tmp5
+	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
+	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
+	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
+	add	esi,edi		; esi=data5(=tmp5+z2+z4)
+	pop	edx		; dataptr
+	descale	ecx,(CONST_BITS+PASS1_BITS)
+	descale	esi,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], cx
+	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], si
+
+	pop	edi		; edi=tmp6
+	pop	ecx		; ecx=tmp7
+	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
+	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
+	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
+	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
+	pop	ecx		; ctr
+	descale	ebx,(CONST_BITS+PASS1_BITS)
+	descale	eax,(CONST_BITS+PASS1_BITS)
+	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], bx
+	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], ax
+
+	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
+	dec	ecx
+	jnz	near .columnloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jfmmxfst.asm b/jfmmxfst.asm
new file mode 100644
index 0000000..2f8d53f
--- /dev/null
+++ b/jfmmxfst.asm
@@ -0,0 +1,404 @@
+;
+; jfmmxfst.asm - fast integer FDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fraction bits of the F_* constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433) = round(0.382683433 * 2^8)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is written pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw: inputs are pre-shifted left, constants are shifted left by CONST_SHIFT, and pmulhw keeps the high word of each product)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707	times 4 dw  F_0_707 << CONST_SHIFT	; 4 identical words, one per 16-bit lane
+PW_F0382	times 4 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 4 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 4 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+; The block is transformed in place; each loop pass handles four rows (pass 1) or four columns (pass 2).
+; GLOBAL(void)
+; jpeg_fdct_ifast_mmx (DCTELEM * data)
+; Uses mm0-mm7 and executes emms before returning; ebx is saved via pushpic/poppic.
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0	; slot that holds the unaligned frame base saved from eax
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast_mmx)
+
+EXTN(jpeg_fdct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below aligned ebp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4	; loop count: four rows per iteration
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5			; mm0=tmp12+tmp13
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6			; mm2=tmp10-tmp12
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM	; advance pointer by four rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns. (Element labels in the comments below are written column-first, the reverse of pass 1.)
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *) -- relies on eax still holding the frame base set in the prologue
+	mov	ecx, DCTSIZE/4	; loop count: four columns per iteration
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	psubw	mm5,mm7			; mm5=tmp13
+	psubw	mm0,mm4			; mm0=tmp12
+	paddw	mm1,mm7			; mm1=tmp10
+	paddw	mm6,mm4			; mm6=tmp11
+
+	paddw	mm0,mm5			; mm0=tmp12+tmp13
+	psllw	mm0,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
+
+	movq	mm7,mm1
+	movq	mm4,mm5
+	psubw	mm1,mm6			; mm1=data4
+	psubw	mm5,mm0			; mm5=data6
+	paddw	mm7,mm6			; mm7=data0
+	paddw	mm4,mm0			; mm4=data2
+
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+	; -- Odd part
+
+	movq	mm6, MMWORD [wk(0)]	; mm6=tmp6
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp7
+
+	paddw	mm2,mm3			; mm2=tmp10
+	paddw	mm3,mm6			; mm3=tmp11
+	paddw	mm6,mm0			; mm6=tmp12, mm0=tmp7
+
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm6,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	mm3,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
+
+	movq	mm1,mm2			; mm1=tmp10
+	psubw	mm2,mm6			; mm2=tmp10-tmp12
+	pmulhw	mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
+	pmulhw	mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+	pmulhw	mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+	paddw	mm1,mm2			; mm1=z2
+	paddw	mm6,mm2			; mm6=z4
+
+	movq	mm5,mm0
+	psubw	mm0,mm3			; mm0=z13
+	paddw	mm5,mm3			; mm5=z11
+
+	movq	mm7,mm0
+	movq	mm4,mm5
+	psubw	mm0,mm1			; mm0=data3
+	psubw	mm5,mm6			; mm5=data7
+	paddw	mm7,mm1			; mm7=data5
+	paddw	mm4,mm6			; mm4=data1
+
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+	add	edx, byte 4*SIZEOF_DCTELEM	; advance pointer by four columns
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_MMX_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jfmmxint.asm b/jfmmxint.asm
new file mode 100644
index 0000000..afe47fd
--- /dev/null
+++ b/jfmmxint.asm
@@ -0,0 +1,629 @@
+;
+; jfmmxint.asm - accurate integer FDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JFDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fraction bits of the F_x_xxx constants below
+%define PASS1_BITS	2	; extra fraction bits kept in pass-1 output
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; pass-1 right shift after multiply
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)	; pass-2 right shift after multiply
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n, rounded to nearest
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+; Word pairs below are pmaddwd multipliers for interleaved (a,b) input words.
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541	; (tmp13,tmp12) -> data2
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)	; (tmp13,tmp12) -> data6
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175	; (z3,z4) -> z3
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)	; (z3,z4) -> z4
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899	; (tmp4,tmp7) -> tmp4
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)	; (tmp4,tmp7) -> tmp7
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562	; (tmp5,tmp6) -> tmp5
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)	; (tmp5,tmp6) -> tmp6
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)	; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)	; rounding bias added before psrad DESCALE_P2
+PW_DESCALE_P2X	times 4 dw  1 << (PASS1_BITS-1)	; rounding bias added before psraw PASS1_BITS
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_islow_mmx (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data (above saved ebp + return address)
+
+%define original_ebp	ebp+0	; [ebp+0] = frame base saved by the prologue
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2	; number of mmword scratch slots
+
+	align	16
+	global	EXTN(jpeg_fdct_islow_mmx)
+
+EXTN(jpeg_fdct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base before alignment)
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [aligned ebp] = original ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below aligned ebp
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+	mov	ecx, DCTSIZE/4			; each iteration handles 4 rows
+	alignx	16,7
+.rowloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(20 21 22 23), mm2=(24 25 26 27)
+	; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(24 34 25 35)
+	punpckhwd mm5,mm3		; mm5=(26 36 27 37)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 01 02 03), mm1=(04 05 06 07)
+	; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 32 23 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(24 34 25 35)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm4,mm7		; mm4=(02 12 03 13)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(04 14 05 15)
+	punpckhwd mm2,mm3		; mm2=(06 16 07 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 10 20 30)=data0
+	punpckhdq mm7,mm0		; mm7=(01 11 21 31)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(06 16 26 36)=data6
+	punpckhdq mm3,mm5		; mm3=(07 17 27 37)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 32 23 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(24 34 25 35)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(02 12 22 32)=data2
+	punpckhdq mm7,mm2		; mm7=(03 13 23 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(04 14 24 34)=data4
+	punpckhdq mm6,mm3		; mm6=(05 15 25 35)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	psllw	mm5,PASS1_BITS		; mm5=data0
+	psllw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5	; column-ordered vector stored in a row slot (transposed)
+	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]	; add rounding bias, then >> DESCALE_P1
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0		; interleave (z3,z4) words for pmaddwd
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2		; interleave (tmp4,tmp7) words for pmaddwd
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm4,DESCALE_P1
+	psrad	mm1,DESCALE_P1
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm2,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3		; interleave (tmp5,tmp6) words for pmaddwd
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm1,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	mm3,DESCALE_P1
+	psrad	mm5,DESCALE_P1
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*DCTSIZE*SIZEOF_DCTELEM	; advance to the next 4 rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *) reloaded via eax (frame base from prologue)
+	mov	ecx, DCTSIZE/4			; each iteration handles 4 columns
+	alignx	16,7
+.columnloop:
+
+	movq	mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; mm0=(02 12 22 32), mm2=(42 52 62 72)
+	; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+	movq      mm4,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm1		; mm0=(02 03 12 13)
+	punpckhwd mm4,mm1		; mm4=(22 23 32 33)
+	movq      mm5,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm3		; mm2=(42 43 52 53)
+	punpckhwd mm5,mm3		; mm5=(62 63 72 73)
+
+	movq	mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movq	mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movq	mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movq	mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+	; mm6=(00 10 20 30), mm1=(40 50 60 70)
+	; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=(22 23 32 33)
+	movq	MMWORD [wk(1)], mm2	; wk(1)=(42 43 52 53)
+
+	movq      mm4,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 01 10 11)
+	punpckhwd mm4,mm7		; mm4=(20 21 30 31)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm3		; mm1=(40 41 50 51)
+	punpckhwd mm2,mm3		; mm2=(60 61 70 71)
+
+	movq      mm7,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03)=data0
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13)=data1
+	movq      mm3,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm5		; mm2=(60 61 62 63)=data6
+	punpckhdq mm3,mm5		; mm3=(70 71 72 73)=data7
+
+	movq	mm0,mm7
+	movq	mm5,mm6
+	psubw	mm7,mm2			; mm7=data1-data6=tmp6
+	psubw	mm6,mm3			; mm6=data0-data7=tmp7
+	paddw	mm0,mm2			; mm0=data1+data6=tmp1
+	paddw	mm5,mm3			; mm5=data0+data7=tmp0
+
+	movq	mm2, MMWORD [wk(0)]	; mm2=(22 23 32 33)
+	movq	mm3, MMWORD [wk(1)]	; mm3=(42 43 52 53)
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp6
+	movq	MMWORD [wk(1)], mm6	; wk(1)=tmp7
+
+	movq      mm7,mm4		; transpose coefficients(phase 2)
+	punpckldq mm4,mm2		; mm4=(20 21 22 23)=data2
+	punpckhdq mm7,mm2		; mm7=(30 31 32 33)=data3
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm3		; mm1=(40 41 42 43)=data4
+	punpckhdq mm6,mm3		; mm6=(50 51 52 53)=data5
+
+	movq	mm2,mm7
+	movq	mm3,mm4
+	paddw	mm7,mm1			; mm7=data3+data4=tmp3
+	paddw	mm4,mm6			; mm4=data2+data5=tmp2
+	psubw	mm2,mm1			; mm2=data3-data4=tmp4
+	psubw	mm3,mm6			; mm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movq	mm1,mm5
+	movq	mm6,mm0
+	paddw	mm5,mm7			; mm5=tmp10
+	paddw	mm0,mm4			; mm0=tmp11
+	psubw	mm1,mm7			; mm1=tmp13
+	psubw	mm6,mm4			; mm6=tmp12
+
+	movq	mm7,mm5
+	paddw	mm5,mm0			; mm5=tmp10+tmp11
+	psubw	mm7,mm0			; mm7=tmp10-tmp11
+
+	paddw	mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]	; round before >> PASS1_BITS
+	paddw	mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	mm5,PASS1_BITS		; mm5=data0
+	psraw	mm7,PASS1_BITS		; mm7=data4
+
+	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movq      mm4,mm1		; mm1=tmp13
+	movq      mm0,mm1
+	punpcklwd mm4,mm6		; mm6=tmp12
+	punpckhwd mm0,mm6
+	movq      mm1,mm4
+	movq      mm6,mm0
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=data2L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]	; mm0=data2H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=data6L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]	; mm6=data6H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]	; add rounding bias, then >> DESCALE_P2
+	paddd	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm6,DESCALE_P2
+
+	packssdw  mm4,mm0		; mm4=data2
+	packssdw  mm1,mm6		; mm1=data6
+
+	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+	; -- Odd part
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp6
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp7
+
+	movq	mm0,mm2			; mm2=tmp4
+	movq	mm6,mm3			; mm3=tmp5
+	paddw	mm0,mm5			; mm0=z3
+	paddw	mm6,mm7			; mm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm4,mm0		; interleave (z3,z4) words for pmaddwd
+	movq      mm1,mm0
+	punpcklwd mm4,mm6
+	punpckhwd mm1,mm6
+	movq      mm0,mm4
+	movq      mm6,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]	; mm4=z3L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]	; mm1=z3H
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]	; mm0=z4L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]	; mm6=z4H
+
+	movq	MMWORD [wk(0)], mm4	; wk(0)=z3L
+	movq	MMWORD [wk(1)], mm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movq      mm4,mm2		; interleave (tmp4,tmp7) words for pmaddwd
+	movq      mm1,mm2
+	punpcklwd mm4,mm7
+	punpckhwd mm1,mm7
+	movq      mm2,mm4
+	movq      mm7,mm1
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]	; mm4=tmp4L
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]	; mm1=tmp4H
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]	; mm2=tmp7L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]	; mm7=tmp7H
+
+	paddd	mm4, MMWORD [wk(0)]	; mm4=data7L
+	paddd	mm1, MMWORD [wk(1)]	; mm1=data7H
+	paddd	mm2,mm0			; mm2=data1L
+	paddd	mm7,mm6			; mm7=data1H
+
+	paddd	mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm4,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm2,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+
+	packssdw  mm4,mm1		; mm4=data7
+	packssdw  mm2,mm7		; mm2=data1
+
+	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+	movq      mm1,mm3		; interleave (tmp5,tmp6) words for pmaddwd
+	movq      mm7,mm3
+	punpcklwd mm1,mm5
+	punpckhwd mm7,mm5
+	movq      mm3,mm1
+	movq      mm5,mm7
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]	; mm1=tmp5L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]	; mm7=tmp5H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]	; mm3=tmp6L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]	; mm5=tmp6H
+
+	paddd	mm1,mm0			; mm1=data5L
+	paddd	mm7,mm6			; mm7=data5H
+	paddd	mm3, MMWORD [wk(0)]	; mm3=data3L
+	paddd	mm5, MMWORD [wk(1)]	; mm5=data3H
+
+	paddd	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm1,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	mm3,DESCALE_P2
+	psrad	mm5,DESCALE_P2
+
+	packssdw  mm1,mm7		; mm1=data5
+	packssdw  mm3,mm5		; mm3=data3
+
+	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+	add	edx, byte 4*SIZEOF_DCTELEM	; advance to the next 4 columns
+	dec	ecx
+	jnz	near .columnloop
+
+	emms		; empty MMX state
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_MMX_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jfss2fst.asm b/jfss2fst.asm
new file mode 100644
index 0000000..567bcef
--- /dev/null
+++ b/jfss2fst.asm
@@ -0,0 +1,411 @@
+;
+; jfss2fst.asm - fast integer FDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fraction bits of the F_x_xxx constants; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382	equ	 98		; FIX(0.382683433)
+F_0_541	equ	139		; FIX(0.541196100)
+F_0_707	equ	181		; FIX(0.707106781)
+F_1_306	equ	334		; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n, rounded to nearest
+F_0_382	equ	DESCALE( 410903207,30-CONST_BITS)	; FIX(0.382683433)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_707	equ	DESCALE( 759250124,30-CONST_BITS)	; FIX(0.707106781)
+F_1_306	equ	DESCALE(1402911301,30-CONST_BITS)	; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2	; left shift applied to operands before pmulhw
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)	; left shift applied to the constants
+
+	alignz	16
+	global	EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+; One pre-shifted pmulhw multiplier per word lane (8 lanes per xmmword).
+PW_F0707	times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382	times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541	times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306	times 8 dw  F_1_306 << CONST_SHIFT
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data (above saved ebp + return address)
+
+%define original_ebp	ebp+0	; [ebp+0] = frame base saved by the prologue
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2	; number of xmmword scratch slots
+
+	align	16
+	global	EXTN(jpeg_fdct_ifast_sse2)
+
+EXTN(jpeg_fdct_ifast_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base before alignment)
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [aligned ebp] = original ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below aligned ebp
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *) accessed with movdqa, so must be 16-byte aligned
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(0)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	psubw	xmm3,xmm1		; xmm3=tmp13
+	psubw	xmm6,xmm7		; xmm6=tmp12
+	paddw	xmm4,xmm1		; xmm4=tmp10
+	paddw	xmm0,xmm7		; xmm0=tmp11
+
+	paddw	xmm6,xmm3		; xmm6=tmp12+tmp13
+	psllw	xmm6,PRE_MULTIPLY_SCALE_BITS	; pre-shift operand for pmulhw
+	pmulhw	xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
+
+	movdqa	xmm1,xmm4
+	movdqa	xmm7,xmm3
+	psubw	xmm4,xmm0		; xmm4=data4
+	psubw	xmm3,xmm6		; xmm3=data6
+	paddw	xmm1,xmm0		; xmm1=data0
+	paddw	xmm7,xmm6		; xmm7=data2
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=tmp6
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp7
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=data4
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=data6
+
+	; -- Odd part
+
+	paddw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm5,xmm0		; xmm5=tmp11
+	paddw	xmm0,xmm6		; xmm0=tmp12, xmm6=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
+
+	movdqa	xmm4,xmm2		; xmm4=tmp10
+	psubw	xmm2,xmm0		; xmm2=tmp10-tmp12
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm2		; xmm4=z2
+	paddw	xmm0,xmm2		; xmm0=z4
+
+	movdqa	xmm3,xmm6
+	psubw	xmm6,xmm5		; xmm6=z13
+	paddw	xmm3,xmm5		; xmm3=z11
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm5,xmm3
+	psubw	xmm6,xmm4		; xmm6=data3
+	psubw	xmm3,xmm0		; xmm3=data7
+	paddw	xmm2,xmm4		; xmm2=data5
+	paddw	xmm5,xmm0		; xmm5=data1
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+	; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm4,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm4,xmm5		; xmm4=(40 41 50 51 60 61 70 71)
+	movdqa    xmm0,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm6		; xmm7=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm0,xmm6		; xmm0=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=col4
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=col6
+
+	; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+	; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm7,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm2		; xmm5=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm7,xmm2		; xmm7=(44 45 54 55 64 65 74 75)
+	movdqa    xmm0,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm3		; xmm6=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm0,xmm3		; xmm0=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm2,xmm5		; transpose coefficients(phase 2)
+	punpckldq xmm5,xmm6		; xmm5=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm2,xmm6		; xmm2=(24 25 26 27 34 35 36 37)
+	movdqa    xmm3,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm0		; xmm7=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm3,xmm0		; xmm3=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=(02 03 12 13 22 23 32 33)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm6		; xmm1=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm2,xmm6		; xmm2=(20 21 22 23 30 31 32 33)
+	movdqa    xmm7,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm7,xmm0		; xmm7=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm5		; xmm1=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm6,xmm5		; xmm6=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm0,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm3		; xmm7=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm0,xmm3		; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm3,xmm1
+	psubw	xmm6,xmm7		; xmm6=data1-data6=tmp6
+	psubw	xmm1,xmm0		; xmm1=data0-data7=tmp7
+	paddw	xmm5,xmm7		; xmm5=data1+data6=tmp1
+	paddw	xmm3,xmm0		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(24 25 26 27 34 35 36 37)
+	movdqa	xmm0, XMMWORD [wk(1)]	; xmm0=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm6	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=tmp7
+
+	movdqa     xmm6,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm7		; xmm2=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm6,xmm7		; xmm6=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm1,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm1,xmm0		; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm7,xmm6
+	movdqa	xmm0,xmm2
+	paddw	xmm6,xmm4		; xmm6=data3+data4=tmp3
+	paddw	xmm2,xmm1		; xmm2=data2+data5=tmp2
+	psubw	xmm7,xmm4		; xmm7=data3-data4=tmp4
+	psubw	xmm0,xmm1		; xmm0=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm1,xmm5
+	psubw	xmm3,xmm6		; xmm3=tmp13
+	psubw	xmm5,xmm2		; xmm5=tmp12
+	paddw	xmm4,xmm6		; xmm4=tmp10
+	paddw	xmm1,xmm2		; xmm1=tmp11
+
+	paddw	xmm5,xmm3		; xmm5=tmp12+tmp13
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-shift operand for pmulhw
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm2,xmm3
+	psubw	xmm4,xmm1		; xmm4=data4
+	psubw	xmm3,xmm5		; xmm3=data6
+	paddw	xmm6,xmm1		; xmm6=data0
+	paddw	xmm2,xmm5		; xmm2=data2
+
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+	; -- Odd part
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	paddw	xmm7,xmm0		; xmm7=tmp10
+	paddw	xmm0,xmm1		; xmm0=tmp11
+	paddw	xmm1,xmm5		; xmm1=tmp12, xmm5=tmp7
+
+	psllw	xmm7,PRE_MULTIPLY_SCALE_BITS
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS
+
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
+
+	movdqa	xmm4,xmm7		; xmm4=tmp10
+	psubw	xmm7,xmm1		; xmm7=tmp10-tmp12
+	pmulhw	xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+	paddw	xmm4,xmm7		; xmm4=z2
+	paddw	xmm1,xmm7		; xmm1=z4
+
+	movdqa	xmm3,xmm5
+	psubw	xmm5,xmm0		; xmm5=z13
+	paddw	xmm3,xmm0		; xmm3=z11
+
+	movdqa	xmm6,xmm5
+	movdqa	xmm2,xmm3
+	psubw	xmm5,xmm4		; xmm5=data3
+	psubw	xmm3,xmm1		; xmm3=data7
+	paddw	xmm6,xmm4		; xmm6=data5
+	paddw	xmm2,xmm1		; xmm2=data1
+
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JFDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jfss2int.asm b/jfss2int.asm
new file mode 100644
index 0000000..106b42c
--- /dev/null
+++ b/jfss2int.asm
@@ -0,0 +1,641 @@
+;
+; jfss2int.asm - accurate integer FDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JFDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fraction bits of the F_x_xxx constants
+%define PASS1_BITS	2		; extra scale left in the pass-1 output
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)	; shift after pass-1 multiplies
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS)	; shift after pass-2 multiplies
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time FP arithmetic; the literals below are x * 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; rounding right shift
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):		; word pairs below feed pmaddwd
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541	; (tmp13,tmp12) -> data2
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)	; (tmp13,tmp12) -> data6
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175	; (z3,z4) -> z3
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)	; (z3,z4) -> z4
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899	; (tmp4,tmp7) -> tmp4
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)	; (tmp4,tmp7) -> tmp7
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562	; (tmp5,tmp6) -> tmp5
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)	; (tmp5,tmp6) -> tmp6
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding bias before psrad DESCALE_P1
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; rounding bias before psrad DESCALE_P2
+PW_DESCALE_P2X	times 8 dw  1 << (PASS1_BITS-1)	; rounding bias before psraw PASS1_BITS
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_islow_sse2 (DCTELEM * data)
+;
+
+%define data(b)		(b)+8		; DCTELEM * data
+
+%define original_ebp	ebp+0		; slot holding the pre-alignment esp
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		6		; wk(0)..wk(5) are used by the routine
+
+	align	16
+	global	EXTN(jpeg_fdct_islow_sse2)
+
+EXTN(jpeg_fdct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax -> saved original ebp
+	sub	esp, byte 4			; make room for the eax slot
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; keep pointer to saved ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; esp = &wk[0] (WK_NUM xmmwords)
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+	; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm1		; xmm0=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm4,xmm1		; xmm4=(04 14 05 15 06 16 07 17)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm3		; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm5,xmm3		; xmm5=(24 34 25 35 26 36 27 37)
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+	; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+	; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=(20 30 21 31 22 32 23 33)
+	movdqa	XMMWORD [wk(1)], xmm5	; wk(1)=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm2,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm7		; xmm2=(44 54 45 55 46 56 47 57)
+	movdqa    xmm5,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm3		; xmm1=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm5,xmm3		; xmm5=(64 74 65 75 66 76 67 77)
+
+	movdqa    xmm7,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm1		; xmm6=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm7,xmm1		; xmm7=(42 52 62 72 43 53 63 73)
+	movdqa    xmm3,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm3,xmm5		; xmm3=(46 56 66 76 47 57 67 77)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(20 30 21 31 22 32 23 33)
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=(24 34 25 35 26 36 27 37)
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=(42 52 62 72 43 53 63 73)
+	movdqa	XMMWORD [wk(3)], xmm2	; wk(3)=(44 54 64 74 45 55 65 75)
+
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm1		; xmm0=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm7,xmm1		; xmm7=(02 12 22 32 03 13 23 33)
+	movdqa    xmm2,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm5		; xmm4=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm2,xmm5		; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa     xmm1,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=(00 10 20 30 40 50 60 70)=data0
+	punpckhqdq xmm1,xmm6		; xmm1=(01 11 21 31 41 51 61 71)=data1
+	movdqa     xmm5,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm3		; xmm2=(06 16 26 36 46 56 66 76)=data6
+	punpckhqdq xmm5,xmm3		; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+	movdqa	xmm6,xmm1
+	movdqa	xmm3,xmm0
+	psubw	xmm1,xmm2		; xmm1=data1-data6=tmp6
+	psubw	xmm0,xmm5		; xmm0=data0-data7=tmp7
+	paddw	xmm6,xmm2		; xmm6=data1+data6=tmp1
+	paddw	xmm3,xmm5		; xmm3=data0+data7=tmp0
+
+	movdqa	xmm2, XMMWORD [wk(2)]	; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa	xmm5, XMMWORD [wk(3)]	; xmm5=(44 54 64 74 45 55 65 75)
+	movdqa	XMMWORD [wk(0)], xmm1	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp7
+
+	movdqa     xmm1,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm2		; xmm7=(02 12 22 32 42 52 62 72)=data2
+	punpckhqdq xmm1,xmm2		; xmm1=(03 13 23 33 43 53 63 73)=data3
+	movdqa     xmm0,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm5		; xmm4=(04 14 24 34 44 54 64 74)=data4
+	punpckhqdq xmm0,xmm5		; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+	movdqa	xmm2,xmm1
+	movdqa	xmm5,xmm7
+	paddw	xmm1,xmm4		; xmm1=data3+data4=tmp3
+	paddw	xmm7,xmm0		; xmm7=data2+data5=tmp2
+	psubw	xmm2,xmm4		; xmm2=data3-data4=tmp4
+	psubw	xmm5,xmm0		; xmm5=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm0,xmm6
+	paddw	xmm3,xmm1		; xmm3=tmp10
+	paddw	xmm6,xmm7		; xmm6=tmp11
+	psubw	xmm4,xmm1		; xmm4=tmp13
+	psubw	xmm0,xmm7		; xmm0=tmp12
+
+	movdqa	xmm1,xmm3
+	paddw	xmm3,xmm6		; xmm3=tmp10+tmp11
+	psubw	xmm1,xmm6		; xmm1=tmp10-tmp11
+
+	psllw	xmm3,PASS1_BITS		; xmm3=data0
+	psllw	xmm1,PASS1_BITS		; xmm1=data4
+
+	movdqa	XMMWORD [wk(2)], xmm3	; wk(2)=data0
+	movdqa	XMMWORD [wk(3)], xmm1	; wk(3)=data4
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm7,xmm4		; xmm4=tmp13
+	movdqa    xmm6,xmm4
+	punpcklwd xmm7,xmm0		; xmm0=tmp12
+	punpckhwd xmm6,xmm0
+	movdqa    xmm4,xmm7
+	movdqa    xmm0,xmm6
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]	; xmm7=data2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=data2H
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]	; xmm4=data6L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]	; xmm0=data6H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm7,xmm6		; xmm7=data2
+	packssdw  xmm4,xmm0		; xmm4=data6
+
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=data2
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=data6
+
+	; -- Odd part
+
+	movdqa	xmm3, XMMWORD [wk(0)]	; xmm3=tmp6
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp7
+
+	movdqa	xmm6,xmm2		; xmm2=tmp4
+	movdqa	xmm0,xmm5		; xmm5=tmp5
+	paddw	xmm6,xmm3		; xmm6=z3
+	paddw	xmm0,xmm1		; xmm0=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm7,xmm6
+	movdqa    xmm4,xmm6
+	punpcklwd xmm7,xmm0
+	punpckhwd xmm4,xmm0
+	movdqa    xmm6,xmm7
+	movdqa    xmm0,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3H
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]	; xmm0=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm7,xmm2
+	movdqa    xmm4,xmm2
+	punpcklwd xmm7,xmm1
+	punpckhwd xmm4,xmm1
+	movdqa    xmm2,xmm7
+	movdqa    xmm1,xmm4
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]	; xmm2=tmp7L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp7H
+
+	paddd	xmm7, XMMWORD [wk(0)]	; xmm7=data7L
+	paddd	xmm4, XMMWORD [wk(1)]	; xmm4=data7H
+	paddd	xmm2,xmm6		; xmm2=data1L
+	paddd	xmm1,xmm0		; xmm1=data1H
+
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+
+	packssdw  xmm7,xmm4		; xmm7=data7
+	packssdw  xmm2,xmm1		; xmm2=data1
+
+	movdqa    xmm4,xmm5
+	movdqa    xmm1,xmm5
+	punpcklwd xmm4,xmm3
+	punpckhwd xmm1,xmm3
+	movdqa    xmm5,xmm4
+	movdqa    xmm3,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm4=tmp5L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]	; xmm5=tmp6L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6H
+
+	paddd	xmm4,xmm6		; xmm4=data5L
+	paddd	xmm1,xmm0		; xmm1=data5H
+	paddd	xmm5, XMMWORD [wk(0)]	; xmm5=data3L
+	paddd	xmm3, XMMWORD [wk(1)]	; xmm3=data3H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm1,DESCALE_P1
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+
+	packssdw  xmm4,xmm1		; xmm4=data5
+	packssdw  xmm5,xmm3		; xmm5=data3
+
+	; ---- Pass 2: process columns.
+
+;	mov	edx, POINTER [data(eax)]	; (DCTELEM *)
+
+	movdqa	xmm6, XMMWORD [wk(2)]	; xmm6=col0
+	movdqa	xmm0, XMMWORD [wk(4)]	; xmm0=col2
+
+	; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+	; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+	movdqa    xmm1,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm2		; xmm6=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm1,xmm2		; xmm1=(40 41 50 51 60 61 70 71)
+	movdqa    xmm3,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(02 03 12 13 22 23 32 33)
+	punpckhwd xmm3,xmm5		; xmm3=(42 43 52 53 62 63 72 73)
+
+	movdqa	xmm2, XMMWORD [wk(3)]	; xmm2=col4
+	movdqa	xmm5, XMMWORD [wk(5)]	; xmm5=col6
+
+	; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+	; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=(02 03 12 13 22 23 32 33)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(42 43 52 53 62 63 72 73)
+
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm4		; xmm2=(04 05 14 15 24 25 34 35)
+	punpckhwd xmm0,xmm4		; xmm0=(44 45 54 55 64 65 74 75)
+	movdqa    xmm3,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm7		; xmm5=(06 07 16 17 26 27 36 37)
+	punpckhwd xmm3,xmm7		; xmm3=(46 47 56 57 66 67 76 77)
+
+	movdqa    xmm4,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm5		; xmm2=(04 05 06 07 14 15 16 17)
+	punpckhdq xmm4,xmm5		; xmm4=(24 25 26 27 34 35 36 37)
+	movdqa    xmm7,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(44 45 46 47 54 55 56 57)
+	punpckhdq xmm7,xmm3		; xmm7=(64 65 66 67 74 75 76 77)
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=(02 03 12 13 22 23 32 33)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53 62 63 72 73)
+	movdqa	XMMWORD [wk(2)], xmm4	; wk(2)=(24 25 26 27 34 35 36 37)
+	movdqa	XMMWORD [wk(3)], xmm0	; wk(3)=(44 45 46 47 54 55 56 57)
+
+	movdqa    xmm4,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm5		; xmm6=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm4,xmm5		; xmm4=(20 21 22 23 30 31 32 33)
+	movdqa    xmm0,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm3		; xmm1=(40 41 42 43 50 51 52 53)
+	punpckhdq xmm0,xmm3		; xmm0=(60 61 62 63 70 71 72 73)
+
+	movdqa     xmm5,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm2		; xmm6=(00 01 02 03 04 05 06 07)=data0
+	punpckhqdq xmm5,xmm2		; xmm5=(10 11 12 13 14 15 16 17)=data1
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm7		; xmm0=(60 61 62 63 64 65 66 67)=data6
+	punpckhqdq xmm3,xmm7		; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm7,xmm6
+	psubw	xmm5,xmm0		; xmm5=data1-data6=tmp6
+	psubw	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	paddw	xmm2,xmm0		; xmm2=data1+data6=tmp1
+	paddw	xmm7,xmm3		; xmm7=data0+data7=tmp0
+
+	movdqa	xmm0, XMMWORD [wk(2)]	; xmm0=(24 25 26 27 34 35 36 37)
+	movdqa	xmm3, XMMWORD [wk(3)]	; xmm3=(44 45 46 47 54 55 56 57)
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=tmp6
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movdqa     xmm5,xmm4		; transpose coefficients(phase 3)
+	punpcklqdq xmm4,xmm0		; xmm4=(20 21 22 23 24 25 26 27)=data2
+	punpckhqdq xmm5,xmm0		; xmm5=(30 31 32 33 34 35 36 37)=data3
+	movdqa     xmm6,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm3		; xmm1=(40 41 42 43 44 45 46 47)=data4
+	punpckhqdq xmm6,xmm3		; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm3,xmm4
+	paddw	xmm5,xmm1		; xmm5=data3+data4=tmp3
+	paddw	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	psubw	xmm0,xmm1		; xmm0=data3-data4=tmp4
+	psubw	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movdqa	xmm1,xmm7
+	movdqa	xmm6,xmm2
+	paddw	xmm7,xmm5		; xmm7=tmp10
+	paddw	xmm2,xmm4		; xmm2=tmp11
+	psubw	xmm1,xmm5		; xmm1=tmp13
+	psubw	xmm6,xmm4		; xmm6=tmp12
+
+	movdqa	xmm5,xmm7
+	paddw	xmm7,xmm2		; xmm7=tmp10+tmp11
+	psubw	xmm5,xmm2		; xmm5=tmp10-tmp11
+
+	paddw	xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	paddw	xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
+	psraw	xmm7,PASS1_BITS		; xmm7=data0
+	psraw	xmm5,PASS1_BITS		; xmm5=data4
+
+	movdqa	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+	movdqa	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+	; (Original)
+	; z1 = (tmp12 + tmp13) * 0.541196100;
+	; data2 = z1 + tmp13 * 0.765366865;
+	; data6 = z1 + tmp12 * -1.847759065;
+	;
+	; (This implementation)
+	; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+	; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+	movdqa    xmm4,xmm1		; xmm1=tmp13
+	movdqa    xmm2,xmm1
+	punpcklwd xmm4,xmm6		; xmm6=tmp12
+	punpckhwd xmm2,xmm6
+	movdqa    xmm1,xmm4
+	movdqa    xmm6,xmm2
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=data2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]	; xmm2=data2H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=data6L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]	; xmm6=data6H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm6,DESCALE_P2
+
+	packssdw  xmm4,xmm2		; xmm4=data2
+	packssdw  xmm1,xmm6		; xmm1=data6
+
+	movdqa	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+	; -- Odd part
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp6
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp7
+
+	movdqa	xmm2,xmm0		; xmm0=tmp4
+	movdqa	xmm6,xmm3		; xmm3=tmp5
+	paddw	xmm2,xmm7		; xmm2=z3
+	paddw	xmm6,xmm5		; xmm6=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm4,xmm2
+	movdqa    xmm1,xmm2
+	punpcklwd xmm4,xmm6
+	punpckhwd xmm1,xmm6
+	movdqa    xmm2,xmm4
+	movdqa    xmm6,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]	; xmm4=z3L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]	; xmm1=z3H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]	; xmm2=z4L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]	; xmm6=z4H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=z3L
+	movdqa	XMMWORD [wk(1)], xmm1	; wk(1)=z3H
+
+	; (Original)
+	; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+	; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+	; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+	; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+	;
+	; (This implementation)
+	; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+	; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+	; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+	; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+	; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+	; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm1,xmm0
+	punpcklwd xmm4,xmm5
+	punpckhwd xmm1,xmm5
+	movdqa    xmm0,xmm4
+	movdqa    xmm5,xmm1
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm4=tmp4L
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm1=tmp4H
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]	; xmm0=tmp7L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]	; xmm5=tmp7H
+
+	paddd	xmm4, XMMWORD [wk(0)]	; xmm4=data7L
+	paddd	xmm1, XMMWORD [wk(1)]	; xmm1=data7H
+	paddd	xmm0,xmm2		; xmm0=data1L
+	paddd	xmm5,xmm6		; xmm5=data1H
+
+	paddd	xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+
+	packssdw  xmm4,xmm1		; xmm4=data7
+	packssdw  xmm0,xmm5		; xmm0=data1
+
+	movdqa	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+	movdqa	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+	movdqa    xmm1,xmm3
+	movdqa    xmm5,xmm3
+	punpcklwd xmm1,xmm7
+	punpckhwd xmm5,xmm7
+	movdqa    xmm3,xmm1
+	movdqa    xmm7,xmm5
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm1=tmp5L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm5=tmp5H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]	; xmm3=tmp6L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]	; xmm7=tmp6H
+
+	paddd	xmm1,xmm2		; xmm1=data5L
+	paddd	xmm5,xmm6		; xmm5=data5H
+	paddd	xmm3, XMMWORD [wk(0)]	; xmm3=data3L
+	paddd	xmm7, XMMWORD [wk(1)]	; xmm7=data3H
+
+	paddd	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm1,DESCALE_P2
+	psrad	xmm5,DESCALE_P2
+	paddd	xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
+	paddd	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm1,xmm5		; xmm1=data5
+	packssdw  xmm3,xmm7		; xmm3=data3
+
+	movdqa	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+	movdqa	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- pointer to saved original ebp
+	pop	ebp		; restore the caller's ebp
+	ret
+
+%endif ; JFDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jfsseflt.asm b/jfsseflt.asm
new file mode 100644
index 0000000..98b0973
--- /dev/null
+++ b/jfsseflt.asm
@@ -0,0 +1,383 @@
+;
+; jfsseflt.asm - floating-point FDCT (SSE)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+%define JFDCT_FLT_SSE_SUPPORTED
+%endif
+%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+%define JFDCT_FLT_SSE_SUPPORTED
+%endif
+%ifdef JFDCT_FLT_SSE_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; elements 0,1 of %1 then elements 0,1 of %2
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; elements 2,3 of %1 then elements 2,3 of %2
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):		; each constant is 4 packed singles
+
+PD_0_382	times 4 dd  0.382683432365089771728460	; z5 = (tmp10-tmp12) * this
+PD_0_707	times 4 dd  0.707106781186547524400844	; multiplier for z1 and z3
+PD_0_541	times 4 dd  0.541196100146196984399723	; tmp10 term of z2
+PD_1_306	times 4 dd  1.306562964876376527856643	; tmp12 term of z4
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jpeg_fdct_float_sse (FAST_FLOAT * data)
+;
+
+%define data(b)		(b)+8		; FAST_FLOAT * data
+
+%define original_ebp	ebp+0		; slot holding the pre-alignment esp
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2		; wk(0) and wk(1) are used by the routine
+
+	align	16
+	global	EXTN(jpeg_fdct_float_sse)
+
+EXTN(jpeg_fdct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax -> saved original ebp
+	sub	esp, byte 4			; make room for the eax slot
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; keep pointer to saved ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; esp = &wk[0] (WK_NUM xmmwords)
+	pushpic	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process rows.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4			; each iteration handles 4 rows
+	alignx	16,7
+.rowloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT	; advance 4 rows
+	dec	ecx
+	jnz	near .rowloop
+
+	; ---- Pass 2: process columns.
+
+	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+	mov	ecx, DCTSIZE/4			; each iteration handles 4 columns
+	alignx	16,7
+.columnloop:
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+
+	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+
+	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+
+	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+
+	movaps	xmm0,xmm7
+	movaps	xmm5,xmm6
+	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+
+	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+
+	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+
+	movaps	xmm2,xmm7
+	movaps	xmm3,xmm4
+	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+
+	; -- Even part
+
+	movaps	xmm1,xmm5
+	movaps	xmm6,xmm0
+	subps	xmm5,xmm7		; xmm5=tmp13
+	subps	xmm0,xmm4		; xmm0=tmp12
+	addps	xmm1,xmm7		; xmm1=tmp10
+	addps	xmm6,xmm4		; xmm6=tmp11
+
+	addps	xmm0,xmm5
+	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+
+	movaps	xmm7,xmm1
+	movaps	xmm4,xmm5
+	subps	xmm1,xmm6		; xmm1=data4
+	subps	xmm5,xmm0		; xmm5=data6
+	addps	xmm7,xmm6		; xmm7=data0
+	addps	xmm4,xmm0		; xmm4=data2
+
+	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	; -- Odd part
+
+	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+
+	addps	xmm2,xmm3		; xmm2=tmp10
+	addps	xmm3,xmm6		; xmm3=tmp11
+	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+
+	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+
+	movaps	xmm1,xmm2		; xmm1=tmp10
+	subps	xmm2,xmm6
+	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+	addps	xmm1,xmm2		; xmm1=z2
+	addps	xmm6,xmm2		; xmm6=z4
+
+	movaps	xmm5,xmm0
+	subps	xmm0,xmm3		; xmm0=z13
+	addps	xmm5,xmm3		; xmm5=z11
+
+	movaps	xmm7,xmm0
+	movaps	xmm4,xmm5
+	subps	xmm0,xmm1		; xmm0=data3
+	subps	xmm5,xmm6		; xmm5=data7
+	addps	xmm7,xmm1		; xmm7=data5
+	addps	xmm4,xmm6		; xmm4=data1
+
+	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+	add	edx, byte 4*SIZEOF_FAST_FLOAT	; advance 4 columns
+	dec	ecx
+	jnz	near .columnloop
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- pointer to saved original ebp
+	pop	ebp		; restore the caller's ebp
+	ret
+
+%endif ; JFDCT_FLT_SSE_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/ji3dnflt.asm b/ji3dnflt.asm
new file mode 100644
index 0000000..9c31e99
--- /dev/null
+++ b/ji3dnflt.asm
@@ -0,0 +1,462 @@
+;
+; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):	; constants for jpeg_idct_float_3dnow, one mmword each
+
+PD_1_414	times 2 dd  1.414213562373095048801689	; 2*cos(PI*1/4)
+PD_1_847	times 2 dd  1.847759065022573512256366	; 2*cos(PI*1/8)
+PD_1_082	times 2 dd  1.082392200292393968799446	; 2*(cos(PI*1/8)-cos(PI*3/8))
+PD_2_613	times 2 dd  2.613125929752753055713286	; 2*(cos(PI*1/8)+cos(PI*3/8))
+PD_RNDINT_MAGIC	times 2 dd  100663296.0	; (float)(0x00C00000 << 3), added to round to int
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; added (paddb) to the packed output bytes
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_3dnow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                        JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot holding original ebp (b for the macros above)
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2		; number of mmword scratch slots in wk[]
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2], placed below wk[]
+
+	align	16
+	global	EXTN(jpeg_idct_float_3dnow)
+
+EXTN(jpeg_idct_float_3dnow):	; dequantize + two-pass float IDCT of one block (3DNow!/MMX)
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved original-ebp pointer
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] <- original ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below aligned ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]	; relies on eax still holding original ebp
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/2				; ctr (two columns per pass-1 iteration)
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; an AC term in rows 1-2 is nonzero
+
+	pushpic	ebx		; save GOT address
+	mov	ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	mov	eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	or	ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	or	eax,ebx
+	poppic	ebx		; restore GOT address
+	jnz	short .columnDCT	; an AC term in rows 3-7 is nonzero
+
+	; -- AC terms all zero
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]	; two 16-bit DC terms
+
+	punpcklwd mm0,mm0			; duplicate each word into a dword
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; sign-extend each word to a dword
+	pi2fd     mm0,mm0			; packed int32 -> packed float
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0			; mm0=(dc0 dc0)
+	punpckhdq mm1,mm1			; mm1=(dc1 dc1)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0	; first column: all 8 outputs = dc0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1	; second column: all 8 outputs = dc1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]	; mm0=in0 (two columns)
+	movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; mm1=in2
+	movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]	; mm2=in4
+	movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]	; mm3=in6
+
+	punpcklwd mm0,mm0			; word -> dword (sign fixed by psrad below)
+	punpcklwd mm1,mm1
+	psrad     mm0,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm0,mm0			; int32 -> float
+	pi2fd     mm1,mm1
+
+	pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize
+	pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3			; mm1=in2-in6
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]	; mm1=(in2-in6)*1.414213562
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd mm2,mm2
+	punpcklwd mm3,mm3
+	psrad     mm2,(DWORD_BIT-WORD_BIT)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm2,mm2
+	pi2fd     mm3,mm3
+
+	pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	punpcklwd mm5,mm5
+	punpcklwd mm1,mm1
+	psrad     mm5,(DWORD_BIT-WORD_BIT)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)
+	pi2fd     mm5,mm5
+	pi2fd     mm1,mm1
+
+	pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5			; mm2=z11-z13
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4			; mm0=z10+z12
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 01)
+	pfadd	mm7,mm3			; mm7=data1=(10 11)
+	pfsub	mm5,mm1			; mm5=data7=(70 71)
+	pfsub	mm0,mm3			; mm0=data6=(60 61)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq      mm1,mm6		; transpose coefficients
+	punpckldq mm6,mm7		; mm6=(00 10)
+	punpckhdq mm1,mm7		; mm1=(01 11)
+	movq      mm3,mm0		; transpose coefficients
+	punpckldq mm0,mm5		; mm0=(60 70)
+	punpckhdq mm3,mm5		; mm3=(61 71)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+	movq	MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm5, MMWORD [wk(1)]	; mm5=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm6,mm7
+	movq	mm1,mm5
+	pfadd	mm7,mm2			; mm7=data2=(20 21)
+	pfadd	mm5,mm4			; mm5=data4=(40 41)
+	pfsub	mm6,mm2			; mm6=data5=(50 51)
+	pfsub	mm1,mm4			; mm1=data3=(30 31)
+
+	movq      mm0,mm7		; transpose coefficients
+	punpckldq mm7,mm1		; mm7=(20 30)
+	punpckhdq mm0,mm1		; mm0=(21 31)
+	movq      mm3,mm5		; transpose coefficients
+	punpckldq mm5,mm6		; mm5=(40 50)
+	punpckhdq mm3,mm6		; mm3=(41 51)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+	movq	MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+	movq	MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+	add	esi, byte 2*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 2*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/2				; ctr (two rows per pass-2 iteration)
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	pfsub	mm0,mm2			; mm0=tmp11
+	pfsub	mm1,mm3			; mm1=ws2-ws6
+	pfadd	mm4,mm2			; mm4=tmp10
+	pfadd	mm5,mm3			; mm5=tmp13
+
+	pfmul	mm1,[GOTOFF(ebx,PD_1_414)]	; mm1=(ws2-ws6)*1.414213562
+	pfsub	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	pfsub	mm4,mm5			; mm4=tmp3
+	pfsub	mm0,mm1			; mm0=tmp2
+	pfadd	mm6,mm5			; mm6=tmp0
+	pfadd	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; tmp3
+	movq	MMWORD [wk(0)], mm0	; tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	pfadd	mm2,mm1			; mm2=z11
+	pfadd	mm5,mm3			; mm5=z13
+	pfsub	mm4,mm1			; mm4=z12
+	pfsub	mm0,mm3			; mm0=z10
+
+	movq	mm1,mm2
+	pfsub	mm2,mm5			; mm2=z11-z13
+	pfadd	mm1,mm5			; mm1=tmp7
+
+	pfmul	mm2,[GOTOFF(ebx,PD_1_414)]	; mm2=tmp11
+
+	movq	mm3,mm0
+	pfadd	mm0,mm4			; mm0=z10+z12
+	pfmul	mm0,[GOTOFF(ebx,PD_1_847)]	; mm0=z5
+	pfmul	mm3,[GOTOFF(ebx,PD_2_613)]	; mm3=(z10 * 2.613125930)
+	pfmul	mm4,[GOTOFF(ebx,PD_1_082)]	; mm4=(z12 * 1.082392200)
+	pfsubr	mm3,mm0			; mm3=tmp12
+	pfsub	mm4,mm0			; mm4=tmp10
+
+	; -- Final output stage
+
+	pfsub	mm3,mm1			; mm3=tmp6
+	movq	mm5,mm6
+	movq	mm0,mm7
+	pfadd	mm6,mm1			; mm6=data0=(00 10)
+	pfadd	mm7,mm3			; mm7=data1=(01 11)
+	pfsub	mm5,mm1			; mm5=data7=(07 17)
+	pfsub	mm0,mm3			; mm0=data6=(06 16)
+	pfsub	mm2,mm3			; mm2=tmp5
+
+	movq	mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm3,mm3			; mm3=all ones
+	psrld	mm3,WORD_BIT		; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm6,mm1			; mm6=roundint(data0/8)=(00 ** 10 **)
+	pfadd	mm7,mm1			; mm7=roundint(data1/8)=(01 ** 11 **)
+	pfadd	mm0,mm1			; mm0=roundint(data6/8)=(06 ** 16 **)
+	pfadd	mm5,mm1			; mm5=roundint(data7/8)=(07 ** 17 **)
+
+	pand	mm6,mm3			; mm6=(00 -- 10 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 01 -- 11)
+	pand	mm0,mm3			; mm0=(06 -- 16 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 07 -- 17)
+	por	mm6,mm7			; mm6=(00 01 10 11)
+	por	mm0,mm5			; mm0=(06 07 16 17)
+
+	movq	mm1, MMWORD [wk(0)]	; mm1=tmp2
+	movq	mm3, MMWORD [wk(1)]	; mm3=tmp3
+
+	pfadd	mm4,mm2			; mm4=tmp4
+	movq	mm7,mm1
+	movq	mm5,mm3
+	pfadd	mm1,mm2			; mm1=data2=(02 12)
+	pfadd	mm3,mm4			; mm3=data4=(04 14)
+	pfsub	mm7,mm2			; mm7=data5=(05 15)
+	pfsub	mm5,mm4			; mm5=data3=(03 13)
+
+	movq	mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; mm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	mm4,mm4			; mm4=all ones
+	psrld	mm4,WORD_BIT		; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+	pfadd	mm3,mm2			; mm3=roundint(data4/8)=(04 ** 14 **)
+	pfadd	mm7,mm2			; mm7=roundint(data5/8)=(05 ** 15 **)
+	pfadd	mm1,mm2			; mm1=roundint(data2/8)=(02 ** 12 **)
+	pfadd	mm5,mm2			; mm5=roundint(data3/8)=(03 ** 13 **)
+
+	pand	mm3,mm4			; mm3=(04 -- 14 --)
+	pslld	mm7,WORD_BIT		; mm7=(-- 05 -- 15)
+	pand	mm1,mm4			; mm1=(02 -- 12 --)
+	pslld	mm5,WORD_BIT		; mm5=(-- 03 -- 13)
+	por	mm3,mm7			; mm3=(04 05 14 15)
+	por	mm1,mm5			; mm1=(02 03 12 13)
+
+	movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm2=[PB_CENTERJSAMP]
+
+	packsswb  mm6,mm3		; mm6=(00 01 10 11 04 05 14 15)
+	packsswb  mm1,mm0		; mm1=(02 03 12 13 06 07 16 17)
+	paddb     mm6,mm2		; re-center the signed-saturated bytes
+	paddb     mm1,mm2
+
+	movq      mm4,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm1		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm4,mm1		; mm4=(04 05 06 07 14 15 16 17)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm4		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm4		; mm7=(10 11 12 13 14 15 16 17)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; edx=first output row pointer
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; ebx=second output row pointer
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6	; store 8 samples at output_col
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 2*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 2*SIZEOF_JSAMPROW	; advance to the next pair of output rows
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	femms		; empty MMX/3DNow! state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_3DNOW_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jidctflt.asm b/jidctflt.asm
new file mode 100644
index 0000000..126dc7b
--- /dev/null
+++ b/jidctflt.asm
@@ -0,0 +1,473 @@
+;
+; jidctflt.asm - floating-point IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+%define ROTATOR_TYPE	FP32	; float
+
+	alignz	16
+	global	EXTN(jconst_idct_float)
+
+EXTN(jconst_idct_float):	; FP32 multipliers, read as ROTATOR_TYPE by jpeg_idct_float
+
+F_1_414	dd	1.414213562373095048801689	; 2*cos(PI*1/4)
+F_1_847	dd	1.847759065022573512256366	; 2*cos(PI*1/8)
+F_1_082	dd	1.082392200292393968799446	; 2*(cos(PI*1/8)-cos(PI*3/8))
+F_2_613	dd	2.613125929752753055713286	; 2*(cos(PI*1/8)+cos(PI*3/8))
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define tmp		ebp-SIZEOF_FP64	; double tmp (spill slot for one x87 value)
+%define workspace	tmp-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+%define rndint_magic	workspace-SIZEOF_FP32
+					; float rndint_magic = 100663296.0F (pushed in the prologue)
+%define gotptr		rndint_magic-SIZEOF_POINTER	; void * gotptr (saved GOT address)
+
+	align	16
+	global	EXTN(jpeg_idct_float)
+
+EXTN(jpeg_idct_float):	; dequantize + two-pass float IDCT of one block (x87 FPU)
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]	; reserve tmp and workspace[] below ebp
+	push	FP32 0x4CC00000		; rndint_magic = (float)(0x00C00000 << 3)
+	pushpic	eax			; make room for the GOT address (gotptr)
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx			; get GOT address
+	movpic	POINTER [gotptr], ebx	; save GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; an AC term in rows 1-2 is nonzero
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]	; clobbers bx; GOT is reloaded at .columnDCT
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT	; an AC term in rows 3-7 is nonzero
+
+	; -- AC terms all zero
+
+	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]	; st0 = DC coefficient
+	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; dequantize
+
+	fst	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]	; replicate the DC value down the column
+	fst	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
+	fst	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]	; last store also pops the FPU stack
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	; -- Even part
+
+	fild	JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+
+	fxch	st0,st3
+
+	fmul	FLOAT_MULT_TYPE [COL(2,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st2
+	fmul	FLOAT_MULT_TYPE [COL(6,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+	fmul	FLOAT_MULT_TYPE [COL(4,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st3
+	fmul	FLOAT_MULT_TYPE [COL(0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+
+	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st3,st0
+
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+
+	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
+	fsubr	st0,st2
+	fxch	st0,st4
+	faddp	st2,st0
+
+	fsub	st0,st2
+
+	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
+	fsub	st0,st3
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	; -- Odd part
+
+	fild	JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	fild	JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+
+	fxch	st0,st3
+
+	fmul	FLOAT_MULT_TYPE [COL(1,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st2
+	fmul	FLOAT_MULT_TYPE [COL(7,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st1
+	fmul	FLOAT_MULT_TYPE [COL(3,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st6
+	fxch	st3,st0
+	fmul	FLOAT_MULT_TYPE [COL(5,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	fxch	st0,st5
+	fstp	FP64 [tmp]	; spill one value to tmp (reloaded in the output stage)
+
+	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st2,st0
+	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
+	fsubr	st0,st5
+	fxch	st0,st6
+	faddp	st5,st0
+
+	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
+	fsub	st0,st5
+	fxch	st0,st5
+	faddp	st2,st0
+
+	fld	st5
+	fadd	st0,st1
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
+	fxch	st0,st6
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
+	fxch	st0,st6
+	fsubr	st1,st0
+	fsubp	st6,st0
+
+	; -- Final output stage
+
+	fsub	st0,st1
+	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st3
+	faddp	st2,st0
+	fsub	st4,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fxch	st0,st2
+
+	fstp	FAST_FLOAT [COL(7,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(0,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(1,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(6,edi,SIZEOF_FAST_FLOAT)]
+
+	fadd	st1,st0
+	fld	FP64 [tmp]	; reload the value spilled to tmp
+	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
+	fsubr	st0,st4
+	fxch	st0,st2
+	faddp	st4,st0
+	fld	st0	; st0 = st0 + st2, st2 = st0 - st2
+	fsub	st0,st3
+	fxch	st0,st3
+	faddp	st1,st0
+
+	fxch	st0,st3
+
+	fstp	FAST_FLOAT [COL(2,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(5,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(3,edi,SIZEOF_FAST_FLOAT)]
+	fstp	FAST_FLOAT [COL(4,edi,SIZEOF_FAST_FLOAT)]
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_FLOAT_MULT_TYPE
+	add	edi, byte SIZEOF_FAST_FLOAT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	edx, POINTER [cinfo(ebp)]
+	mov	edx, POINTER [jdstruct_sample_range_limit(edx)]
+	sub	edx, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit
+
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi					; save the JSAMPROW * cursor
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST_FLOAT
+	mov	eax, FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
+	add	eax,eax			; shl eax,1 (shift out the sign bit)
+	jnz	short .rowDCT
+
+	mov	eax, FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
+	mov	ebx, FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]	; clobbers ebx; GOT is reloaded at .rowDCT
+	or	eax, FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
+	or	ebx, FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
+	or	eax, FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
+	or	ebx, FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
+	or	eax,ebx
+	add	eax,eax			; shl eax,1 (shift out the sign bit)
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	push	eax			; reserve a 4-byte slot for the rounded value
+
+	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
+	fadd	FP32 [rndint_magic]	; magic add leaves round(x/8) in the low mantissa bits
+	fstp	FP32 [esp]
+
+	pop	eax			; eax = raw bits of the stored float
+	and	eax,RANGE_MASK		; keep only the range-limit table index bits
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]	; al = range_limit[index]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al	; the whole output row gets this sample
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	movpic	ebx, POINTER [gotptr]	; load GOT address
+
+	; -- Even part
+
+	fld	FAST_FLOAT [ROW(4,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(2,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(0,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(6,esi,SIZEOF_FAST_FLOAT)]
+
+	fld	st2	; st2 = st2 + st0, st0 = st2 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st3,st0
+
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+
+	fld	st3	; st1 = st1 + st3, st3 = st1 - st3
+	fsubr	st0,st2
+	fxch	st0,st4
+	faddp	st2,st0
+
+	fsub	st0,st2
+
+	fld	st1	; st2 = st1 + st2, st1 = st1 - st2
+	fsub	st0,st3
+	fxch	st0,st2
+	faddp	st3,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	; -- Odd part
+
+	fld	FAST_FLOAT [ROW(3,esi,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st3
+	fld	FAST_FLOAT [ROW(1,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(7,esi,SIZEOF_FAST_FLOAT)]
+	fld	FAST_FLOAT [ROW(5,esi,SIZEOF_FAST_FLOAT)]
+	fxch	st0,st5
+	fstp	FP64 [tmp]	; spill one value to tmp (reloaded in the output stage)
+
+	fld	st1	; st1 = st1 + st0, st0 = st1 - st0
+	fsub	st0,st1
+	fxch	st0,st1
+	faddp	st2,st0
+	fld	st5	; st4 = st4 + st5, st5 = st4 - st5
+	fsubr	st0,st5
+	fxch	st0,st6
+	faddp	st5,st0
+
+	fld	st1	; st1 = st1 + st4, st4 = st1 - st4
+	fsub	st0,st5
+	fxch	st0,st5
+	faddp	st2,st0
+
+	fld	st5
+	fadd	st0,st1
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_414)]
+	fxch	st0,st5
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_847)]
+	fxch	st0,st6
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_2_613)]
+	fxch	st0,st1
+	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_082)]
+	fxch	st0,st6
+	fsubr	st1,st0
+	fsubp	st6,st0
+
+	; -- Final output stage
+
+	sub	esp, byte DCTSIZE*SIZEOF_FP32	; scratch for the 8 rounded results (popped below)
+
+	fsub	st0,st1
+	fld	st2	; st1 = st2 + st1, st2 = st2 - st1
+	fsub	st0,st2
+	fxch	st0,st3
+	faddp	st2,st0
+	fsub	st4,st0
+	fld	st3	; st0 = st3 + st0, st3 = st3 - st0
+	fsub	st0,st1
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fld	FP32 [rndint_magic]	; st0 = rounding magic constant
+
+	fadd	st4,st0
+	fadd	st1,st0
+	fadd	st2,st0
+	fadd	st3,st0
+
+	fxch	st0,st4
+
+	fstp	FP32 [esp+6*SIZEOF_FP32]
+	fstp	FP32 [esp+1*SIZEOF_FP32]
+	fstp	FP32 [esp+0*SIZEOF_FP32]
+	fstp	FP32 [esp+7*SIZEOF_FP32]
+
+	fxch	st0,st1
+
+	fadd	st2,st0
+	fld	FP64 [tmp]	; reload the value spilled to tmp
+	fld	st1	; st4 = st4 + st1, st1 = st4 - st1
+	fsubr	st0,st5
+	fxch	st0,st2
+	faddp	st5,st0
+	fld	st0	; st0 = st0 + st3, st3 = st0 - st3
+	fsub	st0,st4
+	fxch	st0,st4
+	faddp	st1,st0
+
+	fxch	st0,st2
+
+	fadd	st1,st0
+	fadd	st2,st0
+	fadd	st3,st0
+	faddp	st4,st0
+
+	fstp	FP32 [esp+5*SIZEOF_FP32]
+	fstp	FP32 [esp+4*SIZEOF_FP32]
+	fstp	FP32 [esp+3*SIZEOF_FP32]
+	fstp	FP32 [esp+2*SIZEOF_FP32]
+
+%assign i 0	; i=0;
+%rep 4	; -- repeat 4 times ---
+	pop	eax			; raw float bits of output sample i+0
+	pop	ebx			; raw float bits of output sample i+1
+	and	eax,RANGE_MASK		; keep only the range-limit table index bits
+	and	ebx,RANGE_MASK
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+(i+0)*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+(i+1)*SIZEOF_JSAMPLE], bl
+%assign i i+2	; i+=2;
+%endrep	; -- repeat end ---
+
+.nextrow:
+	pop	edi				; restore the JSAMPROW * cursor
+	add	esi, byte DCTSIZE*SIZEOF_FAST_FLOAT
+	add	edi, byte SIZEOF_JSAMPROW	; advance pointer to next row
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; discard locals (tmp, workspace, rndint_magic, gotptr)
+	pop	ebp
+	ret
+
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jidctfst.asm b/jidctfst.asm
new file mode 100644
index 0000000..8022ac6
--- /dev/null
+++ b/jidctfst.asm
@@ -0,0 +1,464 @@
+;
+; jidctfst.asm - fast integer IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, but less accurate, integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; descale reg32,N : arithmetic right shift of reg32 by N bits.
+; The rounding term 2^(N-1) is added first only if USE_ACCURATE_ROUNDING
+; is defined; without it the result is incorrectly rounded half the time,
+; which trades a little accuracy for a little more speed.
+%macro	descale 2
+%ifdef USE_ACCURATE_ROUNDING
+%if (%2)>7
+	add	%1, (1<<((%2)-1))	; rounding term needs imm32
+%else
+	add	%1, byte (1<<((%2)-1))	; rounding term fits in imm8
+%endif
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; fraction bits of the F_* multipliers below
+%define PASS1_BITS	2	; pass 2 descales its results by PASS1_BITS+3
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+%else
+; NASM cannot do compile-time floating-point arithmetic, so each FIX() value is given as an integer scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n, rounded to nearest
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+%endif	; CONST_BITS == 8
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 works on columns into a local workspace; pass 2 works on rows.
+; GLOBAL(void)
+; jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Stack arguments relative to frame base 'b' (ebp here); [b]=saved ebp, [b+4]=return address:
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+; Locals, laid out downward from ebp:
+%define range_limit	ebp-SIZEOF_POINTER		; JSAMPLE * range_limit
+%define ptr		range_limit-SIZEOF_POINTER	; void * ptr (scratch slot: wsptr in pass 1, outptr in pass 2)
+%define workspace	ptr-DCTSIZE2*SIZEOF_INT
+					; int workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_ifast)
+; Pass 1 runs a 1-D IDCT down each column into workspace[]; pass 2 runs it along each row into the output samples.
+EXTN(jpeg_idct_ifast):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]	; reserve the locals (workspace is the lowest one)
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; term 1 or 2 is nonzero
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT	; some term 3..7 is nonzero
+
+	; -- AC terms all zero
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(0,edx,SIZEOF_IFAST_MULT_TYPE)]	; multiply term 0 by its quantptr entry
+	cwde				; eax = sign-extended 16-bit product
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax	; same value goes to all 8 entries
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(5,edi,SIZEOF_INT)], eax
+	mov	INT [COL(6,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+
+	mov	POINTER [ptr], edi	; wsptr (edi is reused as scratch below)
+
+	; -- Even part
+	; NOTE(review): each 16-bit imul below rewrites only the low word, so the upper word keeps movsx's sign bits; this looks like it assumes every product fits in 16 bits -- confirm.
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	cx, IFAST_MULT_TYPE [COL(4,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	imul	bx, IFAST_MULT_TYPE [COL(2,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	di, IFAST_MULT_TYPE [COL(6,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	lea	edx,[eax+ecx]		; edx=tmp10
+	sub	eax,ecx			; eax=tmp11
+
+	lea	ecx,[ebx+edi]		; ecx=tmp13
+	sub	ebx,edi
+	imul	ebx,(F_1_414)		; scaled by 2^CONST_BITS until the descale
+	descale	ebx,CONST_BITS
+	sub	ebx,ecx			; ebx=tmp12
+
+	lea	edi,[edx+ecx]		; edi=tmp0
+	sub	edx,ecx			; edx=tmp3
+	lea	ecx,[eax+ebx]		; ecx=tmp1
+	sub	eax,ebx			; eax=tmp2
+
+	push	edx		; tmp3
+	push	eax		; tmp2
+	push	ecx		; tmp1
+	push	edi		; tmp0
+
+	; -- Odd part
+
+	mov	edx, POINTER [esp+16]	; quantptr (pushed before the 4 tmps)
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	ax, IFAST_MULT_TYPE [COL(1,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	bx, IFAST_MULT_TYPE [COL(7,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movsx	edi, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	di, IFAST_MULT_TYPE [COL(5,edx,SIZEOF_IFAST_MULT_TYPE)]
+	imul	cx, IFAST_MULT_TYPE [COL(3,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	lea	esi,[eax+ebx]		; esi=z11
+	sub	eax,ebx			; eax=z12
+	lea	edx,[edi+ecx]		; edx=z13
+	sub	edi,ecx			; edi=z10
+
+	lea	ebx,[esi+edx]		; ebx=tmp7
+	sub	esi,edx
+	imul	esi,(F_1_414)		; esi=tmp11
+	descale	esi,CONST_BITS
+
+	lea	ecx,[edi+eax]
+	imul	ecx,(F_1_847)		; ecx=z5
+	imul	edi,(-F_2_613)		; edi=MULTIPLY(z10,-FIX_2_613125930)
+	imul	eax,(F_1_082)		; eax=MULTIPLY(z12,FIX_1_082392200)
+	descale	ecx,CONST_BITS
+	descale	edi,CONST_BITS
+	descale	eax,CONST_BITS
+	add	edi,ecx			; edi=tmp12
+	sub	eax,ecx			; eax=tmp10
+
+	; -- Final output stage
+
+	sub	edi,ebx		; edi=tmp6
+	pop	edx		; edx=tmp0
+	sub	esi,edi		; esi=tmp5
+	pop	ecx		; ecx=tmp1
+	add	eax,esi		; eax=tmp4
+	push	esi		; tmp5
+	push	eax		; tmp4
+
+	lea	eax,[edx+ebx]	; eax=data0(=tmp0+tmp7)
+	sub	edx,ebx		; edx=data7(=tmp0-tmp7)
+	lea	ebx,[ecx+edi]	; ebx=data1(=tmp1+tmp6)
+	sub	ecx,edi		; ecx=data6(=tmp1-tmp6)
+
+	mov	edi, POINTER [ptr]	; edi=wsptr
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], edx
+	mov	INT [COL(1,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(6,edi,SIZEOF_INT)], ecx
+
+	pop	esi		; esi=tmp4
+	pop	eax		; eax=tmp5
+	pop	edx		; edx=tmp2
+	pop	ecx		; ecx=tmp3
+
+	lea	ebx,[edx+eax]	; ebx=data2(=tmp2+tmp5)
+	sub	edx,eax		; edx=data5(=tmp2-tmp5)
+	lea	eax,[ecx+esi]	; eax=data4(=tmp3+tmp4)
+	sub	ecx,esi		; ecx=data3(=tmp3-tmp4)
+
+	mov	INT [COL(2,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(5,edi,SIZEOF_INT)], edx
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], ecx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_IFAST_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit (adds CENTERJSAMPLE by subtracting its negation)
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi					; save (JSAMPROW *); popped at .nextrow
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT	; term 1 or 2 is nonzero
+
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	eax, INT [ROW(4,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT	; some term 3..7 is nonzero
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK			; mask to a range_limit[] index
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]	; table lookup gives the output sample
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al	; same sample goes to all 8 outputs
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+
+	mov	POINTER [ptr], edi	; outptr (edi is reused as scratch below)
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(4,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(6,esi,SIZEOF_INT)]
+
+	lea	edx,[eax+ecx]		; edx=tmp10
+	sub	eax,ecx			; eax=tmp11
+
+	lea	ecx,[ebx+edi]		; ecx=tmp13
+	sub	ebx,edi
+	imul	ebx,(F_1_414)		; scaled by 2^CONST_BITS until the descale
+	descale	ebx,CONST_BITS
+	sub	ebx,ecx			; ebx=tmp12
+
+	lea	edi,[edx+ecx]		; edi=tmp0
+	sub	edx,ecx			; edx=tmp3
+	lea	ecx,[eax+ebx]		; ecx=tmp1
+	sub	eax,ebx			; eax=tmp2
+
+	push	edx		; tmp3
+	push	eax		; tmp2
+	push	ecx		; tmp1
+	push	edi		; tmp0
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	lea	esi,[eax+ebx]		; esi=z11
+	sub	eax,ebx			; eax=z12
+	lea	edx,[edi+ecx]		; edx=z13
+	sub	edi,ecx			; edi=z10
+
+	lea	ebx,[esi+edx]		; ebx=tmp7
+	sub	esi,edx
+	imul	esi,(F_1_414)		; esi=tmp11
+	descale	esi,CONST_BITS
+
+	lea	ecx,[edi+eax]
+	imul	ecx,(F_1_847)		; ecx=z5
+	imul	edi,(-F_2_613)		; edi=MULTIPLY(z10,-FIX_2_613125930)
+	imul	eax,(F_1_082)		; eax=MULTIPLY(z12,FIX_1_082392200)
+	descale	ecx,CONST_BITS
+	descale	edi,CONST_BITS
+	descale	eax,CONST_BITS
+	add	edi,ecx			; edi=tmp12
+	sub	eax,ecx			; eax=tmp10
+
+	; -- Final output stage
+
+	sub	edi,ebx		; edi=tmp6
+	pop	edx		; edx=tmp0
+	sub	esi,edi		; esi=tmp5
+	pop	ecx		; ecx=tmp1
+	add	eax,esi		; eax=tmp4
+	push	esi		; tmp5
+	push	eax		; tmp4
+
+	lea	eax,[edx+ebx]	; eax=data0(=tmp0+tmp7)
+	sub	edx,ebx		; edx=data7(=tmp0-tmp7)
+	lea	ebx,[ecx+edi]	; ebx=data1(=tmp1+tmp6)
+	sub	ecx,edi		; ecx=data6(=tmp1-tmp6)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	descale	edx,(PASS1_BITS+3)
+	descale	ebx,(PASS1_BITS+3)
+	descale	ecx,(PASS1_BITS+3)
+
+	mov	edi, POINTER [ptr]		; edi=outptr
+
+	and	eax,RANGE_MASK			; mask each value to a range_limit[] index
+	and	edx,RANGE_MASK
+	and	ebx,RANGE_MASK
+	and	ecx,RANGE_MASK
+
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]	; table lookups give the output samples
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], cl
+
+	pop	esi		; esi=tmp4
+	pop	eax		; eax=tmp5
+	pop	edx		; edx=tmp2
+	pop	ecx		; ecx=tmp3
+
+	lea	ebx,[edx+eax]	; ebx=data2(=tmp2+tmp5)
+	sub	edx,eax		; edx=data5(=tmp2-tmp5)
+	lea	eax,[ecx+esi]	; eax=data4(=tmp3+tmp4)
+	sub	ecx,esi		; ecx=data3(=tmp3-tmp4)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	ebx,(PASS1_BITS+3)
+	descale	edx,(PASS1_BITS+3)
+	descale	eax,(PASS1_BITS+3)
+	descale	ecx,(PASS1_BITS+3)
+
+	and	ebx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	eax,RANGE_MASK
+	and	ecx,RANGE_MASK
+
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], cl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi				; restore (JSAMPROW *) saved at .rowloop
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; discard the locals
+	pop	ebp
+	ret
+
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jidctint.asm b/jidctint.asm
new file mode 100644
index 0000000..eb81919
--- /dev/null
+++ b/jidctint.asm
@@ -0,0 +1,524 @@
+;
+; jidctint.asm - accurate integer IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; descale reg32,N : add the rounding term 2^(N-1), then shift right
+; arithmetically by N bits (correctly rounded removal of N scale bits).
+%macro	descale 2
+%if (%2)>7
+	add	%1, (1<<((%2)-1))	; rounding term needs imm32
+%else
+	add	%1, byte (1<<((%2)-1))	; rounding term fits in imm8
+%endif
+	sar	%1,%2
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fraction bits of the F_* multipliers below
+%define PASS1_BITS	2	; pass 1 descales by CONST_BITS-PASS1_BITS, pass 2 by CONST_BITS+PASS1_BITS+3
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time floating-point arithmetic, so each FIX() value is given as an integer scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n, rounded to nearest
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif	; CONST_BITS == 13
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 works on columns into a local workspace; pass 2 works on rows.
+; GLOBAL(void)
+; jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                  JCOEFPTR coef_block,
+;                  JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Stack arguments relative to frame base 'b' (ebp here); [b]=saved ebp, [b+4]=return address:
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+; Locals, laid out downward from ebp:
+%define range_limit	ebp-SIZEOF_POINTER		; JSAMPLE * range_limit
+%define ptr		range_limit-SIZEOF_POINTER	; void * ptr (scratch slot: wsptr in pass 1, outptr in pass 2)
+%define workspace	ptr-DCTSIZE2*SIZEOF_INT
+					; int workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_islow)
+; Pass 1 runs a 1-D IDCT down each column into workspace[]; pass 2 runs it along each row into the output samples.
+EXTN(jpeg_idct_islow):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]	; reserve the locals (workspace is the lowest one)
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; term 1 or 2 is nonzero
+
+	mov	bx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	ax, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT	; some term 3..7 is nonzero
+
+	; -- AC terms all zero
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; multiply term 0 by its quantptr entry
+	cwde				; eax = sign-extended 16-bit product
+
+	sal	eax,PASS1_BITS		; scale up by 2^PASS1_BITS
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax	; same value goes to all 8 entries
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], eax
+	mov	INT [COL(5,edi,SIZEOF_INT)], eax
+	mov	INT [COL(6,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+
+	mov	POINTER [ptr], edi	; wsptr (edi is reused as scratch below)
+
+	; -- Even part
+	; NOTE(review): each 16-bit imul below rewrites only the low word, so the upper word keeps movsx's sign bits; this looks like it assumes every product fits in 16 bits -- confirm.
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(4,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(4,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(2,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(6,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	lea	edx,[eax+ecx]
+	sub	eax,ecx
+	sal	edx,CONST_BITS	; edx=tmp0
+	sal	eax,CONST_BITS	; eax=tmp1
+
+	lea	ecx,[ebx+edi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	ebx,(F_0_765)	; ebx=MULTIPLY(z2,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(z3,-FIX_1_847759065)
+	add	ebx,ecx		; ebx=tmp3
+	add	edi,ecx		; edi=tmp2
+
+	lea	ecx,[edx+ebx]	; ecx=tmp10
+	sub	edx,ebx		; edx=tmp13
+	lea	ebx,[eax+edi]	; ebx=tmp11
+	sub	eax,edi		; eax=tmp12
+
+	push	edx		; tmp13
+	push	eax		; tmp12
+	push	ebx		; tmp11
+	push	ecx		; tmp10
+
+	; -- Odd part
+
+	mov	edx, POINTER [esp+16]	; quantptr (pushed before the 4 tmps)
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	bx, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	push	eax		; eax=tmp3
+	push	edi		; edi=tmp2
+	push	ecx		; ecx=tmp1
+	push	ebx		; ebx=tmp0
+
+	lea	esi,[ebx+edi]	; esi=z3
+	lea	edx,[ecx+eax]	; edx=z4
+	add	ebx,eax		; ebx=z1
+	add	ecx,edi		; ecx=z2
+
+	lea	eax,[esi+edx]
+	imul	eax,(F_1_175)	; eax=z5
+
+	imul	esi,(-F_1_961)	; esi=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edx,(-F_0_390)	; edx=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	ebx,(-F_0_899)	; ebx=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ecx,(-F_2_562)	; ecx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	esi,eax		; esi=z3(=z3+z5)
+	add	edx,eax		; edx=z4(=z4+z5)
+
+	lea	edi,[esi+ebx]	; edi=z1+z3
+	lea	eax,[edx+ecx]	; eax=z2+z4
+	add	esi,ecx		; esi=z2+z3
+	add	edx,ebx		; edx=z1+z4
+
+	pop	ecx		; ecx=tmp0
+	pop	ebx		; ebx=tmp1
+	imul	ecx,(F_0_298)	; ecx=tmp0(=MULTIPLY(tmp0,FIX_0_298631336))
+	imul	ebx,(F_2_053)	; ebx=tmp1(=MULTIPLY(tmp1,FIX_2_053119869))
+	add	edi,ecx		; edi=tmp0(=tmp0+z1+z3)
+	add	eax,ebx		; eax=tmp1(=tmp1+z2+z4)
+
+	pop	ecx		; ecx=tmp2
+	pop	ebx		; ebx=tmp3
+	imul	ecx,(F_3_072)	; ecx=tmp2(=MULTIPLY(tmp2,FIX_3_072711026))
+	imul	ebx,(F_1_501)	; ebx=tmp3(=MULTIPLY(tmp3,FIX_1_501321110))
+	add	esi,ecx		; esi=tmp2(=tmp2+z2+z3)
+	add	edx,ebx		; edx=tmp3(=tmp3+z1+z4)
+
+	; -- Final output stage
+
+	pop	ecx		; ecx=tmp10
+	pop	ebx		; ebx=tmp11
+	push	eax		; tmp1
+	push	edi		; tmp0
+
+	lea	eax,[ecx+edx]	; eax=data0(=tmp10+tmp3)
+	sub	ecx,edx		; ecx=data7(=tmp10-tmp3)
+	lea	edx,[ebx+esi]	; edx=data1(=tmp11+tmp2)
+	sub	ebx,esi		; ebx=data6(=tmp11-tmp2)
+
+	mov	edi, POINTER [ptr]	; edi=wsptr
+
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	edx,(CONST_BITS-PASS1_BITS)
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(7,edi,SIZEOF_INT)], ecx
+	mov	INT [COL(1,edi,SIZEOF_INT)], edx
+	mov	INT [COL(6,edi,SIZEOF_INT)], ebx
+
+	pop	esi		; esi=tmp0
+	pop	eax		; eax=tmp1
+	pop	ecx		; ecx=tmp12
+	pop	edx		; edx=tmp13
+
+	lea	ebx,[ecx+eax]	; ebx=data2(=tmp12+tmp1)
+	sub	ecx,eax		; ecx=data5(=tmp12-tmp1)
+	lea	eax,[edx+esi]	; eax=data3(=tmp13+tmp0)
+	sub	edx,esi		; edx=data4(=tmp13-tmp0)
+
+	descale	ebx,(CONST_BITS-PASS1_BITS)
+	descale	ecx,(CONST_BITS-PASS1_BITS)
+	descale	eax,(CONST_BITS-PASS1_BITS)
+	descale	edx,(CONST_BITS-PASS1_BITS)
+
+	mov	INT [COL(2,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(5,edi,SIZEOF_INT)], ecx
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	mov	INT [COL(4,edi,SIZEOF_INT)], edx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; JSAMPLE * range_limit (adds CENTERJSAMPLE by subtracting its negation)
+	mov	POINTER [range_limit], eax
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi					; save (JSAMPROW *); popped at .nextrow
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT	; term 1 or 2 is nonzero
+
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	eax, INT [ROW(4,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT	; some term 3..7 is nonzero
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK			; mask to a range_limit[] index
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]	; table lookup gives the output sample
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al	; same sample goes to all 8 outputs
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+
+	mov	POINTER [ptr], edi	; outptr (edi is reused as scratch below)
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(4,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(6,esi,SIZEOF_INT)]
+
+	lea	edx,[eax+ecx]
+	sub	eax,ecx
+	sal	edx,CONST_BITS	; edx=tmp0
+	sal	eax,CONST_BITS	; eax=tmp1
+
+	lea	ecx,[ebx+edi]
+	imul	ecx,(F_0_541)	; ecx=z1
+	imul	ebx,(F_0_765)	; ebx=MULTIPLY(z2,FIX_0_765366865)
+	imul	edi,(-F_1_847)	; edi=MULTIPLY(z3,-FIX_1_847759065)
+	add	ebx,ecx		; ebx=tmp3
+	add	edi,ecx		; edi=tmp2
+
+	lea	ecx,[edx+ebx]	; ecx=tmp10
+	sub	edx,ebx		; edx=tmp13
+	lea	ebx,[eax+edi]	; ebx=tmp11
+	sub	eax,edi		; eax=tmp12
+
+	push	edx		; tmp13
+	push	eax		; tmp12
+	push	ebx		; tmp11
+	push	ecx		; tmp10
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	push	eax		; eax=tmp3
+	push	edi		; edi=tmp2
+	push	ecx		; ecx=tmp1
+	push	ebx		; ebx=tmp0
+
+	lea	esi,[ebx+edi]	; esi=z3
+	lea	edx,[ecx+eax]	; edx=z4
+	add	ebx,eax		; ebx=z1
+	add	ecx,edi		; ecx=z2
+
+	lea	eax,[esi+edx]
+	imul	eax,(F_1_175)	; eax=z5
+
+	imul	esi,(-F_1_961)	; esi=z3(=MULTIPLY(z3,-FIX_1_961570560))
+	imul	edx,(-F_0_390)	; edx=z4(=MULTIPLY(z4,-FIX_0_390180644))
+	imul	ebx,(-F_0_899)	; ebx=z1(=MULTIPLY(z1,-FIX_0_899976223))
+	imul	ecx,(-F_2_562)	; ecx=z2(=MULTIPLY(z2,-FIX_2_562915447))
+
+	add	esi,eax		; esi=z3(=z3+z5)
+	add	edx,eax		; edx=z4(=z4+z5)
+
+	lea	edi,[esi+ebx]	; edi=z1+z3
+	lea	eax,[edx+ecx]	; eax=z2+z4
+	add	esi,ecx		; esi=z2+z3
+	add	edx,ebx		; edx=z1+z4
+
+	pop	ecx		; ecx=tmp0
+	pop	ebx		; ebx=tmp1
+	imul	ecx,(F_0_298)	; ecx=tmp0(=MULTIPLY(tmp0,FIX_0_298631336))
+	imul	ebx,(F_2_053)	; ebx=tmp1(=MULTIPLY(tmp1,FIX_2_053119869))
+	add	edi,ecx		; edi=tmp0(=tmp0+z1+z3)
+	add	eax,ebx		; eax=tmp1(=tmp1+z2+z4)
+
+	pop	ecx		; ecx=tmp2
+	pop	ebx		; ebx=tmp3
+	imul	ecx,(F_3_072)	; ecx=tmp2(=MULTIPLY(tmp2,FIX_3_072711026))
+	imul	ebx,(F_1_501)	; ebx=tmp3(=MULTIPLY(tmp3,FIX_1_501321110))
+	add	esi,ecx		; esi=tmp2(=tmp2+z2+z3)
+	add	edx,ebx		; edx=tmp3(=tmp3+z1+z4)
+
+	; -- Final output stage
+
+	pop	ecx		; ecx=tmp10
+	pop	ebx		; ebx=tmp11
+	push	eax		; tmp1
+	push	edi		; tmp0
+
+	lea	eax,[ecx+edx]	; eax=data0(=tmp10+tmp3)
+	sub	ecx,edx		; ecx=data7(=tmp10-tmp3)
+	lea	edx,[ebx+esi]	; edx=data1(=tmp11+tmp2)
+	sub	ebx,esi		; ebx=data6(=tmp11-tmp2)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(CONST_BITS+PASS1_BITS+3)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3)
+	descale	edx,(CONST_BITS+PASS1_BITS+3)
+	descale	ebx,(CONST_BITS+PASS1_BITS+3)
+
+	mov	edi, POINTER [ptr]		; edi=outptr
+
+	and	eax,RANGE_MASK			; mask each value to a range_limit[] index
+	and	ecx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	ebx,RANGE_MASK
+
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]	; table lookups give the output samples
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+7*SIZEOF_JSAMPLE], cl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+6*SIZEOF_JSAMPLE], bl
+
+	pop	esi		; esi=tmp0
+	pop	eax		; eax=tmp1
+	pop	ecx		; ecx=tmp12
+	pop	edx		; edx=tmp13
+
+	lea	ebx,[ecx+eax]	; ebx=data2(=tmp12+tmp1)
+	sub	ecx,eax		; ecx=data5(=tmp12-tmp1)
+	lea	eax,[edx+esi]	; eax=data3(=tmp13+tmp0)
+	sub	edx,esi		; edx=data4(=tmp13-tmp0)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	ebx,(CONST_BITS+PASS1_BITS+3)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3)
+	descale	eax,(CONST_BITS+PASS1_BITS+3)
+	descale	edx,(CONST_BITS+PASS1_BITS+3)
+
+	and	ebx,RANGE_MASK
+	and	ecx,RANGE_MASK
+	and	eax,RANGE_MASK
+	and	edx,RANGE_MASK
+
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+5*SIZEOF_JSAMPLE], cl
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+4*SIZEOF_JSAMPLE], dl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi				; restore (JSAMPROW *) saved at .rowloop
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; discard the locals
+	pop	ebp
+	ret
+
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jidctred.asm b/jidctred.asm
new file mode 100644
index 0000000..4463bfb
--- /dev/null
+++ b/jidctred.asm
@@ -0,0 +1,688 @@
+;
+; jidctred.asm - reduced-size IDCT (non-SIMD)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size output:
+; either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; Last Modified : October 17, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+; descale reg32, N : round-to-nearest descale of a DWORD that's scaled by N bits.
+; Adds the rounding bias 1<<(N-1) to %1, then arithmetic-shifts %1 right by N.
+%macro	descale 2
+%if (%2)<=7
+	add	%1, byte (1<<((%2)-1))	; add reg32,imm8  (bias <= 64 fits a signed byte)
+%else
+	add	%1, (1<<((%2)-1))	; add reg32,imm32 (bias >= 128 does not)
+%endif
+	sar	%1,%2			; signed shift, so negative values stay negative
+%endmacro
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fractional bits carried by the FIX() multipliers
+%define PASS1_BITS	2	; extra fractional bits kept in the pass-1 workspace
+; F_a_bcd = FIX(a.bcd) = round(a.bcd * 2^CONST_BITS); literals below are for 13 bits.
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot fold floating-point constants, so each value is given scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; rounded right shift by n
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+; jpeg_idct_4x4: scalar two-pass IDCT (columns, then rows), 8x8 in -> 4x4 out.
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Arguments are read from the caller's stack via ebp (see the %defines below).
+; GLOBAL(void)
+; jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores 4 samples into each of 4 output rows, starting at output_col.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER	; JSAMPLE * range_limit
+%define workspace	range_limit-(DCTSIZE*4)*SIZEOF_INT
+					; int workspace[DCTSIZE*4]
+
+	align	16
+	global	EXTN(jpeg_idct_4x4)
+
+EXTN(jpeg_idct_4x4):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]	; reserve the range_limit slot and workspace[]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	; Don't bother to process column 4, because second pass won't use it
+	cmp	ecx, byte DCTSIZE-4	; ctr counts down from DCTSIZE: 4 <=> column 4
+	je	near .nextcolumn
+
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; term 1 or 2 is nonzero
+
+	mov	ax, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	mov	bx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	or	bx, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	or	ax,bx
+	jnz	short .columnDCT	; term 3, 5, 6 or 7 is nonzero
+
+	; -- AC terms all zero; we need not examine term 4 for 4x4 output
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde				; sign-extend the 16-bit product into eax
+
+	sal	eax, PASS1_BITS		; scale DC term up to workspace precision
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	mov	INT [COL(2,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], eax
+	jmp	near .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	esi	; coef_block
+	push	edx	; quantptr
+	push	edi	; wsptr
+
+	; -- Even part
+
+	movsx	ebx, JCOEF [COL(2,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(6,esi,SIZEOF_JCOEF)]
+	movsx	eax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(2,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(6,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; NOTE(review): imul r16 writes only the low word; the upper word keeps the coef's sign, so the value is exact only if the product fits in 16 signed bits.
+	imul	ebx,(F_1_847)		; ebx=MULTIPLY(z2,FIX_1_847759065)
+	imul	ecx,(-F_0_765)		; ecx=MULTIPLY(z3,-FIX_0_765366865)
+	sal	eax,(CONST_BITS+1)	; eax=tmp0
+	add	ecx,ebx			; ecx=tmp2
+
+	lea	edi,[eax+ecx]		; edi=tmp10
+	sub	eax,ecx			; eax=tmp12
+
+	push	eax		; tmp12
+	push	edi		; tmp10
+
+	; -- Odd part
+
+	movsx	edi, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	imul	di, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ebx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	imul	bx, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; inptr (esi) and quantptr (edx) are dead from here on; both are reused as temporaries
+	mov	esi,edi		; esi=edi=z1
+	mov	edx,ecx		; edx=ecx=z2
+	imul	edi,(-F_0_211)	; edi=MULTIPLY(z1,-FIX_0_211164243)
+	imul	ecx,(F_1_451)	; ecx=MULTIPLY(z2,FIX_1_451774981)
+	imul	esi,(-F_0_509)	; esi=MULTIPLY(z1,-FIX_0_509795579)
+	imul	edx,(-F_0_601)	; edx=MULTIPLY(z2,-FIX_0_601344887)
+
+	add	edi,ecx		; edi=(tmp0)
+	add	esi,edx		; esi=(tmp2)
+
+	mov	ecx,ebx		; ecx=ebx=z3
+	mov	edx,eax		; edx=eax=z4
+	imul	ebx,(-F_2_172)	; ebx=MULTIPLY(z3,-FIX_2_172734803)
+	imul	eax,(F_1_061)	; eax=MULTIPLY(z4,FIX_1_061594337)
+	imul	ecx,(F_0_899)	; ecx=MULTIPLY(z3,FIX_0_899976223)
+	imul	edx,(F_2_562)	; edx=MULTIPLY(z4,FIX_2_562915447)
+
+	add	edi,ebx
+	add	esi,ecx
+	add	edi,eax		; edi=tmp0
+	add	esi,edx		; esi=tmp2
+
+	; -- Final output stage
+
+	pop	ebx		; ebx=tmp10
+	pop	ecx		; ecx=tmp12
+
+	lea	eax,[ebx+esi]	; eax=data0(=tmp10+tmp2)
+	sub	ebx,esi		; ebx=data3(=tmp10-tmp2)
+	lea	edx,[ecx+edi]	; edx=data1(=tmp12+tmp0)
+	sub	ecx,edi		; ecx=data2(=tmp12-tmp0)
+
+	pop	edi	; wsptr
+
+	descale	eax,(CONST_BITS-PASS1_BITS+1)
+	descale	ebx,(CONST_BITS-PASS1_BITS+1)
+	descale	edx,(CONST_BITS-PASS1_BITS+1)
+	descale	ecx,(CONST_BITS-PASS1_BITS+1)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(3,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(1,edi,SIZEOF_INT)], edx
+	mov	INT [COL(2,edi,SIZEOF_INT)], ecx
+
+	pop	edx	; quantptr
+	pop	esi	; coef_block
+	pop	ecx	; ctr
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process 4 rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; eax = table base + CENTERJSAMPLE
+	mov	POINTER [range_limit], eax	; cache it in the local slot below ebp
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE/2				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi					; save the (JSAMPROW *) cursor
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(2,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	eax, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(6,esi,SIZEOF_INT)]
+	or	ebx, INT [ROW(7,esi,SIZEOF_INT)]
+	or	eax,ebx
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK			; mask the value into a table index
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], al
+	jmp	near .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	esi	; wsptr
+	push	ecx	; ctr
+	push	edi	; outptr
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(2,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(6,esi,SIZEOF_INT)]
+
+	imul	ebx,(F_1_847)		; ebx=MULTIPLY(z2,FIX_1_847759065)
+	imul	ecx,(-F_0_765)		; ecx=MULTIPLY(z3,-FIX_0_765366865)
+	sal	eax,(CONST_BITS+1)	; eax=tmp0
+	add	ecx,ebx			; ecx=tmp2
+
+	lea	edi,[eax+ecx]		; edi=tmp10
+	sub	eax,ecx			; eax=tmp12
+
+	push	eax		; tmp12
+	push	edi		; tmp10
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	edi, INT [ROW(7,esi,SIZEOF_INT)]
+	; wsptr (esi) is no longer needed for this row; it is reused as a temporary
+	mov	esi,edi		; esi=edi=z1
+	mov	edx,ecx		; edx=ecx=z2
+	imul	edi,(-F_0_211)	; edi=MULTIPLY(z1,-FIX_0_211164243)
+	imul	ecx,(F_1_451)	; ecx=MULTIPLY(z2,FIX_1_451774981)
+	imul	esi,(-F_0_509)	; esi=MULTIPLY(z1,-FIX_0_509795579)
+	imul	edx,(-F_0_601)	; edx=MULTIPLY(z2,-FIX_0_601344887)
+
+	add	edi,ecx		; edi=(tmp0)
+	add	esi,edx		; esi=(tmp2)
+
+	mov	ecx,ebx		; ecx=ebx=z3
+	mov	edx,eax		; edx=eax=z4
+	imul	ebx,(-F_2_172)	; ebx=MULTIPLY(z3,-FIX_2_172734803)
+	imul	eax,(F_1_061)	; eax=MULTIPLY(z4,FIX_1_061594337)
+	imul	ecx,(F_0_899)	; ecx=MULTIPLY(z3,FIX_0_899976223)
+	imul	edx,(F_2_562)	; edx=MULTIPLY(z4,FIX_2_562915447)
+
+	add	edi,ebx
+	add	esi,ecx
+	add	edi,eax		; edi=tmp0
+	add	esi,edx		; esi=tmp2
+
+	; -- Final output stage
+
+	pop	ebx		; ebx=tmp10
+	pop	ecx		; ecx=tmp12
+
+	lea	eax,[ebx+esi]	; eax=data0(=tmp10+tmp2)
+	sub	ebx,esi		; ebx=data3(=tmp10-tmp2)
+	lea	edx,[ecx+edi]	; edx=data1(=tmp12+tmp0)
+	sub	ecx,edi		; ecx=data2(=tmp12-tmp0)
+
+	mov	esi, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(CONST_BITS+PASS1_BITS+3+1)
+	descale	ebx,(CONST_BITS+PASS1_BITS+3+1)
+	descale	edx,(CONST_BITS+PASS1_BITS+3+1)
+	descale	ecx,(CONST_BITS+PASS1_BITS+3+1)
+
+	pop	edi	; outptr
+
+	and	eax,RANGE_MASK
+	and	ebx,RANGE_MASK
+	and	edx,RANGE_MASK
+	and	ecx,RANGE_MASK
+	; each masked value indexes the range_limit table to produce the output sample
+	mov	al, JSAMPLE [esi+eax*SIZEOF_JSAMPLE]
+	mov	bl, JSAMPLE [esi+ebx*SIZEOF_JSAMPLE]
+	mov	dl, JSAMPLE [esi+edx*SIZEOF_JSAMPLE]
+	mov	cl, JSAMPLE [esi+ecx*SIZEOF_JSAMPLE]
+
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+3*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], dl
+	mov	JSAMPLE [edi+2*SIZEOF_JSAMPLE], cl
+
+	pop	ecx	; ctr
+	pop	esi	; wsptr
+
+.nextrow:
+	pop	edi				; restore the (JSAMPROW *) cursor
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; drop workspace[] and the range_limit slot
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+; jpeg_idct_2x2: scalar two-pass IDCT (columns, then rows), 8x8 in -> 2x2 out.
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Arguments are read from the caller's stack via ebp (see the %defines below).
+; GLOBAL(void)
+; jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores 2 samples into each of 2 output rows, starting at output_col.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define range_limit	ebp-SIZEOF_POINTER	; JSAMPLE * range_limit
+%define workspace	range_limit-(DCTSIZE*2)*SIZEOF_INT
+					; int workspace[DCTSIZE*2]
+
+	align	16
+	global	EXTN(jpeg_idct_2x2)
+
+EXTN(jpeg_idct_2x2):
+	push	ebp
+	mov	ebp,esp
+	lea	esp, [workspace]	; reserve the range_limit slot and workspace[]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+	lea	edi, [workspace]			; int * wsptr
+	mov	ecx, DCTSIZE				; ctr
+	alignx	16,7
+.columnloop:
+	; Don't bother to process columns 2,4,6
+	test	ecx, 0x09		; ctr = 6,4,2 (columns 2,4,6) have bits 0 and 3 clear
+	jz	near .nextcolumn
+
+	mov	ax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; term 1 or 3 is nonzero
+
+	mov	ax, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	or	ax, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; term 5 or 7 is nonzero
+
+	; -- AC terms all zero; we need not examine terms 2,4,6 for 2x2 output
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde				; sign-extend the 16-bit product into eax
+
+	sal	eax, PASS1_BITS		; scale DC term up to workspace precision
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], eax
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+	jmp	short .nextcolumn
+	alignx	16,7
+
+.columnDCT:
+	push	ecx	; ctr
+	push	edi	; wsptr
+
+	; -- Odd part
+
+	movsx	eax, JCOEF [COL(1,esi,SIZEOF_JCOEF)]
+	movsx	ebx, JCOEF [COL(3,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	bx, ISLOW_MULT_TYPE [COL(3,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movsx	ecx, JCOEF [COL(5,esi,SIZEOF_JCOEF)]
+	movsx	edi, JCOEF [COL(7,esi,SIZEOF_JCOEF)]
+	imul	cx, ISLOW_MULT_TYPE [COL(5,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	imul	di, ISLOW_MULT_TYPE [COL(7,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; NOTE(review): imul r16 writes only the low word; the upper word keeps the coef's sign, so the value is exact only if the product fits in 16 signed bits.
+	imul	eax,(F_3_624)	; eax=MULTIPLY(data1,FIX_3_624509785)
+	imul	ebx,(-F_1_272)	; ebx=MULTIPLY(data3,-FIX_1_272758580)
+	imul	ecx,(F_0_850)	; ecx=MULTIPLY(data5,FIX_0_850430095)
+	imul	edi,(-F_0_720)	; edi=MULTIPLY(data7,-FIX_0_720959822)
+
+	add	eax,ebx
+	add	ecx,edi
+	add	ecx,eax		; ecx=tmp0
+
+	; -- Even part
+
+	mov	ax, JCOEF [COL(0,esi,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	cwde				; sign-extend the 16-bit product into eax
+
+	sal	eax,(CONST_BITS+2)	; eax=tmp10
+
+	; -- Final output stage
+
+	pop	edi	; wsptr
+
+	lea	ebx,[eax+ecx]	; ebx=data0(=tmp10+tmp0)
+	sub	eax,ecx		; eax=data1(=tmp10-tmp0)
+
+	pop	ecx	; ctr
+
+	descale	ebx,(CONST_BITS-PASS1_BITS+2)
+	descale	eax,(CONST_BITS-PASS1_BITS+2)
+
+	mov	INT [COL(0,edi,SIZEOF_INT)], ebx
+	mov	INT [COL(1,edi,SIZEOF_INT)], eax
+
+.nextcolumn:
+	add	esi, byte SIZEOF_JCOEF	; advance pointers to next column
+	add	edx, byte SIZEOF_ISLOW_MULT_TYPE
+	add	edi, byte SIZEOF_INT
+	dec	ecx
+	jnz	near .columnloop
+
+	; ---- Pass 2: process 2 rows from work array, store into output array.
+
+	mov	eax, POINTER [cinfo(ebp)]
+	mov	eax, POINTER [jdstruct_sample_range_limit(eax)]
+	sub	eax, byte -CENTERJSAMPLE*SIZEOF_JSAMPLE	; eax = table base + CENTERJSAMPLE
+	mov	POINTER [range_limit], eax	; cache it in the local slot below ebp
+
+	lea	esi, [workspace]			; int * wsptr
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+	push	edi					; save the (JSAMPROW *) cursor
+	mov	edi, JSAMPROW [edi]			; (JSAMPLE *)
+	add	edi, JDIMENSION [output_col(ebp)]	; edi=outptr
+
+%ifndef NO_ZERO_ROW_TEST
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(3,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	mov	eax, INT [ROW(5,esi,SIZEOF_INT)]
+	or	eax, INT [ROW(7,esi,SIZEOF_INT)]
+	jnz	short .rowDCT
+
+	; -- AC terms all zero
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	descale	eax,(PASS1_BITS+3)
+	and	eax,RANGE_MASK			; mask the value into a table index
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], al
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+	jmp	short .nextrow
+	alignx	16,7
+%endif
+.rowDCT:
+	push	ecx	; ctr
+
+	; -- Odd part
+
+	mov	eax, INT [ROW(1,esi,SIZEOF_INT)]
+	mov	ebx, INT [ROW(3,esi,SIZEOF_INT)]
+	mov	ecx, INT [ROW(5,esi,SIZEOF_INT)]
+	mov	edx, INT [ROW(7,esi,SIZEOF_INT)]
+
+	imul	eax,(F_3_624)	; eax=MULTIPLY(data1,FIX_3_624509785)
+	imul	ebx,(-F_1_272)	; ebx=MULTIPLY(data3,-FIX_1_272758580)
+	imul	ecx,(F_0_850)	; ecx=MULTIPLY(data5,FIX_0_850430095)
+	imul	edx,(-F_0_720)	; edx=MULTIPLY(data7,-FIX_0_720959822)
+
+	add	eax,ebx
+	add	ecx,edx
+	add	ecx,eax		; ecx=tmp0
+
+	; -- Even part
+
+	mov	eax, INT [ROW(0,esi,SIZEOF_INT)]
+
+	sal	eax,(CONST_BITS+2)	; eax=tmp10
+
+	; -- Final output stage
+
+	mov	edx, POINTER [range_limit]	; (JSAMPLE *)
+
+	lea	ebx,[eax+ecx]	; ebx=data0(=tmp10+tmp0)
+	sub	eax,ecx		; eax=data1(=tmp10-tmp0)
+
+	pop	ecx	; ctr
+
+	descale	ebx,(CONST_BITS+PASS1_BITS+3+2)
+	descale	eax,(CONST_BITS+PASS1_BITS+3+2)
+
+	and	ebx,RANGE_MASK			; mask both values into table indices
+	and	eax,RANGE_MASK
+	mov	bl, JSAMPLE [edx+ebx*SIZEOF_JSAMPLE]
+	mov	al, JSAMPLE [edx+eax*SIZEOF_JSAMPLE]
+	mov	JSAMPLE [edi+0*SIZEOF_JSAMPLE], bl
+	mov	JSAMPLE [edi+1*SIZEOF_JSAMPLE], al
+
+.nextrow:
+	pop	edi				; restore the (JSAMPROW *) cursor
+	add	esi, byte DCTSIZE*SIZEOF_INT	; advance pointer to next row
+	add	edi, byte SIZEOF_JSAMPROW
+	dec	ecx
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; drop workspace[] and the range_limit slot
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+; jpeg_idct_1x1: frameless routine; uses only eax, ecx, edx and 16-bit arithmetic.
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 1x1 output block.
+; Stores one sample at output_buf[0][output_col]; only the DC term is read.
+; GLOBAL(void)
+; jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                JCOEFPTR coef_block,
+;                JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define ebp		esp-4		; no frame is built: (ebp)+8 expands to esp+4
+
+	align	16
+	global	EXTN(jpeg_idct_1x1)
+
+EXTN(jpeg_idct_1x1):
+;	push	ebp
+;	mov	ebp,esp
+;	push	ebx		; unused
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+;	push	edi		; unused
+
+	; We hardly need an inverse DCT routine for this: just take the
+	; average pixel value, which is one-eighth of the DC coefficient.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	ecx, JCOEFPTR [coef_block(ebp)]		; inptr
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+
+	mov	ax, JCOEF [COL(0,ecx,SIZEOF_JCOEF)]
+	imul	ax, ISLOW_MULT_TYPE [COL(0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; ax = dequantized DC term (16-bit product)
+	mov	ecx, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	edx, JDIMENSION [output_col(ebp)]
+	mov	ecx, JSAMPROW [ecx]			; (JSAMPLE *)
+
+	add	ax, (1 << (3-1)) + (CENTERJSAMPLE << 3)	; rounding bias + level shift (pre-scaled by 8)
+	sar	ax,3		; descale
+
+	test	ah,ah		; unsigned saturation
+	jz	short .output	; ah==0: ax is already within 0..255
+	not	ax		; flip the sign bit: negative -> bit 15 clear, too large -> set
+	sar	ax,15		; broadcast it: negative -> al=0x00, too large -> al=0xFF
+	alignx	16,3
+.output:
+	mov	JSAMPLE [ecx+edx*SIZEOF_JSAMPLE], al
+
+;	pop	edi		; unused
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+;	pop	ebx		; unused
+;	pop	ebp
+	ret
+
+%endif ; IDCT_SCALING_SUPPORTED
diff --git a/jimmxfst.asm b/jimmxfst.asm
new file mode 100644
index 0000000..de0def6
--- /dev/null
+++ b/jimmxfst.asm
@@ -0,0 +1,510 @@
+;
+; jimmxfst.asm - fast integer IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see jidctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	8	; 14 is also OK.
+%define PASS1_BITS	2	; must match IFAST_SCALE_BITS (checked just below)
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+; F_a_bcd = FIX(a.bcd) = round(a.bcd * 2^CONST_BITS); literals below are for 8 bits.
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot fold floating-point constants, so each value is given scaled by 2^30.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; rounded right shift by n
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; (pmulhw keeps the high 16 bits of each product, so the shifts must total 16.)
+%define PRE_MULTIPLY_SCALE_BITS   2	; left shift applied to the data operand
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414	times 4 dw  F_1_414 << CONST_SHIFT	; 4 packed words, one per MMX lane
+PW_F1847	times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613	times 4 dw -F_1_613 << CONST_SHIFT	; negated multiplier
+PW_F1082	times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; 8 packed bytes
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+; jpeg_idct_ifast_mmx: two-pass MMX IDCT working on 4 columns / 4 rows per iteration.
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores 8 samples into each of 8 output rows, starting at output_col.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_ifast_mmx)
+
+EXTN(jpeg_idct_ifast_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room to store that frame base
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]	; eax still holds the frame base
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT	; cheap scalar pre-check before the full MMX test
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1		; saturating pack: a nonzero word stays a nonzero byte
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1			; the "- z10" term, using the unscaled copy
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 01 02 03)
+	paddw	mm7,mm0			; mm7=data1=(10 11 12 13)
+	psubw	mm1,mm3			; mm1=data7=(70 71 72 73)
+	psubw	mm5,mm0			; mm5=data6=(60 61 62 63)
+	psubw	mm4,mm0			; mm4=tmp5
+
+	movq      mm3,mm6		; transpose coefficients(phase 1)
+	punpcklwd mm6,mm7		; mm6=(00 10 01 11)
+	punpckhwd mm3,mm7		; mm3=(02 12 03 13)
+	movq      mm0,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm1		; mm5=(60 70 61 71)
+	punpckhwd mm0,mm1		; mm0=(62 72 63 73)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp2
+	movq	mm1, MMWORD [wk(1)]	; mm1=tmp3
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(60 70 61 71)
+	movq	MMWORD [wk(1)], mm0	; wk(1)=(62 72 63 73)
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm7
+	movq	mm0,mm1
+	paddw	mm7,mm4			; mm7=data2=(20 21 22 23)
+	paddw	mm1,mm2			; mm1=data4=(40 41 42 43)
+	psubw	mm5,mm4			; mm5=data5=(50 51 52 53)
+	psubw	mm0,mm2			; mm0=data3=(30 31 32 33)
+
+	movq      mm4,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm0		; mm7=(20 30 21 31)
+	punpckhwd mm4,mm0		; mm4=(22 32 23 33)
+	movq      mm2,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm5		; mm1=(40 50 41 51)
+	punpckhwd mm2,mm5		; mm2=(42 52 43 53)
+
+	movq      mm0,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(00 10 20 30)
+	punpckhdq mm0,mm7		; mm0=(01 11 21 31)
+	movq      mm5,mm3		; transpose coefficients(phase 2)
+	punpckldq mm3,mm4		; mm3=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(60 70 61 71)
+	movq	mm4, MMWORD [wk(1)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm6,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm7		; mm1=(40 50 60 70)
+	punpckhdq mm6,mm7		; mm6=(41 51 61 71)
+	movq      mm0,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(42 52 62 72)
+	punpckhdq mm0,mm4		; mm0=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_IFAST_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; reload the frame base saved by the prologue
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm0
+	movq	mm5,mm1
+	psubw	mm0,mm2			; mm0=tmp11
+	psubw	mm1,mm3
+	paddw	mm4,mm2			; mm4=tmp10
+	paddw	mm5,mm3			; mm5=tmp13
+
+	psllw	mm1,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm1,[GOTOFF(ebx,PW_F1414)]
+	psubw	mm1,mm5			; mm1=tmp12
+
+	movq	mm6,mm4
+	movq	mm7,mm0
+	psubw	mm4,mm5			; mm4=tmp3
+	psubw	mm0,mm1			; mm0=tmp2
+	paddw	mm6,mm5			; mm6=tmp0
+	paddw	mm7,mm1			; mm7=tmp1
+
+	movq	MMWORD [wk(1)], mm4	; wk(1)=tmp3
+	movq	MMWORD [wk(0)], mm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movq	mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm4,mm2
+	movq	mm0,mm5
+	psubw	mm2,mm1			; mm2=z12
+	psubw	mm5,mm3			; mm5=z10
+	paddw	mm4,mm1			; mm4=z11
+	paddw	mm0,mm3			; mm0=z13
+
+	movq	mm1,mm5			; mm1=z10(unscaled)
+	psllw	mm2,PRE_MULTIPLY_SCALE_BITS
+	psllw	mm5,PRE_MULTIPLY_SCALE_BITS
+
+	movq	mm3,mm4
+	psubw	mm4,mm0
+	paddw	mm3,mm0			; mm3=tmp7
+
+	psllw	mm4,PRE_MULTIPLY_SCALE_BITS
+	pmulhw	mm4,[GOTOFF(ebx,PW_F1414)]	; mm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movq	mm0,mm5
+	paddw	mm5,mm2
+	pmulhw	mm5,[GOTOFF(ebx,PW_F1847)]	; mm5=z5
+	pmulhw	mm0,[GOTOFF(ebx,PW_MF1613)]
+	pmulhw	mm2,[GOTOFF(ebx,PW_F1082)]
+	psubw	mm0,mm1			; the "- z10" term, using the unscaled copy
+	psubw	mm2,mm5			; mm2=tmp10
+	paddw	mm0,mm5			; mm0=tmp12
+
+	; -- Final output stage
+
+	psubw	mm0,mm3			; mm0=tmp6
+	movq	mm1,mm6
+	movq	mm5,mm7
+	paddw	mm6,mm3			; mm6=data0=(00 10 20 30)
+	paddw	mm7,mm0			; mm7=data1=(01 11 21 31)
+	psraw	mm6,(PASS1_BITS+3)	; descale
+	psraw	mm7,(PASS1_BITS+3)	; descale
+	psubw	mm1,mm3			; mm1=data7=(07 17 27 37)
+	psubw	mm5,mm0			; mm5=data6=(06 16 26 36)
+	psraw	mm1,(PASS1_BITS+3)	; descale
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psubw	mm4,mm0			; mm4=tmp5
+
+	packsswb  mm6,mm5		; mm6=(00 10 20 30 06 16 26 36)
+	packsswb  mm7,mm1		; mm7=(01 11 21 31 07 17 27 37)
+
+	movq	mm3, MMWORD [wk(0)]	; mm3=tmp2
+	movq	mm0, MMWORD [wk(1)]	; mm0=tmp3
+
+	paddw	mm2,mm4			; mm2=tmp4
+	movq	mm5,mm3
+	movq	mm1,mm0
+	paddw	mm3,mm4			; mm3=data2=(02 12 22 32)
+	paddw	mm0,mm2			; mm0=data4=(04 14 24 34)
+	psraw	mm3,(PASS1_BITS+3)	; descale
+	psraw	mm0,(PASS1_BITS+3)	; descale
+	psubw	mm5,mm4			; mm5=data5=(05 15 25 35)
+	psubw	mm1,mm2			; mm1=data3=(03 13 23 33)
+	psraw	mm5,(PASS1_BITS+3)	; descale
+	psraw	mm1,(PASS1_BITS+3)	; descale
+
+	movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm4=[PB_CENTERJSAMP]
+
+	packsswb  mm3,mm0		; mm3=(02 12 22 32 04 14 24 34)
+	packsswb  mm1,mm5		; mm1=(03 13 23 33 05 15 25 35)
+	; signed bytes + CENTERJSAMPLE (wrapping byte add) give the final sample values
+	paddb     mm6,mm4
+	paddb     mm7,mm4
+	paddb     mm3,mm4
+	paddb     mm1,mm4
+
+	movq      mm2,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm7		; mm6=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm7		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm0,mm3		; transpose coefficients(phase 1)
+	punpcklbw mm3,mm1		; mm3=(02 03 12 13 22 23 32 33)
+	punpckhbw mm0,mm1		; mm0=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm6		; transpose coefficients(phase 2)
+	punpcklwd mm6,mm3		; mm6=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm3		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm4,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(04 05 06 07 14 15 16 17)
+	punpckhwd mm4,mm2		; mm4=(24 25 26 27 34 35 36 37)
+
+	movq      mm7,mm6		; transpose coefficients(phase 3)
+	punpckldq mm6,mm0		; mm6=(00 01 02 03 04 05 06 07)
+	punpckhdq mm7,mm0		; mm7=(10 11 12 13 14 15 16 17)
+	movq      mm1,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm4		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm1,mm4		; mm1=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+	; ebx is borrowed as a second row pointer; eax = output_col throughout the row loop
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jimmxint.asm b/jimmxint.asm
new file mode 100644
index 0000000..2a33a63
--- /dev/null
+++ b/jimmxint.asm
@@ -0,0 +1,862 @@
+;
+; jimmxint.asm - accurate integer IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fraction bits of the F_x_xxx fixed-point constants
+%define PASS1_BITS	2		; extra fraction bits kept in the pass 1 results
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)		; psrad count used in pass 1
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)	; psrad count used in pass 2
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x / 2^n, adding half before the shift; literals below are scaled by 2^30
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054	times 2 dw  (F_0_541+F_0_765), F_0_541	; pmaddwd pair: (z2,z3) -> tmp3, even part
+PW_F054_MF130	times 2 dw  F_0_541, (F_0_541-F_1_847)	; pmaddwd pair: (z2,z3) -> tmp2, even part
+PW_MF078_F117	times 2 dw  (F_1_175-F_1_961), F_1_175	; pmaddwd pair: (z3,z4) -> z3, odd part
+PW_F117_F078	times 2 dw  F_1_175, (F_1_175-F_0_390)	; pmaddwd pair: (z3,z4) -> z4, odd part
+PW_MF060_MF089	times 2 dw  (F_0_298-F_0_899),-F_0_899	; pmaddwd pair: (tmp0,tmp3) -> tmp0, odd part
+PW_MF089_F060	times 2 dw -F_0_899, (F_1_501-F_0_899)	; pmaddwd pair: (tmp0,tmp3) -> tmp3, odd part
+PW_MF050_MF256	times 2 dw  (F_2_053-F_2_562),-F_2_562	; pmaddwd pair: (tmp1,tmp2) -> tmp1, odd part
+PW_MF256_F050	times 2 dw -F_2_562, (F_3_072-F_2_562)	; pmaddwd pair: (tmp1,tmp2) -> tmp2, odd part
+PD_DESCALE_P1	times 2 dd  1 << (DESCALE_P1-1)		; rounding term added before psrad DESCALE_P1
+PD_DESCALE_P2	times 2 dd  1 << (DESCALE_P2-1)		; rounding term added before psrad DESCALE_P2
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE		; added bytewise (paddb) to the packed pass 2 results
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1: columns -> workspace[].  Pass 2: rows -> output_buf at output_col.
+; GLOBAL(void)
+; jpeg_idct_islow_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+; Each loop iteration handles 4 input columns (pass 1) or 4 output rows (pass 2).
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot where the prologue stores the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		12		; wk(0)..wk(11) are used as spill slots below
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_islow_mmx)
+
+EXTN(jpeg_idct_islow_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base; args are at eax+8 and up)
+	sub	esp, byte 4			; make room for the slot written by "mov [esp],eax"
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = original ebp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; (eax presumably still holds it from the prologue - assumes get_GOT leaves eax intact)
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; quick pre-test on DWBLOCK(1,0) ...
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... ORed with DWBLOCK(2,0)
+	jnz	short .columnDCT				; nonzero -> full column IDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; OR together MMBLOCK(1..7,0)
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1		; squeeze the four ORed words into the low dword
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT	; some AC term is nonzero -> full column IDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize the DC terms
+
+	psllw	mm0,PASS1_BITS		; scale up by PASS1_BITS
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5		; mm5=z3
+	movq      mm0,mm5
+	punpcklwd mm2,mm7		; mm7=z4
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3		; mm3=in7=tmp0
+	movq      mm0,mm3
+	punpcklwd mm2,mm4		; mm4=in1=tmp3
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1		; mm1=in5=tmp1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6		; mm6=in3=tmp2
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm3=[PD_DESCALE_P1]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P1
+	psrad	mm7,DESCALE_P1
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm5,mm7		; mm5=data0=(00 01 02 03)
+	packssdw  mm2,mm0		; mm2=data7=(70 71 72 73)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm1=[PD_DESCALE_P1]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P1
+	psrad	mm3,DESCALE_P1
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+
+	packssdw  mm4,mm3		; mm4=data1=(10 11 12 13)
+	packssdw  mm7,mm0		; mm7=data6=(60 61 62 63)
+
+	movq      mm6,mm5		; transpose coefficients(phase 1)
+	punpcklwd mm5,mm4		; mm5=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm1,mm7		; transpose coefficients(phase 1)
+	punpcklwd mm7,mm2		; mm7=(60 70 61 71)
+	punpckhwd mm1,mm2		; mm1=(62 72 63 73)
+
+	movq	mm3, MMWORD [wk(6)]	; mm3=tmp12L
+	movq	mm0, MMWORD [wk(7)]	; mm0=tmp12H
+	movq	mm4, MMWORD [wk(10)]	; mm4=tmp1L
+	movq	mm2, MMWORD [wk(11)]	; mm2=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 01 11)
+	movq	MMWORD [wk(1)], mm6	; wk(1)=(02 12 03 13)
+	movq	MMWORD [wk(4)], mm7	; wk(4)=(60 70 61 71)
+	movq	MMWORD [wk(5)], mm1	; wk(5)=(62 72 63 73)
+
+	movq	mm5,mm3
+	movq	mm6,mm0
+	paddd	mm3,mm4			; mm3=data2L
+	paddd	mm0,mm2			; mm0=data2H
+	psubd	mm5,mm4			; mm5=data5L
+	psubd	mm6,mm2			; mm6=data5H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm7=[PD_DESCALE_P1]
+
+	paddd	mm3,mm7
+	paddd	mm0,mm7
+	psrad	mm3,DESCALE_P1
+	psrad	mm0,DESCALE_P1
+	paddd	mm5,mm7
+	paddd	mm6,mm7
+	psrad	mm5,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm3,mm0		; mm3=data2=(20 21 22 23)
+	packssdw  mm5,mm6		; mm5=data5=(50 51 52 53)
+
+	movq	mm1, MMWORD [wk(2)]	; mm1=tmp13L
+	movq	mm4, MMWORD [wk(3)]	; mm4=tmp13H
+	movq	mm2, MMWORD [wk(8)]	; mm2=tmp0L
+	movq	mm7, MMWORD [wk(9)]	; mm7=tmp0H
+
+	movq	mm0,mm1
+	movq	mm6,mm4
+	paddd	mm1,mm2			; mm1=data3L
+	paddd	mm4,mm7			; mm4=data3H
+	psubd	mm0,mm2			; mm0=data4L
+	psubd	mm6,mm7			; mm6=data4H
+
+	movq	mm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; mm2=[PD_DESCALE_P1]
+
+	paddd	mm1,mm2
+	paddd	mm4,mm2
+	psrad	mm1,DESCALE_P1
+	psrad	mm4,DESCALE_P1
+	paddd	mm0,mm2
+	paddd	mm6,mm2
+	psrad	mm0,DESCALE_P1
+	psrad	mm6,DESCALE_P1
+
+	packssdw  mm1,mm4		; mm1=data3=(30 31 32 33)
+	packssdw  mm0,mm6		; mm0=data4=(40 41 42 43)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=(00 10 01 11)
+	movq	mm2, MMWORD [wk(1)]	; mm2=(02 12 03 13)
+
+	movq      mm4,mm3		; transpose coefficients(phase 1)
+	punpcklwd mm3,mm1		; mm3=(20 30 21 31)
+	punpckhwd mm4,mm1		; mm4=(22 32 23 33)
+	movq      mm6,mm0		; transpose coefficients(phase 1)
+	punpcklwd mm0,mm5		; mm0=(40 50 41 51)
+	punpckhwd mm6,mm5		; mm6=(42 52 43 53)
+
+	movq      mm1,mm7		; transpose coefficients(phase 2)
+	punpckldq mm7,mm3		; mm7=(00 10 20 30)
+	punpckhdq mm1,mm3		; mm1=(01 11 21 31)
+	movq      mm5,mm2		; transpose coefficients(phase 2)
+	punpckldq mm2,mm4		; mm2=(02 12 22 32)
+	punpckhdq mm5,mm4		; mm5=(03 13 23 33)
+
+	movq	mm3, MMWORD [wk(4)]	; mm3=(60 70 61 71)
+	movq	mm4, MMWORD [wk(5)]	; mm4=(62 72 63 73)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpckldq mm0,mm3		; mm0=(40 50 60 70)
+	punpckhdq mm7,mm3		; mm7=(41 51 61 71)
+	movq      mm1,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm4		; mm6=(42 52 62 72)
+	punpckhdq mm1,mm4		; mm1=(43 53 63 73)
+
+	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+	movq	MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; eax = original ebp (argument base)
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movq      mm4,mm1		; mm1=in2=z2
+	movq      mm5,mm1
+	punpcklwd mm4,mm3		; mm3=in6=z3
+	punpckhwd mm5,mm3
+	movq      mm1,mm4
+	movq      mm3,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]	; mm4=tmp3L
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]	; mm5=tmp3H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]	; mm1=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]	; mm3=tmp2H
+
+	movq      mm6,mm0
+	paddw     mm0,mm2		; mm0=in0+in4
+	psubw     mm6,mm2		; mm6=in0-in4
+
+	pxor      mm7,mm7
+	pxor      mm2,mm2
+	punpcklwd mm7,mm0		; mm7=tmp0L
+	punpckhwd mm2,mm0		; mm2=tmp0H
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+	psrad     mm2,(16-CONST_BITS)	; psrad mm2,16 & pslld mm2,CONST_BITS
+
+	movq	mm0,mm7
+	paddd	mm7,mm4			; mm7=tmp10L
+	psubd	mm0,mm4			; mm0=tmp13L
+	movq	mm4,mm2
+	paddd	mm2,mm5			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp13H
+
+	movq	MMWORD [wk(0)], mm7	; wk(0)=tmp10L
+	movq	MMWORD [wk(1)], mm2	; wk(1)=tmp10H
+	movq	MMWORD [wk(2)], mm0	; wk(2)=tmp13L
+	movq	MMWORD [wk(3)], mm4	; wk(3)=tmp13H
+
+	pxor      mm5,mm5
+	pxor      mm7,mm7
+	punpcklwd mm5,mm6		; mm5=tmp1L
+	punpckhwd mm7,mm6		; mm7=tmp1H
+	psrad     mm5,(16-CONST_BITS)	; psrad mm5,16 & pslld mm5,CONST_BITS
+	psrad     mm7,(16-CONST_BITS)	; psrad mm7,16 & pslld mm7,CONST_BITS
+
+	movq	mm2,mm5
+	paddd	mm5,mm1			; mm5=tmp11L
+	psubd	mm2,mm1			; mm2=tmp12L
+	movq	mm0,mm7
+	paddd	mm7,mm3			; mm7=tmp11H
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	movq	MMWORD [wk(4)], mm5	; wk(4)=tmp11L
+	movq	MMWORD [wk(5)], mm7	; wk(5)=tmp11H
+	movq	MMWORD [wk(6)], mm2	; wk(6)=tmp12L
+	movq	MMWORD [wk(7)], mm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movq	mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq	mm5,mm6
+	movq	mm7,mm4
+	paddw	mm5,mm3			; mm5=z3
+	paddw	mm7,mm1			; mm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movq      mm2,mm5		; mm5=z3
+	movq      mm0,mm5
+	punpcklwd mm2,mm7		; mm7=z4
+	punpckhwd mm0,mm7
+	movq      mm5,mm2
+	movq      mm7,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]	; mm2=z3L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]	; mm0=z3H
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]	; mm5=z4L
+	pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]	; mm7=z4H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=z3L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movq      mm2,mm3		; mm3=in7=tmp0
+	movq      mm0,mm3
+	punpcklwd mm2,mm4		; mm4=in1=tmp3
+	punpckhwd mm0,mm4
+	movq      mm3,mm2
+	movq      mm4,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]	; mm2=tmp0L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]	; mm0=tmp0H
+	pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]	; mm3=tmp3L
+	pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]	; mm4=tmp3H
+
+	paddd	mm2, MMWORD [wk(10)]	; mm2=tmp0L
+	paddd	mm0, MMWORD [wk(11)]	; mm0=tmp0H
+	paddd	mm3,mm5			; mm3=tmp3L
+	paddd	mm4,mm7			; mm4=tmp3H
+
+	movq	MMWORD [wk(8)], mm2	; wk(8)=tmp0L
+	movq	MMWORD [wk(9)], mm0	; wk(9)=tmp0H
+
+	movq      mm2,mm1		; mm1=in5=tmp1
+	movq      mm0,mm1
+	punpcklwd mm2,mm6		; mm6=in3=tmp2
+	punpckhwd mm0,mm6
+	movq      mm1,mm2
+	movq      mm6,mm0
+	pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]	; mm2=tmp1L
+	pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]	; mm0=tmp1H
+	pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]	; mm1=tmp2L
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]	; mm6=tmp2H
+
+	paddd	mm2,mm5			; mm2=tmp1L
+	paddd	mm0,mm7			; mm0=tmp1H
+	paddd	mm1, MMWORD [wk(10)]	; mm1=tmp2L
+	paddd	mm6, MMWORD [wk(11)]	; mm6=tmp2H
+
+	movq	MMWORD [wk(10)], mm2	; wk(10)=tmp1L
+	movq	MMWORD [wk(11)], mm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movq	mm5, MMWORD [wk(0)]	; mm5=tmp10L
+	movq	mm7, MMWORD [wk(1)]	; mm7=tmp10H
+
+	movq	mm2,mm5
+	movq	mm0,mm7
+	paddd	mm5,mm3			; mm5=data0L
+	paddd	mm7,mm4			; mm7=data0H
+	psubd	mm2,mm3			; mm2=data7L
+	psubd	mm0,mm4			; mm0=data7H
+
+	movq	mm3,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm3=[PD_DESCALE_P2]
+
+	paddd	mm5,mm3
+	paddd	mm7,mm3
+	psrad	mm5,DESCALE_P2
+	psrad	mm7,DESCALE_P2
+	paddd	mm2,mm3
+	paddd	mm0,mm3
+	psrad	mm2,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm5,mm7		; mm5=data0=(00 10 20 30)
+	packssdw  mm2,mm0		; mm2=data7=(07 17 27 37)
+
+	movq	mm4, MMWORD [wk(4)]	; mm4=tmp11L
+	movq	mm3, MMWORD [wk(5)]	; mm3=tmp11H
+
+	movq	mm7,mm4
+	movq	mm0,mm3
+	paddd	mm4,mm1			; mm4=data1L
+	paddd	mm3,mm6			; mm3=data1H
+	psubd	mm7,mm1			; mm7=data6L
+	psubd	mm0,mm6			; mm0=data6H
+
+	movq	mm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm1=[PD_DESCALE_P2]
+
+	paddd	mm4,mm1
+	paddd	mm3,mm1
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm7,mm1
+	paddd	mm0,mm1
+	psrad	mm7,DESCALE_P2
+	psrad	mm0,DESCALE_P2
+
+	packssdw  mm4,mm3		; mm4=data1=(01 11 21 31)
+	packssdw  mm7,mm0		; mm7=data6=(06 16 26 36)
+
+	packsswb  mm5,mm7		; mm5=(00 10 20 30 06 16 26 36)
+	packsswb  mm4,mm2		; mm4=(01 11 21 31 07 17 27 37)
+
+	movq	mm6, MMWORD [wk(6)]	; mm6=tmp12L
+	movq	mm1, MMWORD [wk(7)]	; mm1=tmp12H
+	movq	mm3, MMWORD [wk(10)]	; mm3=tmp1L
+	movq	mm0, MMWORD [wk(11)]	; mm0=tmp1H
+
+	movq	MMWORD [wk(0)], mm5	; wk(0)=(00 10 20 30 06 16 26 36)
+	movq	MMWORD [wk(1)], mm4	; wk(1)=(01 11 21 31 07 17 27 37)
+
+	movq	mm7,mm6
+	movq	mm2,mm1
+	paddd	mm6,mm3			; mm6=data2L
+	paddd	mm1,mm0			; mm1=data2H
+	psubd	mm7,mm3			; mm7=data5L
+	psubd	mm2,mm0			; mm2=data5H
+
+	movq	mm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm5=[PD_DESCALE_P2]
+
+	paddd	mm6,mm5
+	paddd	mm1,mm5
+	psrad	mm6,DESCALE_P2
+	psrad	mm1,DESCALE_P2
+	paddd	mm7,mm5
+	paddd	mm2,mm5
+	psrad	mm7,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	packssdw  mm6,mm1		; mm6=data2=(02 12 22 32)
+	packssdw  mm7,mm2		; mm7=data5=(05 15 25 35)
+
+	movq	mm4, MMWORD [wk(2)]	; mm4=tmp13L
+	movq	mm3, MMWORD [wk(3)]	; mm3=tmp13H
+	movq	mm0, MMWORD [wk(8)]	; mm0=tmp0L
+	movq	mm5, MMWORD [wk(9)]	; mm5=tmp0H
+
+	movq	mm1,mm4
+	movq	mm2,mm3
+	paddd	mm4,mm0			; mm4=data3L
+	paddd	mm3,mm5			; mm3=data3H
+	psubd	mm1,mm0			; mm1=data4L
+	psubd	mm2,mm5			; mm2=data4H
+
+	movq	mm0,[GOTOFF(ebx,PD_DESCALE_P2)]	; mm0=[PD_DESCALE_P2]
+
+	paddd	mm4,mm0
+	paddd	mm3,mm0
+	psrad	mm4,DESCALE_P2
+	psrad	mm3,DESCALE_P2
+	paddd	mm1,mm0
+	paddd	mm2,mm0
+	psrad	mm1,DESCALE_P2
+	psrad	mm2,DESCALE_P2
+
+	movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm5=[PB_CENTERJSAMP]
+
+	packssdw  mm4,mm3		; mm4=data3=(03 13 23 33)
+	packssdw  mm1,mm2		; mm1=data4=(04 14 24 34)
+
+	movq      mm0, MMWORD [wk(0)]	; mm0=(00 10 20 30 06 16 26 36)
+	movq      mm3, MMWORD [wk(1)]	; mm3=(01 11 21 31 07 17 27 37)
+
+	packsswb  mm6,mm1		; mm6=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm7		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm5		; add CENTERJSAMPLE to every packed byte
+	paddb     mm3,mm5
+	paddb     mm6,mm5
+	paddb     mm4,mm5
+
+	movq      mm2,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm3		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm2,mm3		; mm2=(06 07 16 17 26 27 36 37)
+	movq      mm1,mm6		; transpose coefficients(phase 1)
+	punpcklbw mm6,mm4		; mm6=(02 03 12 13 22 23 32 33)
+	punpckhbw mm1,mm4		; mm1=(04 05 14 15 24 25 34 35)
+
+	movq      mm7,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm6		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm7,mm6		; mm7=(20 21 22 23 30 31 32 33)
+	movq      mm5,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm2		; mm1=(04 05 06 07 14 15 16 17)
+	punpckhwd mm5,mm2		; mm5=(24 25 26 27 34 35 36 37)
+
+	movq      mm3,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm1		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm3,mm1		; mm3=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm7		; transpose coefficients(phase 3)
+	punpckldq mm7,mm5		; mm7=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm5		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; row pointers 0 and 1 of this group
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; (ebx is used as scratch here)
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0	; store one mmword at row + output_col
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; row pointers 2 and 3 of this group
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_JCOEF	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW	; advance to the next 4 row pointers
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jimmxred.asm b/jimmxred.asm
new file mode 100644
index 0000000..491fa7b
--- /dev/null
+++ b/jimmxred.asm
@@ -0,0 +1,719 @@
+;
+; jimmxred.asm - reduced-size IDCT (MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+%ifdef JIDCT_INT_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+; Assembly stops with the error below for any other DCTSIZE.
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+; Fixed-point parameters and multiplier constants for the reduced-size IDCTs.
+%define CONST_BITS	13	; fraction bits of the F_x_xxx multipliers
+%define PASS1_BITS	2	; extra fraction bits kept in pass 1 results
+; Right-shift (descale) counts applied at the end of each pass.
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)	; 4x4 IDCT, pass 1
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)	; 4x4 IDCT, pass 2
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)	; 2x2 IDCT, pass 1
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)	; 2x2 IDCT, pass 2
+; FIX(x) = x * 2^CONST_BITS rounded to nearest; precomputed for CONST_BITS 13.
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; rounding right shift; the integers below are x * 2^30
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_red_mmx)
+; Constant table referenced by jpeg_idct_4x4_mmx and jpeg_idct_2x2_mmx.
+EXTN(jconst_idct_red_mmx):
+; PW_* hold (w0,w1) word pairs for pmaddwd: each dword lane = a*w0 + b*w1.
+PW_F184_MF076	times 2 dw  F_1_847,-F_0_765	; 4x4 even part: (in2,in6) -> tmp2
+PW_F256_F089	times 2 dw  F_2_562, F_0_899	; 4x4 odd part:  (in1,in3) -> tmp2
+PW_F106_MF217	times 2 dw  F_1_061,-F_2_172	; 4x4 odd part:  (in1,in3) -> tmp0
+PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509	; 4x4 odd part:  (in5,in7) -> tmp2
+PW_F145_MF021	times 2 dw  F_1_451,-F_0_211	; 4x4 odd part:  (in5,in7) -> tmp0
+PW_F362_MF127	times 2 dw  F_3_624,-F_1_272	; 2x2 odd part:  (in1,in3) -> tmp0
+PW_F085_MF072	times 2 dw  F_0_850,-F_0_720	; 2x2 odd part:  (in5,in7) -> tmp0
+PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)	; rounding bias added before psrad
+PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)	; rounding bias added before psrad
+PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)	; rounding bias added before psrad
+PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)	; rounding bias added before psrad
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; byte bias added to the packed samples
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Arguments are read from the stack through the offset macros below.
+; GLOBAL(void)
+; jpeg_idct_4x4_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                    JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+; Pass 1 handles 4 columns per iteration; pass 2 emits 4 rows of 4 samples.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot holding esp as it was right after push ebp
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
+					; JCOEF workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_4x4_mmx)
+
+EXTN(jpeg_idct_4x4_mmx):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the [original_ebp] slot
+	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
+	mov	[esp],eax			; [original_ebp] = eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
+	pushpic	ebx				; paired with poppic in the epilogue
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]	; (eax set in the prologue is reused here)
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; JCOEF * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	short .columnDCT		; nonzero bits seen: do the full IDCT
+	; OR together MMBLOCK 1,2,3,5,6,7 (index 4 is never read in this routine)
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm0,mm1
+	packsswb mm0,mm0		; words -> bytes; nonzero words stay nonzero
+	movd	eax,mm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	mm0,PASS1_BITS		; same scale as the .columnDCT results
+
+	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
+	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
+	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
+
+	movq      mm1,mm0
+	punpckldq mm0,mm0		; mm0=(00 00 00 00)
+	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
+	movq      mm3,mm2
+	punpckldq mm2,mm2		; mm2=(02 02 02 02)
+	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; interleave (in1,in3) and (in5,in7) so pmaddwd sums both products per lane
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpckhwd mm5,mm1		; mm5=(12 32 13 33)
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3		; mm6=(50 70 51 71)
+	punpckhwd mm7,mm3		; mm7=(52 72 53 73)
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	; in0 goes into the high word of each dword, then is shifted down arithmetically
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
+
+	paddd	mm1,mm6			; add rounding bias, then shift
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P1_4
+	psrad	mm2,DESCALE_P1_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
+	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P1_4
+	psrad	mm0,DESCALE_P1_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P1_4
+	psrad	mm3,DESCALE_P1_4
+
+	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
+	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
+
+	movq      mm6,mm1		; transpose coefficients(phase 1)
+	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
+	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
+	movq      mm7,mm2		; transpose coefficients(phase 1)
+	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
+	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpckldq mm1,mm2		; mm1=(00 10 20 30)
+	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
+	movq      mm3,mm6		; transpose coefficients(phase 2)
+	punpckldq mm6,mm7		; mm6=(02 12 22 32)
+	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
+
+	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
+	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; ---- Pass 2: process rows from work array, store into output array.
+	; Each 16-bit lane of the registers below belongs to one output row.
+	mov	eax, [original_ebp]		; reload frame base (eax is reused in pass 1)
+	lea	esi, [workspace]			; JCOEF * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax=output_col
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	movq      mm4,mm0
+	movq      mm5,mm0
+	punpcklwd mm4,mm1
+	punpckhwd mm5,mm1
+	movq      mm0,mm4
+	movq      mm1,mm5
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
+	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
+	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
+
+	movq      mm6,mm2
+	movq      mm7,mm2
+	punpcklwd mm6,mm3
+	punpckhwd mm7,mm3
+	movq      mm2,mm6
+	movq      mm3,mm7
+	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
+	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
+
+	paddd	mm6,mm4			; mm6=tmp2L
+	paddd	mm7,mm5			; mm7=tmp2H
+	paddd	mm2,mm0			; mm2=tmp0L
+	paddd	mm3,mm1			; mm3=tmp0H
+
+	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
+	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	pxor      mm1,mm1
+	pxor      mm2,mm2
+	punpcklwd mm1,mm4		; mm1=tmp0L
+	punpckhwd mm2,mm4		; mm2=tmp0H
+	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
+	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+	movq      mm3,mm5		; mm5=in2=z2
+	punpcklwd mm5,mm0		; mm0=in6=z3
+	punpckhwd mm3,mm0
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
+	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
+
+	movq	mm4,mm1
+	movq	mm0,mm2
+	paddd	mm1,mm5			; mm1=tmp10L
+	paddd	mm2,mm3			; mm2=tmp10H
+	psubd	mm4,mm5			; mm4=tmp12L
+	psubd	mm0,mm3			; mm0=tmp12H
+
+	; -- Final output stage
+
+	movq	mm5,mm1
+	movq	mm3,mm2
+	paddd	mm1,mm6			; mm1=data0L
+	paddd	mm2,mm7			; mm2=data0H
+	psubd	mm5,mm6			; mm5=data3L
+	psubd	mm3,mm7			; mm3=data3H
+
+	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
+
+	paddd	mm1,mm6
+	paddd	mm2,mm6
+	psrad	mm1,DESCALE_P2_4
+	psrad	mm2,DESCALE_P2_4
+	paddd	mm5,mm6
+	paddd	mm3,mm6
+	psrad	mm5,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
+	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
+
+	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
+	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
+
+	movq	mm2,mm4
+	movq	mm3,mm0
+	paddd	mm4,mm7			; mm4=data1L
+	paddd	mm0,mm6			; mm0=data1H
+	psubd	mm2,mm7			; mm2=data2L
+	psubd	mm3,mm6			; mm3=data2H
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
+
+	paddd	mm4,mm7
+	paddd	mm0,mm7
+	psrad	mm4,DESCALE_P2_4
+	psrad	mm0,DESCALE_P2_4
+	paddd	mm2,mm7
+	paddd	mm3,mm7
+	psrad	mm2,DESCALE_P2_4
+	psrad	mm3,DESCALE_P2_4
+
+	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
+	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
+	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
+	paddb     mm1,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm1		; transpose coefficients(phase 1)
+	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
+
+	movq      mm0,mm1		; transpose coefficients(phase 2)
+	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
+	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; output row 0
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; output row 2
+
+	psrlq	mm1,4*BYTE_BIT		; mm1=(10 11 12 13 -- -- -- --)
+	psrlq	mm0,4*BYTE_BIT		; mm0=(30 31 32 33 -- -- -- --)
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1	; output row 1
+	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0	; output row 3
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only the input coefficients shown in the pass 1 diagram contribute.
+; GLOBAL(void)
+; jpeg_idct_2x2_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                    JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+; Both passes are straight-line code; no workspace buffer is used.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jpeg_idct_2x2_mmx)
+
+EXTN(jpeg_idct_2x2_mmx):
+	push	ebp
+	mov	ebp,esp
+	push	ebx		; GOT address first, scratch for the final store later
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+	pcmpeqd   mm7,mm7		; mm7=all ones
+	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+	movq      mm4,mm0		; mm4=(10 11 ** 13)
+	movq      mm5,mm2		; mm5=(50 51 ** 53)
+	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
+	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
+	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]	; in1*F_3_624 - in3*F_1_272
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]	; in5*F_0_850 - in7*F_0_720
+
+	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
+	pand	mm1,mm7			; mm1=(-- 31 -- 33)
+	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
+	pand	mm3,mm7			; mm3=(-- 71 -- 73)
+	por	mm0,mm1			; mm0=(11 31 13 33)
+	por	mm2,mm3			; mm2=(51 71 53 73)
+	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
+
+	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
+	pand	mm1,mm7			; mm1=(-- 35 -- 37)
+	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
+	pand	mm5,mm7			; mm5=(-- 75 -- 77)
+	por	mm6,mm1			; mm6=(15 35 17 37)
+	por	mm3,mm5			; mm3=(55 75 57 77)
+	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
+	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
+
+	; -- Even part
+
+	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+	movq	mm2,mm1				; mm2=(00 01 ** 03)
+	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
+	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
+
+	pand	mm2,mm7				; mm2=(-- 01 -- 03)
+	pand	mm5,mm7				; mm5=(-- 05 -- 07)
+	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
+	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
+
+	; -- Final output stage
+
+	movq      mm3,mm1
+	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
+	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
+	punpckldq mm1,mm3		; mm1=(A0 B0)
+
+	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
+
+	movq	mm4,mm2
+	movq	mm3,mm5
+	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
+	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
+	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
+	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
+
+	paddd	mm1,mm7			; round and descale (A0 B0)
+	psrad	mm1,DESCALE_P1_2
+
+	paddd	mm2,mm7
+	paddd	mm5,mm7
+	psrad	mm2,DESCALE_P1_2
+	psrad	mm5,DESCALE_P1_2
+	paddd	mm4,mm7
+	paddd	mm3,mm7
+	psrad	mm4,DESCALE_P1_2
+	psrad	mm3,DESCALE_P1_2
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
+	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
+	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
+
+	; -- Even part
+
+	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
+
+	; -- Final output stage
+
+	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
+
+	movq      mm6,mm1
+	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
+	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
+
+	paddd     mm1,mm0
+	paddd     mm6,mm0
+	psrad     mm1,DESCALE_P2_2
+	psrad     mm6,DESCALE_P2_2
+
+	movq      mm7,mm1		; transpose coefficients
+	punpckldq mm1,mm6		; mm1=(C0 D0)
+	punpckhdq mm7,mm6		; mm7=(C1 D1)
+
+	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
+	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
+
+	movd	ecx,mm1			; ecx=(C0 D0 C1 D1)
+	movd	ebx,mm1			; ebx=(C0 D0 C1 D1); GOT address is not used after this
+	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx	; output row 0 <- (C0 D0)
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx	; output row 1 <- (C1 D1)
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_MMX_SUPPORTED
+%endif ; IDCT_SCALING_SUPPORTED
diff --git a/jiss2flt.asm b/jiss2flt.asm
new file mode 100644
index 0000000..c0565a3
--- /dev/null
+++ b/jiss2flt.asm
@@ -0,0 +1,508 @@
+;
+; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+; shufps-based helpers that combine 64-bit halves of two xmm registers.
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; 0x44: low two dwords of each operand
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; 0xEE: high two dwords of each operand
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+; Each multiplier is replicated 4 times, one copy per 32-bit float lane.
+PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2)
+PD_1_847	times 4 dd  1.847759065022573512256366	; multiplier producing z5
+PD_1_082	times 4 dd  1.082392200292393968799446	; multiplier applied to z12
+PD_M2_613	times 4 dd -2.613125929752753055713286	; multiplier applied to z10
+PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_float_sse2)
+
+EXTN(jpeg_idct_float_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	xmm1, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	xmm2, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq	xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq	xmm4, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq	xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq	xmm6, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	movq	xmm7, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm2
+	por	xmm3,xmm4
+	por	xmm5,xmm6
+	por	xmm1,xmm3
+	por	xmm5,xmm7
+	por	xmm1,xmm5
+	packsswb xmm1,xmm1
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      xmm2, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
+	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
+	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
+
+	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
+	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
+
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      xmm2, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      xmm1, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
+	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
+	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
+	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
+	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
+	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
+
+	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
+	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
+	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
+	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
+	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
+	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm3,xmm3
+	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
+	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
+	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
+	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
+
+	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
+	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm7,xmm1
+	movaps	xmm5,xmm3
+	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
+	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
+	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
+	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
+
+	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
+	pcmpeqd	xmm4,xmm4
+	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
+	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
+	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
+	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
+	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
+	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+	paddb     xmm6,xmm2
+	paddb     xmm1,xmm2
+
+	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
+	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+	movq	_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+	movq	_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_SSE_SSE2_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jiss2fst.asm b/jiss2fst.asm
new file mode 100644
index 0000000..937a260
--- /dev/null
+++ b/jiss2fst.asm
@@ -0,0 +1,512 @@
+;
+; jiss2fst.asm - fast integer IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_IFAST_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+; (Both conditionals above are closed by the two %endif lines that end this file.)
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+; Fixed-point precision settings and the multiplier constants derived from them.
+%define CONST_BITS	8	; fraction bits of the F_* constants; 14 is also OK.
+%define PASS1_BITS	2	; pass 2 descales its output by (PASS1_BITS+3) bits
+; IFAST_SCALE_BITS is not defined in this file (it comes from an include); the two must agree.
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082	equ	277		; FIX(1.082392200)
+F_1_414	equ	362		; FIX(1.414213562)
+F_1_847	equ	473		; FIX(1.847759065)
+F_2_613	equ	669		; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - 256)	; FIX(2.613125930) - FIX(1); the code subtracts the remaining 1*z10 separately
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each value is given pre-scaled by 2^30 and DESCALE rounds it to CONST_BITS fraction bits.
+%define	DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082	equ	DESCALE(1162209775,30-CONST_BITS)	; FIX(1.082392200)
+F_1_414	equ	DESCALE(1518500249,30-CONST_BITS)	; FIX(1.414213562)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_613	equ	DESCALE(2805822602,30-CONST_BITS)	; FIX(2.613125930)
+F_1_613	equ	(F_2_613 - (1 << CONST_BITS))	; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+; Constant tables for jpeg_idct_ifast_sse2; the code addresses them through GOTOFF(ebx,...).
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+; i.e. the high word of (x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT) equals (x * F) >> CONST_BITS.
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+	alignz	16
+	global	EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414	times 8 dw  F_1_414 << CONST_SHIFT	; 8 words:  1.414213562, pre-shifted for pmulhw
+PW_F1847	times 8 dw  F_1_847 << CONST_SHIFT	; 8 words:  1.847759065, pre-shifted for pmulhw
+PW_MF1613	times 8 dw -F_1_613 << CONST_SHIFT	; 8 words: -1.613125930, pre-shifted for pmulhw
+PW_F1082	times 8 dw  F_1_082 << CONST_SHIFT	; 8 words:  1.082392200, pre-shifted for pmulhw
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; 16 bytes: bias added to the packed output bytes
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; The result is stored as 8 rows of 8 bytes at output_buf[0..7] + output_col.
+; GLOBAL(void)
+; jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+; (cinfo is not referenced by this routine.)
+; Argument offsets; (b) is the stack pointer value taken right after `push ebp`.
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+; Frame: [original_ebp] holds the pre-alignment stack pointer; wk[] sits just below the aligned ebp.
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+
+	align	16
+	global	EXTN(jpeg_idct_ifast_sse2)
+
+EXTN(jpeg_idct_ifast_sse2):
+	push	ebp				; save caller's ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved stack pointer
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = pre-alignment esp
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below ebp
+	pushpic	ebx				; macro defined outside this file; paired with poppic at exit
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]	; (not needed: eax still holds this value from the prologue)
+	mov	edx, POINTER [compptr(eax)]		; edx = compptr
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; quick test on one dword each of
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; rows 1 and 2 (presumably; DWBLOCK is defined elsewhere)
+	jnz	near .columnDCT				; nonzero AC term: take the full IDCT
+	; -- Thorough test: OR rows 1..7 together; zero means only row 0 carries data
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0		; xmm1=OR of rows 1..7
+	packsswb xmm1,xmm1		; saturating packs keep nonzero lanes nonzero and
+	packsswb xmm1,xmm1		; fold all 8 words into the low dword
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT	; some AC term is nonzero
+
+	; -- AC terms all zero
+	; Each column reduces to its dequantized row-0 value replicated 8 times.
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+	movdqa    xmm7,xmm0		; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm7,xmm7		; xmm7=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm6,xmm0,0x00		; xmm6=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm2,xmm0,0x55		; xmm2=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm5,xmm0,0xAA		; xmm5=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm0,xmm0,0xFF		; xmm0=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm1,xmm7,0x00		; xmm1=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm4,xmm7,0x55		; xmm4=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm3,xmm7,0xAA		; xmm3=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm7,xmm7,0xFF		; xmm7=col7=(07 07 07 07 07 07 07 07)
+	; Same register/wk[] layout as the full path leaves at .column_end.
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=col3
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+	; Every xmm register holds one 8-word row, so all 8 columns are transformed at once.
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm0=in0 (dequantized)
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm1=in2
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm2=in4
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm3=in6
+
+	movdqa	xmm4,xmm0
+	movdqa	xmm5,xmm1
+	psubw	xmm0,xmm2		; xmm0=tmp11
+	psubw	xmm1,xmm3		; xmm1=in2-in6
+	paddw	xmm4,xmm2		; xmm4=tmp10
+	paddw	xmm5,xmm3		; xmm5=tmp13
+
+	psllw	xmm1,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	xmm1,[GOTOFF(ebx,PW_F1414)]	; xmm1=((in2-in6) * 1.414213562)
+	psubw	xmm1,xmm5		; xmm1=tmp12
+
+	movdqa	xmm6,xmm4
+	movdqa	xmm7,xmm0
+	psubw	xmm4,xmm5		; xmm4=tmp3
+	psubw	xmm0,xmm1		; xmm0=tmp2
+	paddw	xmm6,xmm5		; xmm6=tmp0
+	paddw	xmm7,xmm1		; xmm7=tmp1
+
+	movdqa	XMMWORD [wk(1)], xmm4	; wk(1)=tmp3
+	movdqa	XMMWORD [wk(0)], xmm0	; wk(0)=tmp2
+
+	; -- Odd part
+
+	movdqa	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm2=in1 (dequantized)
+	pmullw	xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm3=in3
+	movdqa	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm5=in5
+	pmullw	xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]	; xmm1=in7
+
+	movdqa	xmm4,xmm2
+	movdqa	xmm0,xmm5
+	psubw	xmm2,xmm1		; xmm2=z12
+	psubw	xmm5,xmm3		; xmm5=z10
+	paddw	xmm4,xmm1		; xmm4=z11
+	paddw	xmm0,xmm3		; xmm0=z13
+
+	movdqa	xmm1,xmm5		; xmm1=z10(unscaled)
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS	; xmm2=z12(pre-scaled)
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; xmm5=z10(pre-scaled)
+
+	movdqa	xmm3,xmm4
+	psubw	xmm4,xmm0		; xmm4=z11-z13
+	paddw	xmm3,xmm0		; xmm3=tmp7
+
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1414)]	; xmm4=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm0,xmm5
+	paddw	xmm5,xmm2		; xmm5=z10+z12 (pre-scaled)
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1847)]	; xmm5=z5
+	pmulhw	xmm0,[GOTOFF(ebx,PW_MF1613)]	; xmm0=(z10 * -1.613125930)
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1082)]	; xmm2=(z12 * 1.082392200)
+	psubw	xmm0,xmm1		; xmm0=(z10 * -2.613125930)
+	psubw	xmm2,xmm5		; xmm2=tmp10
+	paddw	xmm0,xmm5		; xmm0=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm0,xmm3		; xmm0=tmp6
+	movdqa	xmm1,xmm6
+	movdqa	xmm5,xmm7
+	paddw	xmm6,xmm3		; xmm6=data0=(00 01 02 03 04 05 06 07)
+	paddw	xmm7,xmm0		; xmm7=data1=(10 11 12 13 14 15 16 17)
+	psubw	xmm1,xmm3		; xmm1=data7=(70 71 72 73 74 75 76 77)
+	psubw	xmm5,xmm0		; xmm5=data6=(60 61 62 63 64 65 66 67)
+	psubw	xmm4,xmm0		; xmm4=tmp5
+
+	movdqa    xmm3,xmm6		; transpose coefficients(phase 1)
+	punpcklwd xmm6,xmm7		; xmm6=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm3,xmm7		; xmm3=(04 14 05 15 06 16 07 17)
+	movdqa    xmm0,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm1		; xmm5=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm0,xmm1		; xmm0=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movdqa	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(64 74 65 75 66 76 67 77)
+
+	paddw	xmm2,xmm4		; xmm2=tmp4
+	movdqa	xmm5,xmm7
+	movdqa	xmm0,xmm1
+	paddw	xmm7,xmm4		; xmm7=data2=(20 21 22 23 24 25 26 27)
+	paddw	xmm1,xmm2		; xmm1=data4=(40 41 42 43 44 45 46 47)
+	psubw	xmm5,xmm4		; xmm5=data5=(50 51 52 53 54 55 56 57)
+	psubw	xmm0,xmm2		; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa    xmm4,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm0		; xmm7=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm0		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm2,xmm1		; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm5		; xmm1=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm2,xmm5		; xmm2=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm0,xmm3		; transpose coefficients(phase 2)
+	punpckldq xmm3,xmm4		; xmm3=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm0,xmm4		; xmm0=(06 16 26 36 07 17 27 37)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7		; xmm6=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm5,xmm7		; xmm5=(02 12 22 32 03 13 23 33)
+
+	movdqa	xmm4, XMMWORD [wk(0)]	; xmm4=(60 70 61 71 62 72 63 73)
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(0)], xmm3	; wk(0)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm3,xmm1		; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm4		; xmm1=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm3,xmm4		; xmm3=(42 52 62 72 43 53 63 73)
+	movdqa    xmm0,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm7		; xmm2=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm0,xmm7		; xmm0=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm4,xmm6		; transpose coefficients(phase 3)
+	punpcklqdq xmm6,xmm1		; xmm6=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm4,xmm1		; xmm4=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm7,xmm5		; transpose coefficients(phase 3)
+	punpcklqdq xmm5,xmm3		; xmm5=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm7,xmm3		; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm1, XMMWORD [wk(0)]	; xmm1=(04 14 24 34 05 15 25 35)
+	movdqa	xmm3, XMMWORD [wk(1)]	; xmm3=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=col1
+	movdqa	XMMWORD [wk(1)], xmm7	; wk(1)=col3
+
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm4,xmm2		; xmm4=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm7,xmm3		; transpose coefficients(phase 3)
+	punpcklqdq xmm3,xmm0		; xmm3=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm7,xmm0		; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+	; Here: xmm6=col0 wk(0)=col1 xmm5=col2 wk(1)=col3 xmm1=col4 xmm4=col5 xmm3=col6 xmm7=col7
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; eax=stack pointer saved by the prologue (argument base)
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax=output_col
+
+	; -- Even part
+
+	; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+	movdqa	xmm2,xmm6
+	movdqa	xmm0,xmm5
+	psubw	xmm6,xmm1		; xmm6=tmp11
+	psubw	xmm5,xmm3		; xmm5=col2-col6
+	paddw	xmm2,xmm1		; xmm2=tmp10
+	paddw	xmm0,xmm3		; xmm0=tmp13
+
+	psllw	xmm5,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	xmm5,[GOTOFF(ebx,PW_F1414)]	; xmm5=((col2-col6) * 1.414213562)
+	psubw	xmm5,xmm0		; xmm5=tmp12
+
+	movdqa	xmm1,xmm2
+	movdqa	xmm3,xmm6
+	psubw	xmm2,xmm0		; xmm2=tmp3
+	psubw	xmm6,xmm5		; xmm6=tmp2
+	paddw	xmm1,xmm0		; xmm1=tmp0
+	paddw	xmm3,xmm5		; xmm3=tmp1
+
+	movdqa	xmm0, XMMWORD [wk(0)]	; xmm0=col1
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=col3
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp3
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=tmp2
+
+	; -- Odd part
+
+	; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+	movdqa	xmm2,xmm0
+	movdqa	xmm6,xmm4
+	psubw	xmm0,xmm7		; xmm0=z12
+	psubw	xmm4,xmm5		; xmm4=z10
+	paddw	xmm2,xmm7		; xmm2=z11
+	paddw	xmm6,xmm5		; xmm6=z13
+
+	movdqa	xmm7,xmm4		; xmm7=z10(unscaled)
+	psllw	xmm0,PRE_MULTIPLY_SCALE_BITS	; xmm0=z12(pre-scaled)
+	psllw	xmm4,PRE_MULTIPLY_SCALE_BITS	; xmm4=z10(pre-scaled)
+
+	movdqa	xmm5,xmm2
+	psubw	xmm2,xmm6		; xmm2=z11-z13
+	paddw	xmm5,xmm6		; xmm5=tmp7
+
+	psllw	xmm2,PRE_MULTIPLY_SCALE_BITS	; pre-scale for pmulhw
+	pmulhw	xmm2,[GOTOFF(ebx,PW_F1414)]	; xmm2=tmp11
+
+	; To avoid overflow...
+	;
+	; (Original)
+	; tmp12 = -2.613125930 * z10 + z5;
+	;
+	; (This implementation)
+	; tmp12 = (-1.613125930 - 1) * z10 + z5;
+	;       = -1.613125930 * z10 - z10 + z5;
+
+	movdqa	xmm6,xmm4
+	paddw	xmm4,xmm0		; xmm4=z10+z12 (pre-scaled)
+	pmulhw	xmm4,[GOTOFF(ebx,PW_F1847)]	; xmm4=z5
+	pmulhw	xmm6,[GOTOFF(ebx,PW_MF1613)]	; xmm6=(z10 * -1.613125930)
+	pmulhw	xmm0,[GOTOFF(ebx,PW_F1082)]	; xmm0=(z12 * 1.082392200)
+	psubw	xmm6,xmm7		; xmm6=(z10 * -2.613125930)
+	psubw	xmm0,xmm4		; xmm0=tmp10
+	paddw	xmm6,xmm4		; xmm6=tmp12
+
+	; -- Final output stage
+
+	psubw	xmm6,xmm5		; xmm6=tmp6
+	movdqa	xmm7,xmm1
+	movdqa	xmm4,xmm3
+	paddw	xmm1,xmm5		; xmm1=data0=(00 10 20 30 40 50 60 70)
+	paddw	xmm3,xmm6		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	psraw	xmm1,(PASS1_BITS+3)	; descale
+	psraw	xmm3,(PASS1_BITS+3)	; descale
+	psubw	xmm7,xmm5		; xmm7=data7=(07 17 27 37 47 57 67 77)
+	psubw	xmm4,xmm6		; xmm4=data6=(06 16 26 36 46 56 66 76)
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psubw	xmm2,xmm6		; xmm2=tmp5
+
+	packsswb  xmm1,xmm4	; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm7	; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm5, XMMWORD [wk(1)]	; xmm5=tmp2
+	movdqa	xmm6, XMMWORD [wk(0)]	; xmm6=tmp3
+
+	paddw	xmm0,xmm2		; xmm0=tmp4
+	movdqa	xmm4,xmm5
+	movdqa	xmm7,xmm6
+	paddw	xmm5,xmm2		; xmm5=data2=(02 12 22 32 42 52 62 72)
+	paddw	xmm6,xmm0		; xmm6=data4=(04 14 24 34 44 54 64 74)
+	psraw	xmm5,(PASS1_BITS+3)	; descale
+	psraw	xmm6,(PASS1_BITS+3)	; descale
+	psubw	xmm4,xmm2		; xmm4=data5=(05 15 25 35 45 55 65 75)
+	psubw	xmm7,xmm0		; xmm7=data3=(03 13 23 33 43 53 63 73)
+	psraw	xmm4,(PASS1_BITS+3)	; descale
+	psraw	xmm7,(PASS1_BITS+3)	; descale
+
+	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
+
+	packsswb  xmm5,xmm6	; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm7,xmm4	; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm1,xmm2	; add CENTERJSAMPLE to every packed byte
+	paddb     xmm3,xmm2
+	paddb     xmm5,xmm2
+	paddb     xmm7,xmm2
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 1)
+	punpcklbw xmm1,xmm3	; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm3	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm6,xmm5	; transpose coefficients(phase 1)
+	punpcklbw xmm5,xmm7	; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm6,xmm7	; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm1	; transpose coefficients(phase 2)
+	punpcklwd xmm1,xmm5	; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm5	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 2)
+	punpcklwd xmm6,xmm0	; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm2,xmm0	; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm3,xmm1	; transpose coefficients(phase 3)
+	punpckldq xmm1,xmm6	; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm3,xmm6	; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm7,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm2	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm7,xmm2	; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm5,xmm1,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm3,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm6,xmm4,0x4E	; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm2,xmm7,0x4E	; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+	; Each movq below writes the low 8 bytes (one output row) at output_buf[r] + output_col.
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; esi (inptr) is reused as a row pointer from here on
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1	; row 0
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3	; row 2
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4	; row 4
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7	; row 6
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5	; row 1
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0	; row 3
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6	; row 5
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2	; row 7
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_IFAST_SUPPORTED
diff --git a/jiss2int.asm b/jiss2int.asm
new file mode 100644
index 0000000..b0e7109
--- /dev/null
+++ b/jiss2int.asm
@@ -0,0 +1,869 @@
+;
+; jiss2int.asm - accurate integer IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler);
+; it can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see jidctint.c for
+; more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_ISLOW_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13		; fraction bits of the F_x_xxx constants
+%define PASS1_BITS	2		; extra fraction bits kept after pass 1
+
+%define DESCALE_P1	(CONST_BITS-PASS1_BITS)		; pass-1 right shift
+%define DESCALE_P2	(CONST_BITS+PASS1_BITS+3)	; pass-2 right shift
+
+%if CONST_BITS == 13
+F_0_298	equ	 2446		; FIX(0.298631336)
+F_0_390	equ	 3196		; FIX(0.390180644)
+F_0_541	equ	 4433		; FIX(0.541196100)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_175	equ	 9633		; FIX(1.175875602)
+F_1_501	equ	12299		; FIX(1.501321110)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_1_961	equ	16069		; FIX(1.961570560)
+F_2_053	equ	16819		; FIX(2.053119869)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_072	equ	25172		; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so integers scaled by 2^30 are used.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x >> n, adding half an LSB first
+F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
+F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
+F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
+F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
+F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+	global	EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054	times 4 dw  (F_0_541+F_0_765), F_0_541	; pmaddwd on (z2,z3) -> tmp3
+PW_F054_MF130	times 4 dw  F_0_541, (F_0_541-F_1_847)	; pmaddwd on (z2,z3) -> tmp2
+PW_MF078_F117	times 4 dw  (F_1_175-F_1_961), F_1_175	; pmaddwd on (z3,z4) -> z3
+PW_F117_F078	times 4 dw  F_1_175, (F_1_175-F_0_390)	; pmaddwd on (z3,z4) -> z4
+PW_MF060_MF089	times 4 dw  (F_0_298-F_0_899),-F_0_899	; pmaddwd on (tmp0,tmp3) -> tmp0
+PW_MF089_F060	times 4 dw -F_0_899, (F_1_501-F_0_899)	; pmaddwd on (tmp0,tmp3) -> tmp3
+PW_MF050_MF256	times 4 dw  (F_2_053-F_2_562),-F_2_562	; pmaddwd on (tmp1,tmp2) -> tmp1
+PW_MF256_F050	times 4 dw -F_2_562, (F_3_072-F_2_562)	; pmaddwd on (tmp1,tmp2) -> tmp2
+PD_DESCALE_P1	times 4 dd  1 << (DESCALE_P1-1)	; rounding bias added before psrad DESCALE_P1
+PD_DESCALE_P2	times 4 dd  1 << (DESCALE_P2-1)	; rounding bias added before psrad DESCALE_P2
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; byte offset added (paddb) to the packed pass-2 output
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+; Pass 1 processes columns of coef_block; pass 2 processes rows and stores 8x8 samples via output_buf.
+; GLOBAL(void)
+; jpeg_idct_islow_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                       JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stack arguments are addressed through the macros below, relative to the frame base b.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot where the prologue stores the frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		12		; scratch slots wk(0)..wk(11)
+
+	align	16
+	global	EXTN(jpeg_idct_islow_sse2)
+
+EXTN(jpeg_idct_islow_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp (frame base: esp after the push)
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below the aligned ebp
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]	; omitted: the loads below use eax as set in the prologue
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; cheap pre-test on two dwords
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT		; nonzero -> full column DCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm1,xmm0		; xmm1 = OR of XMMBLOCK 1..7
+	packsswb xmm1,xmm1		; words -> bytes; signed saturation keeps nonzero nonzero
+	packsswb xmm1,xmm1		; again, so the low dword reflects all 8 words
+	movd	eax,xmm1
+	test	eax,eax
+	jnz	short .columnDCT	; nonzero -> full column DCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm5,PASS1_BITS		; in0 << PASS1_BITS
+
+	movdqa    xmm4,xmm5		; xmm5=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm5,xmm5		; xmm5=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm4,xmm4		; xmm4=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm7,xmm5,0x00		; xmm7=col0=(00 00 00 00 00 00 00 00)
+	pshufd	xmm6,xmm5,0x55		; xmm6=col1=(01 01 01 01 01 01 01 01)
+	pshufd	xmm1,xmm5,0xAA		; xmm1=col2=(02 02 02 02 02 02 02 02)
+	pshufd	xmm5,xmm5,0xFF		; xmm5=col3=(03 03 03 03 03 03 03 03)
+	pshufd	xmm0,xmm4,0x00		; xmm0=col4=(04 04 04 04 04 04 04 04)
+	pshufd	xmm3,xmm4,0x55		; xmm3=col5=(05 05 05 05 05 05 05 05)
+	pshufd	xmm2,xmm4,0xAA		; xmm2=col6=(06 06 06 06 06 06 06 06)
+	pshufd	xmm4,xmm4,0xFF		; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+	movdqa	XMMWORD [wk(8)], xmm6	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm5	; wk(9)=col3
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+	jmp	near .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm0=in0 (coef * quantptr entry)
+	pmullw	xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm1=in2
+	movdqa	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm2=in4
+	pmullw	xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm3=in6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm4,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm4,xmm3		; xmm3=in6=z3
+	punpckhwd xmm5,xmm3
+	movdqa    xmm1,xmm4
+	movdqa    xmm3,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]	; xmm4=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]	; xmm3=tmp2H
+
+	movdqa    xmm6,xmm0
+	paddw     xmm0,xmm2		; xmm0=in0+in4
+	psubw     xmm6,xmm2		; xmm6=in0-in4
+
+	pxor      xmm7,xmm7
+	pxor      xmm2,xmm2
+	punpcklwd xmm7,xmm0		; xmm7=tmp0L
+	punpckhwd xmm2,xmm0		; xmm2=tmp0H
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+	psrad     xmm2,(16-CONST_BITS)	; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm4		; xmm7=tmp10L
+	psubd	xmm0,xmm4		; xmm0=tmp13L
+	movdqa	xmm4,xmm2
+	paddd	xmm2,xmm5		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm7	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm2	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm0	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm4	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm7,xmm7
+	punpcklwd xmm5,xmm6		; xmm5=tmp1L
+	punpckhwd xmm7,xmm6		; xmm7=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm7,(16-CONST_BITS)	; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+	movdqa	xmm2,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm2,xmm1		; xmm2=tmp12L
+	movdqa	xmm0,xmm7
+	paddd	xmm7,xmm3		; xmm7=tmp11H
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm7	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm0	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm4=in1 (tmp3)
+	pmullw	xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm6=in3 (tmp2)
+	movdqa	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm1=in5 (tmp1)
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm3=in7 (tmp0)
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm7,xmm4
+	paddw	xmm5,xmm3		; xmm5=z3
+	paddw	xmm7,xmm1		; xmm7=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm2,xmm5
+	movdqa    xmm0,xmm5
+	punpcklwd xmm2,xmm7
+	punpckhwd xmm0,xmm7
+	movdqa    xmm5,xmm2
+	movdqa    xmm7,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]	; xmm2=z3L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]	; xmm7=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm2,xmm3
+	movdqa    xmm0,xmm3
+	punpcklwd xmm2,xmm4
+	punpckhwd xmm0,xmm4
+	movdqa    xmm3,xmm2
+	movdqa    xmm4,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm2=tmp0L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0H
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]	; xmm4=tmp3H
+
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp0L
+	paddd	xmm0, XMMWORD [wk(11)]	; xmm0=tmp0H
+	paddd	xmm3,xmm5		; xmm3=tmp3L
+	paddd	xmm4,xmm7		; xmm4=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm2	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm0	; wk(9)=tmp0H
+
+	movdqa    xmm2,xmm1
+	movdqa    xmm0,xmm1
+	punpcklwd xmm2,xmm6
+	punpckhwd xmm0,xmm6
+	movdqa    xmm1,xmm2
+	movdqa    xmm6,xmm0
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm2=tmp1L
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]	; xmm1=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm2,xmm5		; xmm2=tmp1L
+	paddd	xmm0,xmm7		; xmm0=tmp1H
+	paddd	xmm1, XMMWORD [wk(10)]	; xmm1=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm2	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm0	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm7, XMMWORD [wk(1)]	; xmm7=tmp10H
+
+	movdqa	xmm2,xmm5
+	movdqa	xmm0,xmm7
+	paddd	xmm5,xmm3		; xmm5=data0L
+	paddd	xmm7,xmm4		; xmm7=data0H
+	psubd	xmm2,xmm3		; xmm2=data7L
+	psubd	xmm0,xmm4		; xmm0=data7H
+
+	movdqa	xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm3=[PD_DESCALE_P1]
+
+	paddd	xmm5,xmm3
+	paddd	xmm7,xmm3
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm7,DESCALE_P1
+	paddd	xmm2,xmm3
+	paddd	xmm0,xmm3
+	psrad	xmm2,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm5,xmm7		; xmm5=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm2,xmm0		; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+	movdqa	xmm4, XMMWORD [wk(4)]	; xmm4=tmp11L
+	movdqa	xmm3, XMMWORD [wk(5)]	; xmm3=tmp11H
+
+	movdqa	xmm7,xmm4
+	movdqa	xmm0,xmm3
+	paddd	xmm4,xmm1		; xmm4=data1L
+	paddd	xmm3,xmm6		; xmm3=data1H
+	psubd	xmm7,xmm1		; xmm7=data6L
+	psubd	xmm0,xmm6		; xmm0=data6H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm1=[PD_DESCALE_P1]
+
+	paddd	xmm4,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm4,DESCALE_P1
+	psrad	xmm3,DESCALE_P1
+	paddd	xmm7,xmm1
+	paddd	xmm0,xmm1
+	psrad	xmm7,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+
+	packssdw  xmm4,xmm3		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm7,xmm0		; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+	movdqa    xmm6,xmm5		; transpose coefficients(phase 1)
+	punpcklwd xmm5,xmm4		; xmm5=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4		; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 1)
+	punpcklwd xmm7,xmm2		; xmm7=(60 70 61 71 62 72 63 73)
+	punpckhwd xmm1,xmm2		; xmm1=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(6)]	; xmm3=tmp12L
+	movdqa	xmm0, XMMWORD [wk(7)]	; xmm0=tmp12H
+	movdqa	xmm4, XMMWORD [wk(10)]	; xmm4=tmp1L
+	movdqa	xmm2, XMMWORD [wk(11)]	; xmm2=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 01 11 02 12 03 13)
+	movdqa	XMMWORD [wk(1)], xmm6	; wk(1)=(04 14 05 15 06 16 07 17)
+	movdqa	XMMWORD [wk(4)], xmm7	; wk(4)=(60 70 61 71 62 72 63 73)
+	movdqa	XMMWORD [wk(5)], xmm1	; wk(5)=(64 74 65 75 66 76 67 77)
+
+	movdqa	xmm5,xmm3
+	movdqa	xmm6,xmm0
+	paddd	xmm3,xmm4		; xmm3=data2L
+	paddd	xmm0,xmm2		; xmm0=data2H
+	psubd	xmm5,xmm4		; xmm5=data5L
+	psubd	xmm6,xmm2		; xmm6=data5H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm7=[PD_DESCALE_P1]
+
+	paddd	xmm3,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm3,DESCALE_P1
+	psrad	xmm0,DESCALE_P1
+	paddd	xmm5,xmm7
+	paddd	xmm6,xmm7
+	psrad	xmm5,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm3,xmm0		; xmm3=data2=(20 21 22 23 24 25 26 27)
+	packssdw  xmm5,xmm6		; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+	movdqa	xmm1, XMMWORD [wk(2)]	; xmm1=tmp13L
+	movdqa	xmm4, XMMWORD [wk(3)]	; xmm4=tmp13H
+	movdqa	xmm2, XMMWORD [wk(8)]	; xmm2=tmp0L
+	movdqa	xmm7, XMMWORD [wk(9)]	; xmm7=tmp0H
+
+	movdqa	xmm0,xmm1
+	movdqa	xmm6,xmm4
+	paddd	xmm1,xmm2		; xmm1=data3L
+	paddd	xmm4,xmm7		; xmm4=data3H
+	psubd	xmm0,xmm2		; xmm0=data4L
+	psubd	xmm6,xmm7		; xmm6=data4H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]	; xmm2=[PD_DESCALE_P1]
+
+	paddd	xmm1,xmm2
+	paddd	xmm4,xmm2
+	psrad	xmm1,DESCALE_P1
+	psrad	xmm4,DESCALE_P1
+	paddd	xmm0,xmm2
+	paddd	xmm6,xmm2
+	psrad	xmm0,DESCALE_P1
+	psrad	xmm6,DESCALE_P1
+
+	packssdw  xmm1,xmm4		; xmm1=data3=(30 31 32 33 34 35 36 37)
+	packssdw  xmm0,xmm6		; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 01 11 02 12 03 13)
+	movdqa	xmm2, XMMWORD [wk(1)]	; xmm2=(04 14 05 15 06 16 07 17)
+
+	movdqa    xmm4,xmm3		; transpose coefficients(phase 1)
+	punpcklwd xmm3,xmm1		; xmm3=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm4,xmm1		; xmm4=(24 34 25 35 26 36 27 37)
+	movdqa    xmm6,xmm0		; transpose coefficients(phase 1)
+	punpcklwd xmm0,xmm5		; xmm0=(40 50 41 51 42 52 43 53)
+	punpckhwd xmm6,xmm5		; xmm6=(44 54 45 55 46 56 47 57)
+
+	movdqa    xmm1,xmm7		; transpose coefficients(phase 2)
+	punpckldq xmm7,xmm3		; xmm7=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm1,xmm3		; xmm1=(02 12 22 32 03 13 23 33)
+	movdqa    xmm5,xmm2		; transpose coefficients(phase 2)
+	punpckldq xmm2,xmm4		; xmm2=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm5,xmm4		; xmm5=(06 16 26 36 07 17 27 37)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=(60 70 61 71 62 72 63 73)
+	movdqa	xmm4, XMMWORD [wk(5)]	; xmm4=(64 74 65 75 66 76 67 77)
+
+	movdqa	XMMWORD [wk(6)], xmm2	; wk(6)=(04 14 24 34 05 15 25 35)
+	movdqa	XMMWORD [wk(7)], xmm5	; wk(7)=(06 16 26 36 07 17 27 37)
+
+	movdqa    xmm2,xmm0		; transpose coefficients(phase 2)
+	punpckldq xmm0,xmm3		; xmm0=(40 50 60 70 41 51 61 71)
+	punpckhdq xmm2,xmm3		; xmm2=(42 52 62 72 43 53 63 73)
+	movdqa    xmm5,xmm6		; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm4		; xmm6=(44 54 64 74 45 55 65 75)
+	punpckhdq xmm5,xmm4		; xmm5=(46 56 66 76 47 57 67 77)
+
+	movdqa     xmm3,xmm7		; transpose coefficients(phase 3)
+	punpcklqdq xmm7,xmm0		; xmm7=col0=(00 10 20 30 40 50 60 70)
+	punpckhqdq xmm3,xmm0		; xmm3=col1=(01 11 21 31 41 51 61 71)
+	movdqa     xmm4,xmm1		; transpose coefficients(phase 3)
+	punpcklqdq xmm1,xmm2		; xmm1=col2=(02 12 22 32 42 52 62 72)
+	punpckhqdq xmm4,xmm2		; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+	movdqa	xmm0, XMMWORD [wk(6)]	; xmm0=(04 14 24 34 05 15 25 35)
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=(06 16 26 36 07 17 27 37)
+
+	movdqa	XMMWORD [wk(8)], xmm3	; wk(8)=col1
+	movdqa	XMMWORD [wk(9)], xmm4	; wk(9)=col3
+
+	movdqa     xmm3,xmm0		; transpose coefficients(phase 3)
+	punpcklqdq xmm0,xmm6		; xmm0=col4=(04 14 24 34 44 54 64 74)
+	punpckhqdq xmm3,xmm6		; xmm3=col5=(05 15 25 35 45 55 65 75)
+	movdqa     xmm4,xmm2		; transpose coefficients(phase 3)
+	punpcklqdq xmm2,xmm5		; xmm2=col6=(06 16 26 36 46 56 66 76)
+	punpckhqdq xmm4,xmm5		; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+	movdqa	XMMWORD [wk(10)], xmm3	; wk(10)=col5
+	movdqa	XMMWORD [wk(11)], xmm4	; wk(11)=col7
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]			; eax = frame base saved by the prologue
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col
+
+	; -- Even part
+
+	; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+	; (Original)
+	; z1 = (z2 + z3) * 0.541196100;
+	; tmp2 = z1 + z3 * -1.847759065;
+	; tmp3 = z1 + z2 * 0.765366865;
+	;
+	; (This implementation)
+	; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+	; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+	movdqa    xmm6,xmm1		; xmm1=in2=z2
+	movdqa    xmm5,xmm1
+	punpcklwd xmm6,xmm2		; xmm2=in6=z3
+	punpckhwd xmm5,xmm2
+	movdqa    xmm1,xmm6
+	movdqa    xmm2,xmm5
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]	; xmm6=tmp3L
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]	; xmm5=tmp3H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]	; xmm1=tmp2L
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]	; xmm2=tmp2H
+
+	movdqa    xmm3,xmm7
+	paddw     xmm7,xmm0		; xmm7=in0+in4
+	psubw     xmm3,xmm0		; xmm3=in0-in4
+
+	pxor      xmm4,xmm4
+	pxor      xmm0,xmm0
+	punpcklwd xmm4,xmm7		; xmm4=tmp0L
+	punpckhwd xmm0,xmm7		; xmm0=tmp0H
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+	psrad     xmm0,(16-CONST_BITS)	; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm6		; xmm4=tmp10L
+	psubd	xmm7,xmm6		; xmm7=tmp13L
+	movdqa	xmm6,xmm0
+	paddd	xmm0,xmm5		; xmm0=tmp10H
+	psubd	xmm6,xmm5		; xmm6=tmp13H
+
+	movdqa	XMMWORD [wk(0)], xmm4	; wk(0)=tmp10L
+	movdqa	XMMWORD [wk(1)], xmm0	; wk(1)=tmp10H
+	movdqa	XMMWORD [wk(2)], xmm7	; wk(2)=tmp13L
+	movdqa	XMMWORD [wk(3)], xmm6	; wk(3)=tmp13H
+
+	pxor      xmm5,xmm5
+	pxor      xmm4,xmm4
+	punpcklwd xmm5,xmm3		; xmm5=tmp1L
+	punpckhwd xmm4,xmm3		; xmm4=tmp1H
+	psrad     xmm5,(16-CONST_BITS)	; psrad xmm5,16 & pslld xmm5,CONST_BITS
+	psrad     xmm4,(16-CONST_BITS)	; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+	movdqa	xmm0,xmm5
+	paddd	xmm5,xmm1		; xmm5=tmp11L
+	psubd	xmm0,xmm1		; xmm0=tmp12L
+	movdqa	xmm7,xmm4
+	paddd	xmm4,xmm2		; xmm4=tmp11H
+	psubd	xmm7,xmm2		; xmm7=tmp12H
+
+	movdqa	XMMWORD [wk(4)], xmm5	; wk(4)=tmp11L
+	movdqa	XMMWORD [wk(5)], xmm4	; wk(5)=tmp11H
+	movdqa	XMMWORD [wk(6)], xmm0	; wk(6)=tmp12L
+	movdqa	XMMWORD [wk(7)], xmm7	; wk(7)=tmp12H
+
+	; -- Odd part
+
+	movdqa	xmm6, XMMWORD [wk(9)]	; xmm6=col3
+	movdqa	xmm3, XMMWORD [wk(8)]	; xmm3=col1
+	movdqa	xmm1, XMMWORD [wk(11)]	; xmm1=col7
+	movdqa	xmm2, XMMWORD [wk(10)]	; xmm2=col5
+
+	movdqa	xmm5,xmm6
+	movdqa	xmm4,xmm3
+	paddw	xmm5,xmm1		; xmm5=z3
+	paddw	xmm4,xmm2		; xmm4=z4
+
+	; (Original)
+	; z5 = (z3 + z4) * 1.175875602;
+	; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+	; z3 += z5;  z4 += z5;
+	;
+	; (This implementation)
+	; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+	; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+	movdqa    xmm0,xmm5
+	movdqa    xmm7,xmm5
+	punpcklwd xmm0,xmm4
+	punpckhwd xmm7,xmm4
+	movdqa    xmm5,xmm0
+	movdqa    xmm4,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]	; xmm0=z3L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]	; xmm7=z3H
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]	; xmm5=z4L
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]	; xmm4=z4H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=z3L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=z3H
+
+	; (Original)
+	; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+	; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+	; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+	; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+	; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+	; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+	;
+	; (This implementation)
+	; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+	; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+	; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+	; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+	; tmp0 += z3;  tmp1 += z4;
+	; tmp2 += z3;  tmp3 += z4;
+
+	movdqa    xmm0,xmm1
+	movdqa    xmm7,xmm1
+	punpcklwd xmm0,xmm3
+	punpckhwd xmm7,xmm3
+	movdqa    xmm1,xmm0
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm0=tmp0L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]	; xmm7=tmp0H
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]	; xmm1=tmp3L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]	; xmm3=tmp3H
+
+	paddd	xmm0, XMMWORD [wk(10)]	; xmm0=tmp0L
+	paddd	xmm7, XMMWORD [wk(11)]	; xmm7=tmp0H
+	paddd	xmm1,xmm5		; xmm1=tmp3L
+	paddd	xmm3,xmm4		; xmm3=tmp3H
+
+	movdqa	XMMWORD [wk(8)], xmm0	; wk(8)=tmp0L
+	movdqa	XMMWORD [wk(9)], xmm7	; wk(9)=tmp0H
+
+	movdqa    xmm0,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm0,xmm6
+	punpckhwd xmm7,xmm6
+	movdqa    xmm2,xmm0
+	movdqa    xmm6,xmm7
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm0=tmp1L
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]	; xmm7=tmp1H
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]	; xmm2=tmp2L
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]	; xmm6=tmp2H
+
+	paddd	xmm0,xmm5		; xmm0=tmp1L
+	paddd	xmm7,xmm4		; xmm7=tmp1H
+	paddd	xmm2, XMMWORD [wk(10)]	; xmm2=tmp2L
+	paddd	xmm6, XMMWORD [wk(11)]	; xmm6=tmp2H
+
+	movdqa	XMMWORD [wk(10)], xmm0	; wk(10)=tmp1L
+	movdqa	XMMWORD [wk(11)], xmm7	; wk(11)=tmp1H
+
+	; -- Final output stage
+
+	movdqa	xmm5, XMMWORD [wk(0)]	; xmm5=tmp10L
+	movdqa	xmm4, XMMWORD [wk(1)]	; xmm4=tmp10H
+
+	movdqa	xmm0,xmm5
+	movdqa	xmm7,xmm4
+	paddd	xmm5,xmm1		; xmm5=data0L
+	paddd	xmm4,xmm3		; xmm4=data0H
+	psubd	xmm0,xmm1		; xmm0=data7L
+	psubd	xmm7,xmm3		; xmm7=data7H
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm1=[PD_DESCALE_P2]
+
+	paddd	xmm5,xmm1
+	paddd	xmm4,xmm1
+	psrad	xmm5,DESCALE_P2
+	psrad	xmm4,DESCALE_P2
+	paddd	xmm0,xmm1
+	paddd	xmm7,xmm1
+	psrad	xmm0,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm5,xmm4		; xmm5=data0=(00 10 20 30 40 50 60 70)
+	packssdw  xmm0,xmm7		; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+	movdqa	xmm3, XMMWORD [wk(4)]	; xmm3=tmp11L
+	movdqa	xmm1, XMMWORD [wk(5)]	; xmm1=tmp11H
+
+	movdqa	xmm4,xmm3
+	movdqa	xmm7,xmm1
+	paddd	xmm3,xmm2		; xmm3=data1L
+	paddd	xmm1,xmm6		; xmm1=data1H
+	psubd	xmm4,xmm2		; xmm4=data6L
+	psubd	xmm7,xmm6		; xmm7=data6H
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm2=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm2
+	paddd	xmm1,xmm2
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm4,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm7,DESCALE_P2
+
+	packssdw  xmm3,xmm1		; xmm3=data1=(01 11 21 31 41 51 61 71)
+	packssdw  xmm4,xmm7		; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+	packsswb  xmm5,xmm4		; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	packsswb  xmm3,xmm0		; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm6, XMMWORD [wk(6)]	; xmm6=tmp12L
+	movdqa	xmm2, XMMWORD [wk(7)]	; xmm2=tmp12H
+	movdqa	xmm1, XMMWORD [wk(10)]	; xmm1=tmp1L
+	movdqa	xmm7, XMMWORD [wk(11)]	; xmm7=tmp1H
+
+	movdqa	XMMWORD [wk(0)], xmm5	; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	movdqa	xmm4,xmm6
+	movdqa	xmm0,xmm2
+	paddd	xmm6,xmm1		; xmm6=data2L
+	paddd	xmm2,xmm7		; xmm2=data2H
+	psubd	xmm4,xmm1		; xmm4=data5L
+	psubd	xmm0,xmm7		; xmm0=data5H
+
+	movdqa	xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm5=[PD_DESCALE_P2]
+
+	paddd	xmm6,xmm5
+	paddd	xmm2,xmm5
+	psrad	xmm6,DESCALE_P2
+	psrad	xmm2,DESCALE_P2
+	paddd	xmm4,xmm5
+	paddd	xmm0,xmm5
+	psrad	xmm4,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	packssdw  xmm6,xmm2		; xmm6=data2=(02 12 22 32 42 52 62 72)
+	packssdw  xmm4,xmm0		; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+	movdqa	xmm3, XMMWORD [wk(2)]	; xmm3=tmp13L
+	movdqa	xmm1, XMMWORD [wk(3)]	; xmm1=tmp13H
+	movdqa	xmm7, XMMWORD [wk(8)]	; xmm7=tmp0L
+	movdqa	xmm5, XMMWORD [wk(9)]	; xmm5=tmp0H
+
+	movdqa	xmm2,xmm3
+	movdqa	xmm0,xmm1
+	paddd	xmm3,xmm7		; xmm3=data3L
+	paddd	xmm1,xmm5		; xmm1=data3H
+	psubd	xmm2,xmm7		; xmm2=data4L
+	psubd	xmm0,xmm5		; xmm0=data4H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]	; xmm7=[PD_DESCALE_P2]
+
+	paddd	xmm3,xmm7
+	paddd	xmm1,xmm7
+	psrad	xmm3,DESCALE_P2
+	psrad	xmm1,DESCALE_P2
+	paddd	xmm2,xmm7
+	paddd	xmm0,xmm7
+	psrad	xmm2,DESCALE_P2
+	psrad	xmm0,DESCALE_P2
+
+	movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm5=[PB_CENTERJSAMP]
+
+	packssdw  xmm3,xmm1		; xmm3=data3=(03 13 23 33 43 53 63 73)
+	packssdw  xmm2,xmm0		; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+	movdqa    xmm7, XMMWORD [wk(0)]	; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+	movdqa    xmm1, XMMWORD [wk(1)]	; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+	packsswb  xmm6,xmm2		; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+	packsswb  xmm3,xmm4		; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+	paddb     xmm7,xmm5		; add CENTERJSAMPLE to every saturated signed byte
+	paddb     xmm1,xmm5
+	paddb     xmm6,xmm5
+	paddb     xmm3,xmm5
+
+	movdqa    xmm0,xmm7	; transpose coefficients(phase 1)
+	punpcklbw xmm7,xmm1	; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+	punpckhbw xmm0,xmm1	; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+	movdqa    xmm2,xmm6	; transpose coefficients(phase 1)
+	punpcklbw xmm6,xmm3	; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+	punpckhbw xmm2,xmm3	; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+	movdqa    xmm4,xmm7	; transpose coefficients(phase 2)
+	punpcklwd xmm7,xmm6	; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+	punpckhwd xmm4,xmm6	; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+	movdqa    xmm5,xmm2	; transpose coefficients(phase 2)
+	punpcklwd xmm2,xmm0	; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+	punpckhwd xmm5,xmm0	; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+	movdqa    xmm1,xmm7	; transpose coefficients(phase 3)
+	punpckldq xmm7,xmm2	; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+	punpckhdq xmm1,xmm2	; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+	movdqa    xmm3,xmm4	; transpose coefficients(phase 3)
+	punpckldq xmm4,xmm5	; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+	punpckhdq xmm3,xmm5	; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+	pshufd	xmm6,xmm7,0x4E	; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+	pshufd	xmm0,xmm1,0x4E	; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+	pshufd	xmm2,xmm4,0x4E	; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+	pshufd	xmm5,xmm3,0x4E	; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7	; row 0: samples 00..07
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1	; row 2: samples 20..27
+	mov	edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4	; row 4: samples 40..47
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3	; row 6: samples 60..67
+
+	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6	; row 1: samples 10..17
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0	; row 3: samples 30..37
+	mov	edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+	movq	_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2	; row 5: samples 50..57
+	movq	_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5	; row 7: samples 70..77
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; DCT_ISLOW_SUPPORTED
diff --git a/jiss2red.asm b/jiss2red.asm
new file mode 100644
index 0000000..53af6fe
--- /dev/null
+++ b/jiss2red.asm
@@ -0,0 +1,607 @@
+;
+; jiss2red.asm - reduced-size IDCT (SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file must be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef IDCT_SCALING_SUPPORTED
+%ifdef JIDCT_INT_SSE2_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS	13	; fraction bits of the F_x_xxx fixed-point multipliers
+%define PASS1_BITS	2	; extra left-shift scaling carried by the pass-1 results
+
+%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)	; 4x4: psrad count that ends pass 1
+%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)	; 4x4: psrad count that ends pass 2
+%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)	; 2x2: psrad count that ends pass 1
+%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)	; 2x2: psrad count that ends pass 2
+
+%if CONST_BITS == 13
+F_0_211	equ	 1730		; FIX(0.211164243)
+F_0_509	equ	 4176		; FIX(0.509795579)
+F_0_601	equ	 4926		; FIX(0.601344887)
+F_0_720	equ	 5906		; FIX(0.720959822)
+F_0_765	equ	 6270		; FIX(0.765366865)
+F_0_850	equ	 6967		; FIX(0.850430095)
+F_0_899	equ	 7373		; FIX(0.899976223)
+F_1_061	equ	 8697		; FIX(1.061594337)
+F_1_272	equ	10426		; FIX(1.272758580)
+F_1_451	equ	11893		; FIX(1.451774981)
+F_1_847	equ	15137		; FIX(1.847759065)
+F_2_172	equ	17799		; FIX(2.172734803)
+F_2_562	equ	20995		; FIX(2.562915447)
+F_3_624	equ	29692		; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so FIX(x) is derived from x pre-scaled by 2^30.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))	; x / 2^n, rounded by adding half first
+F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
+F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
+F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
+F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
+F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
+F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
+F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
+F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
+F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
+F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
+F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
+F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
+F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
+F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16		; the tables below are used as 128-bit SSE2 memory operands
+	global	EXTN(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076	times 4 dw  F_1_847,-F_0_765	; pmaddwd: 4x4 even part,  in2*1.847759065 - in6*0.765366865
+PW_F256_F089	times 4 dw  F_2_562, F_0_899	; pmaddwd: 4x4 odd (tmp2), in1*2.562915447 + in3*0.899976223
+PW_F106_MF217	times 4 dw  F_1_061,-F_2_172	; pmaddwd: 4x4 odd (tmp0), in1*1.061594337 - in3*2.172734803
+PW_MF060_MF050	times 4 dw -F_0_601,-F_0_509	; pmaddwd: 4x4 odd (tmp2), -in5*0.601344887 - in7*0.509795579
+PW_F145_MF021	times 4 dw  F_1_451,-F_0_211	; pmaddwd: 4x4 odd (tmp0), in5*1.451774981 - in7*0.211164243
+PW_F362_MF127	times 4 dw  F_3_624,-F_1_272	; pmaddwd: 2x2 odd part,   in1*3.624509785 - in3*1.272758580
+PW_F085_MF072	times 4 dw  F_0_850,-F_0_720	; pmaddwd: 2x2 odd part,   in5*0.850430095 - in7*0.720959822
+PD_DESCALE_P1_4	times 4 dd  1 << (DESCALE_P1_4-1)	; rounding bias added before psrad DESCALE_P1_4
+PD_DESCALE_P2_4	times 4 dd  1 << (DESCALE_P2_4-1)	; rounding bias added before psrad DESCALE_P2_4
+PD_DESCALE_P1_2	times 4 dd  1 << (DESCALE_P1_2-1)	; rounding bias added before psrad DESCALE_P1_2
+PD_DESCALE_P2_2	times 4 dd  1 << (DESCALE_P2_2-1)	; rounding bias added before psrad DESCALE_P2_2
+PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added (paddb) to the packed signed bytes before they are stored
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+; Arguments are read from the stack; coefficient rows 0-3 and 5-7 are loaded, row 4 never is.
+; GLOBAL(void)
+; jpeg_idct_4x4_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                     JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores one dword to each of output_buf[0..3] at output_col. Clobbers eax, edx, xmm0-xmm7.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0		; slot where the prologue saves the unaligned frame base
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2		; wk(0)/wk(1) park tmp0L/tmp0H during pass 1
+
+	align	16
+	global	EXTN(jpeg_idct_4x4_sse2)
+
+EXTN(jpeg_idct_4x4_sse2):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4			; room for the saved frame base
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax			; [original_ebp] = unaligned frame base
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [wk(0)]			; reserve wk[WK_NUM] below the aligned ebp
+	pushpic	ebx
+;	push	ecx		; unused
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]	; eax is still the frame base from the prologue (assumes get_GOT preserves eax - TODO confirm in jsimdext.inc)
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; cheap scalar pre-test on the
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; first dword of rows 1 and 2
+	jnz	short .columnDCT
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; OR together rows 1,2,3,5,6,7
+	movdqa	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	xmm0,xmm1		; xmm0 != 0 iff any of those coefficients != 0
+	packsswb xmm0,xmm0		; two saturating packs fold the 8 words into
+	packsswb xmm0,xmm0		; the low 4 bytes (a nonzero word stays nonzero)
+	movd	eax,xmm0
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	psllw	xmm0,PASS1_BITS		; DC-only column result: dequantized in0 << PASS1_BITS
+
+	movdqa    xmm3,xmm0	; xmm0=in0=(00 01 02 03 04 05 06 07)
+	punpcklwd xmm0,xmm0	; xmm0=(00 00 01 01 02 02 03 03)
+	punpckhwd xmm3,xmm3	; xmm3=(04 04 05 05 06 06 07 07)
+
+	pshufd	xmm1,xmm0,0x50	; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+	pshufd	xmm0,xmm0,0xFA	; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+	pshufd	xmm6,xmm3,0x50	; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+	pshufd	xmm3,xmm3,0xFA	; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+	jmp	near .column_end	; registers now match the layout the full path leaves at .column_end
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm0=in1 (dequantized)
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm1=in3
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm2=in5
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm3=in7
+
+	movdqa    xmm4,xmm0
+	movdqa    xmm5,xmm0
+	punpcklwd xmm4,xmm1	; xmm4=(10 30 11 31 12 32 13 33)
+	punpckhwd xmm5,xmm1	; xmm5=(14 34 15 35 16 36 17 37)
+	movdqa    xmm0,xmm4	; second copy: each (in1,in3) pair is multiplied by two tables
+	movdqa    xmm1,xmm5
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]	; xmm4=(tmp2L)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]	; xmm5=(tmp2H)
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]	; xmm0=(tmp0L)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]	; xmm1=(tmp0H)
+
+	movdqa    xmm6,xmm2
+	movdqa    xmm7,xmm2
+	punpcklwd xmm6,xmm3	; xmm6=(50 70 51 71 52 72 53 73)
+	punpckhwd xmm7,xmm3	; xmm7=(54 74 55 75 56 76 57 77)
+	movdqa    xmm2,xmm6	; second copy: each (in5,in7) pair is multiplied by two tables
+	movdqa    xmm3,xmm7
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2L)
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm7=(tmp2H)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0L)
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]	; xmm3=(tmp0H)
+
+	paddd	xmm6,xmm4		; xmm6=tmp2L
+	paddd	xmm7,xmm5		; xmm7=tmp2H
+	paddd	xmm2,xmm0		; xmm2=tmp0L
+	paddd	xmm3,xmm1		; xmm3=tmp0H
+
+	movdqa	XMMWORD [wk(0)], xmm2	; wk(0)=tmp0L
+	movdqa	XMMWORD [wk(1)], xmm3	; wk(1)=tmp0H
+
+	; -- Even part
+
+	movdqa	xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm4=in0
+	pmullw	xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm5=in2
+	pmullw	xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; xmm0=in6
+
+	pxor      xmm1,xmm1
+	pxor      xmm2,xmm2
+	punpcklwd xmm1,xmm4		; xmm1=tmp0L
+	punpckhwd xmm2,xmm4		; xmm2=tmp0H
+	psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+	psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+	movdqa    xmm3,xmm5		; xmm5=in2=z2
+	punpcklwd xmm5,xmm0		; xmm0=in6=z3
+	punpckhwd xmm3,xmm0		; xmm5/xmm3=(in2,in6) word pairs, low/high half
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]	; xmm5=tmp2L
+	pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]	; xmm3=tmp2H
+
+	movdqa	xmm4,xmm1
+	movdqa	xmm0,xmm2
+	paddd	xmm1,xmm5		; xmm1=tmp10L
+	paddd	xmm2,xmm3		; xmm2=tmp10H
+	psubd	xmm4,xmm5		; xmm4=tmp12L
+	psubd	xmm0,xmm3		; xmm0=tmp12H
+
+	; -- Final output stage
+
+	movdqa	xmm5,xmm1
+	movdqa	xmm3,xmm2
+	paddd	xmm1,xmm6		; xmm1=data0L
+	paddd	xmm2,xmm7		; xmm2=data0H
+	psubd	xmm5,xmm6		; xmm5=data3L
+	psubd	xmm3,xmm7		; xmm3=data3H
+
+	movdqa	xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm6=[PD_DESCALE_P1_4]
+
+	paddd	xmm1,xmm6		; add the rounding bias, then shift: rounded descale
+	paddd	xmm2,xmm6
+	psrad	xmm1,DESCALE_P1_4
+	psrad	xmm2,DESCALE_P1_4
+	paddd	xmm5,xmm6
+	paddd	xmm3,xmm6
+	psrad	xmm5,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm1,xmm2		; xmm1=data0=(00 01 02 03 04 05 06 07)
+	packssdw  xmm5,xmm3		; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+	movdqa	xmm7, XMMWORD [wk(0)]	; xmm7=tmp0L
+	movdqa	xmm6, XMMWORD [wk(1)]	; xmm6=tmp0H
+
+	movdqa	xmm2,xmm4
+	movdqa	xmm3,xmm0
+	paddd	xmm4,xmm7		; xmm4=data1L
+	paddd	xmm0,xmm6		; xmm0=data1H
+	psubd	xmm2,xmm7		; xmm2=data2L
+	psubd	xmm3,xmm6		; xmm3=data2H
+
+	movdqa	xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; xmm7=[PD_DESCALE_P1_4]
+
+	paddd	xmm4,xmm7		; same rounded descale for data1/data2
+	paddd	xmm0,xmm7
+	psrad	xmm4,DESCALE_P1_4
+	psrad	xmm0,DESCALE_P1_4
+	paddd	xmm2,xmm7
+	paddd	xmm3,xmm7
+	psrad	xmm2,DESCALE_P1_4
+	psrad	xmm3,DESCALE_P1_4
+
+	packssdw  xmm4,xmm0		; xmm4=data1=(10 11 12 13 14 15 16 17)
+	packssdw  xmm2,xmm3		; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+	movdqa    xmm6,xmm1	; transpose coefficients(phase 1)
+	punpcklwd xmm1,xmm4	; xmm1=(00 10 01 11 02 12 03 13)
+	punpckhwd xmm6,xmm4	; xmm6=(04 14 05 15 06 16 07 17)
+	movdqa    xmm7,xmm2	; transpose coefficients(phase 1)
+	punpcklwd xmm2,xmm5	; xmm2=(20 30 21 31 22 32 23 33)
+	punpckhwd xmm7,xmm5	; xmm7=(24 34 25 35 26 36 27 37)
+
+	movdqa    xmm0,xmm1	; transpose coefficients(phase 2)
+	punpckldq xmm1,xmm2	; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+	punpckhdq xmm0,xmm2	; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+	movdqa    xmm3,xmm6	; transpose coefficients(phase 2)
+	punpckldq xmm6,xmm7	; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+	punpckhdq xmm3,xmm7	; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	eax, [original_ebp]	; eax was overwritten by the zero test; reload the frame base
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+
+	; -- Even part
+
+	pxor      xmm4,xmm4
+	punpcklwd xmm4,xmm1		; xmm4=tmp0
+	psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+	; -- Odd part
+
+	punpckhwd xmm1,xmm0		; xmm1=(01 03 11 13 21 23 31 33)
+	punpckhwd xmm6,xmm3		; xmm6=(05 07 15 17 25 27 35 37); col4 is dropped here
+	movdqa    xmm5,xmm1
+	movdqa    xmm2,xmm6
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]	; xmm1=(tmp2)
+	pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]	; xmm6=(tmp2)
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]	; xmm5=(tmp0)
+	pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]	; xmm2=(tmp0)
+
+	paddd     xmm6,xmm1		; xmm6=tmp2
+	paddd     xmm2,xmm5		; xmm2=tmp0
+
+	; -- Even part
+
+	punpcklwd xmm0,xmm3		; xmm0=(02 06 12 16 22 26 32 36)
+	pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]	; xmm0=tmp2
+
+	movdqa    xmm7,xmm4
+	paddd     xmm4,xmm0		; xmm4=tmp10
+	psubd     xmm7,xmm0		; xmm7=tmp12
+
+	; -- Final output stage
+
+	movdqa	xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; xmm1=[PD_DESCALE_P2_4]
+
+	movdqa	xmm5,xmm4
+	movdqa	xmm3,xmm7
+	paddd	xmm4,xmm6		; xmm4=data0=(00 10 20 30)
+	paddd	xmm7,xmm2		; xmm7=data1=(01 11 21 31)
+	psubd	xmm5,xmm6		; xmm5=data3=(03 13 23 33)
+	psubd	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+
+	paddd	xmm4,xmm1		; add the rounding bias, then shift: rounded descale
+	paddd	xmm7,xmm1
+	psrad	xmm4,DESCALE_P2_4
+	psrad	xmm7,DESCALE_P2_4
+	paddd	xmm5,xmm1
+	paddd	xmm3,xmm1
+	psrad	xmm5,DESCALE_P2_4
+	psrad	xmm3,DESCALE_P2_4
+
+	packssdw  xmm4,xmm3		; xmm4=(00 10 20 30 02 12 22 32)
+	packssdw  xmm7,xmm5		; xmm7=(01 11 21 31 03 13 23 33)
+
+	movdqa    xmm0,xmm4		; transpose coefficients(phase 1)
+	punpcklwd xmm4,xmm7		; xmm4=(00 01 10 11 20 21 30 31)
+	punpckhwd xmm0,xmm7		; xmm0=(02 03 12 13 22 23 32 33)
+
+	movdqa    xmm6,xmm4		; transpose coefficients(phase 2)
+	punpckldq xmm4,xmm0		; xmm4=(00 01 02 03 10 11 12 13)
+	punpckhdq xmm6,xmm0		; xmm6=(20 21 22 23 30 31 32 33)
+
+	packsswb  xmm4,xmm6		; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+	paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]	; bias the saturated signed bytes by CENTERJSAMPLE
+
+	pshufd    xmm2,xmm4,0x39	; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+	pshufd    xmm1,xmm4,0x4E	; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+	pshufd    xmm3,xmm4,0x93	; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movd	_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4	; output row 0 <- low dword (00 01 02 03)
+	movd	_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2	; output row 1 <- (10 11 12 13)
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movd	_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1	; output row 2 <- (20 21 22 23)
+	movd	_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3	; output row 3 <- (30 31 32 33)
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; unused
+	poppic	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+; Only coefficient rows 0,1,3,5,7 are loaded; within them columns 0,1,3,5,7 reach the output.
+; GLOBAL(void)
+; jpeg_idct_2x2_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                     JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+; Stores one word to output_buf[0] and output_buf[1] at output_col. Clobbers eax, ecx, edx, xmm0-xmm7.
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+	align	16
+	global	EXTN(jpeg_idct_2x2_sse2)
+
+EXTN(jpeg_idct_2x2_sse2):
+	push	ebp
+	mov	ebp,esp		; plain frame: this routine uses no aligned stack workspace
+	push	ebx		; saved: used as GOT base, then as scratch by pextrw below
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input.
+
+	mov	edx, POINTER [compptr(ebp)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
+
+	; | input:                  | result:        |
+	; | 00 01 ** 03 ** 05 ** 07 |                |
+	; | 10 11 ** 13 ** 15 ** 17 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+	; | 50 51 ** 53 ** 55 ** 57 |                |
+	; | ** ** ** ** ** ** ** ** |                |
+	; | 70 71 ** 73 ** 75 ** 77 |                |
+
+	; -- Odd part
+
+	movdqa	xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	movdqa	xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movdqa	xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+	pmullw	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+	; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+	pcmpeqd   xmm7,xmm7
+	pslld     xmm7,WORD_BIT		; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+	movdqa    xmm4,xmm0		; xmm4=(10 11 ** 13 ** 15 ** 17)
+	movdqa    xmm5,xmm2		; xmm5=(50 51 ** 53 ** 55 ** 57)
+	punpcklwd xmm4,xmm1		; xmm4=(10 30 11 31 ** ** 13 33)
+	punpcklwd xmm5,xmm3		; xmm5=(50 70 51 71 ** ** 53 73)
+	pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]	; in1*3.624509785 - in3*1.272758580
+	pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]	; in5*0.850430095 - in7*0.720959822
+
+	psrld	xmm0,WORD_BIT		; xmm0=(11 -- 13 -- 15 -- 17 --)
+	pand	xmm1,xmm7		; xmm1=(-- 31 -- 33 -- 35 -- 37)
+	psrld	xmm2,WORD_BIT		; xmm2=(51 -- 53 -- 55 -- 57 --)
+	pand	xmm3,xmm7		; xmm3=(-- 71 -- 73 -- 75 -- 77)
+	por	xmm0,xmm1		; xmm0=(11 31 13 33 15 35 17 37)
+	por	xmm2,xmm3		; xmm2=(51 71 53 73 55 75 57 77)
+	pmaddwd	xmm0,[GOTOFF(ebx,PW_F362_MF127)]
+	pmaddwd	xmm2,[GOTOFF(ebx,PW_F085_MF072)]
+
+	paddd	xmm4,xmm5		; xmm4=tmp0[col0 col1 **** col3]
+	paddd	xmm0,xmm2		; xmm0=tmp0[col1 col3 col5 col7]
+
+	; -- Even part
+
+	movdqa	xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	pmullw	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+	; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+	movdqa	xmm1,xmm6		; xmm1=(00 01 ** 03 ** 05 ** 07)
+	pslld	xmm6,WORD_BIT		; xmm6=(-- 00 -- ** -- ** -- **)
+	pand	xmm1,xmm7		; xmm1=(-- 01 -- 03 -- 05 -- 07)
+	psrad	xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+	psrad	xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+	; -- Final output stage
+
+	movdqa	xmm3,xmm6
+	movdqa	xmm5,xmm1
+	paddd	xmm6,xmm4	; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+	paddd	xmm1,xmm0	; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+	psubd	xmm3,xmm4	; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+	psubd	xmm5,xmm0	; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+	movdqa	xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; xmm2=[PD_DESCALE_P1_2]
+
+	punpckldq  xmm6,xmm3		; xmm6=(A0 B0 ** **)
+
+	movdqa     xmm7,xmm1
+	punpcklqdq xmm1,xmm5		; xmm1=(A1 A3 B1 B3)
+	punpckhqdq xmm7,xmm5		; xmm7=(A5 A7 B5 B7)
+
+	paddd	xmm6,xmm2		; add the rounding bias, then shift: rounded descale
+	psrad	xmm6,DESCALE_P1_2
+
+	paddd	xmm1,xmm2
+	paddd	xmm7,xmm2
+	psrad	xmm1,DESCALE_P1_2
+	psrad	xmm7,DESCALE_P1_2
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows, store into output array.
+
+	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(ebp)]
+
+	; | input:| result:|
+	; | A0 B0 |        |
+	; | A1 B1 | C0 C1  |
+	; | A3 B3 | D0 D1  |
+	; | A5 B5 |        |
+	; | A7 B7 |        |
+
+	; -- Odd part
+
+	packssdw  xmm1,xmm1		; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+	packssdw  xmm7,xmm7		; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+	pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]	; same (1,3) weights as in pass 1
+	pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]	; same (5,7) weights as in pass 1
+
+	paddd     xmm1,xmm7		; xmm1=tmp0[row0 row1 row0 row1]
+
+	; -- Even part
+
+	pslld     xmm6,(CONST_BITS+2)	; xmm6=tmp10[row0 row1 **** ****]
+
+	; -- Final output stage
+
+	movdqa    xmm4,xmm6
+	paddd     xmm6,xmm1	; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+	psubd     xmm4,xmm1	; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+	punpckldq xmm6,xmm4	; xmm6=(C0 D0 C1 D1)
+
+	paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; rounding bias for the final shift
+	psrad     xmm6,DESCALE_P2_2
+
+	packssdw  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+	packsswb  xmm6,xmm6		; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+	paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; last GOT-relative access: ebx is free after this
+
+	pextrw	ebx,xmm6,0x00		; ebx=(C0 D0 -- --)
+	pextrw	ecx,xmm6,0x01		; ecx=(C1 D1 -- --)
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx	; output row 0 <- (C0 D0)
+	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx	; output row 1 <- (C1 D1)
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	pop	ebp
+	ret
+
+%endif ; JIDCT_INT_SSE2_SUPPORTED
+%endif ; IDCT_SCALING_SUPPORTED
diff --git a/jisseflt.asm b/jisseflt.asm
new file mode 100644
index 0000000..20eaeeb
--- /dev/null
+++ b/jisseflt.asm
@@ -0,0 +1,582 @@
+;
+; jisseflt.asm - floating-point IDCT (SSE & MMX)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file must be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see jidctflt.c for more details.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+%ifdef DCT_FLOAT_SUPPORTED
+%ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+
+; This module is specialized to the case DCTSIZE = 8.
+;
+%if DCTSIZE != 8
+%error "Sorry, this code only copes with 8x8 DCTs."
+%endif
+
+; --------------------------------------------------------------------------
+
+%macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+	shufps	%1,%2,0x44	; 0x44: lanes 0,1 taken from %1, then lanes 0,1 taken from %2
+%endmacro
+
+%macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+	shufps	%1,%2,0xEE	; 0xEE: lanes 2,3 taken from %1, then lanes 2,3 taken from %2
+%endmacro
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16		; the float tables below are used as 128-bit mulps memory operands
+	global	EXTN(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2), replicated in 4 single-precision lanes
+PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8)
+PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8))
+PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8))
+PD_0_125	times 4 dd  0.125	; 1/8
+PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; 8 bytes only; presumably a 64-bit (MMX) operand - its use is not visible here
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jpeg_idct_float_sse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+;                      JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
+%define compptr(b)	(b)+12		; jpeg_component_info * compptr
+%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
+%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
+%define output_col(b)	(b)+24		; JDIMENSION output_col
+
+%define original_ebp	ebp+0
+%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+%define WK_NUM		2
+%define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+					; FAST_FLOAT workspace[DCTSIZE2]
+
+	align	16
+	global	EXTN(jpeg_idct_float_sse)
+
+EXTN(jpeg_idct_float_sse):
+	push	ebp
+	mov	eax,esp				; eax = original ebp
+	sub	esp, byte 4
+	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+	mov	[esp],eax
+	mov	ebp,esp				; ebp = aligned ebp
+	lea	esp, [workspace]
+	push	ebx
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+	push	esi
+	push	edi
+
+	get_GOT	ebx		; get GOT address
+
+	; ---- Pass 1: process columns from input, store into work array.
+
+;	mov	eax, [original_ebp]
+	mov	edx, POINTER [compptr(eax)]
+	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
+	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	jnz	near .columnDCT
+
+	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+	por	mm1,mm0
+	packsswb mm1,mm1
+	movd	eax,mm1
+	test	eax,eax
+	jnz	short .columnDCT
+
+	; -- AC terms all zero
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
+
+	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm1,xmm0
+	movaps	xmm2,xmm0
+	movaps	xmm3,xmm0
+
+	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+	jmp	near .nextcolumn
+	alignx	16,7
+%endif
+.columnDCT:
+
+	; -- Even part
+
+	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
+	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
+	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
+
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
+	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
+	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
+
+	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
+	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
+	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
+	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
+	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
+	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
+	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
+	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
+
+	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
+	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
+	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
+	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
+	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
+	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
+	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
+	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
+
+	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
+	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
+	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
+	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
+	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
+	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
+	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
+	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
+
+	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
+	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
+	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
+	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
+
+	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
+	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
+
+	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
+	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
+	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
+	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
+	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
+	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
+	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
+	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
+
+	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
+	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
+	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+
+	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm0,xmm7
+	movaps	xmm3,xmm5
+	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+
+	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+
+	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+
+	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+
+	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+	dec	ecx					; ctr
+	jnz	near .columnloop
+
+	; -- Prefetch the next coefficient block
+
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+	; ---- Pass 2: process rows from work array, store into output array.
+
+	mov	eax, [original_ebp]
+	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+	mov	eax, JDIMENSION [output_col(eax)]
+	mov	ecx, DCTSIZE/4				; ctr
+	alignx	16,7
+.rowloop:
+
+	; -- Even part
+
+	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm0
+	movaps	xmm5,xmm1
+	subps	xmm0,xmm2		; xmm0=tmp11
+	subps	xmm1,xmm3
+	addps	xmm4,xmm2		; xmm4=tmp10
+	addps	xmm5,xmm3		; xmm5=tmp13
+
+	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+	subps	xmm1,xmm5		; xmm1=tmp12
+
+	movaps	xmm6,xmm4
+	movaps	xmm7,xmm0
+	subps	xmm4,xmm5		; xmm4=tmp3
+	subps	xmm0,xmm1		; xmm0=tmp2
+	addps	xmm6,xmm5		; xmm6=tmp0
+	addps	xmm7,xmm1		; xmm7=tmp1
+
+	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+
+	; -- Odd part
+
+	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+	movaps	xmm4,xmm2
+	movaps	xmm0,xmm5
+	addps	xmm2,xmm1		; xmm2=z11
+	addps	xmm5,xmm3		; xmm5=z13
+	subps	xmm4,xmm1		; xmm4=z12
+	subps	xmm0,xmm3		; xmm0=z10
+
+	movaps	xmm1,xmm2
+	subps	xmm2,xmm5
+	addps	xmm1,xmm5		; xmm1=tmp7
+
+	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+
+	movaps	xmm3,xmm0
+	addps	xmm0,xmm4
+	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+	addps	xmm3,xmm0		; xmm3=tmp12
+	subps	xmm4,xmm0		; xmm4=tmp10
+
+	; -- Final output stage
+
+	subps	xmm3,xmm1		; xmm3=tmp6
+	movaps	xmm5,xmm6
+	movaps	xmm0,xmm7
+	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+	subps	xmm2,xmm3		; xmm2=tmp5
+
+	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
+
+	mulps	xmm6,xmm1		; descale(1/8)
+	mulps	xmm7,xmm1		; descale(1/8)
+	mulps	xmm5,xmm1		; descale(1/8)
+	mulps	xmm0,xmm1		; descale(1/8)
+
+	movhlps   xmm3,xmm6
+	movhlps   xmm1,xmm7
+	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
+	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
+	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
+	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
+
+	movhlps   xmm6,xmm5
+	movhlps   xmm7,xmm0
+	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
+	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
+	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
+	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
+	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
+	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
+
+	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
+	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
+
+	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
+	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+
+	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
+
+	addps	xmm4,xmm2		; xmm4=tmp4
+	movaps	xmm5,xmm3
+	movaps	xmm0,xmm1
+	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
+	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
+	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
+
+	mulps	xmm3,xmm6		; descale(1/8)
+	mulps	xmm1,xmm6		; descale(1/8)
+	mulps	xmm5,xmm6		; descale(1/8)
+	mulps	xmm0,xmm6		; descale(1/8)
+
+	movhlps   xmm7,xmm3
+	movhlps   xmm2,xmm1
+	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
+	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
+	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
+	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
+	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
+	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
+
+	movhlps   xmm4,xmm5
+	movhlps   xmm6,xmm0
+	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
+	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
+	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
+	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
+	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
+	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
+
+	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+
+	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
+	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
+
+	paddb     mm0,mm6
+	paddb     mm1,mm6
+	paddb     mm2,mm6
+	paddb     mm4,mm6
+
+	movq      mm7,mm0		; transpose coefficients(phase 1)
+	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
+	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
+	movq      mm3,mm2		; transpose coefficients(phase 1)
+	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
+	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
+
+	movq      mm5,mm0		; transpose coefficients(phase 2)
+	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
+	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
+	movq      mm6,mm3		; transpose coefficients(phase 2)
+	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
+	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
+
+	movq      mm1,mm0		; transpose coefficients(phase 3)
+	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
+	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
+	movq      mm4,mm5		; transpose coefficients(phase 3)
+	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
+	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
+
+	pushpic	ebx			; save GOT address
+
+	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+	poppic	ebx			; restore GOT address
+
+	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+	add	edi, byte 4*SIZEOF_JSAMPROW
+	dec	ecx				; ctr
+	jnz	near .rowloop
+
+	emms		; empty MMX state
+
+	pop	edi
+	pop	esi
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	mov	esp,ebp		; esp <- aligned ebp
+	pop	esp		; esp <- original ebp
+	pop	ebp
+	ret
+
+%endif ; JIDCT_FLT_SSE_MMX_SUPPORTED
+%endif ; DCT_FLOAT_SUPPORTED
diff --git a/jmemmgr.c b/jmemmgr.c
index d801b32..e3149e5 100644
--- a/jmemmgr.c
+++ b/jmemmgr.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : January 27, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains the JPEG system-independent memory management
  * routines.  This code is usable across a wide variety of machines; most
  * of the system dependencies have been isolated in a separate file.
@@ -51,27 +58,12 @@
 
 
 /*
- * Many machines require storage alignment: longs must start on 4-byte
- * boundaries, doubles on 8-byte boundaries, etc.  On such machines, malloc()
- * always returns pointers that are multiples of the worst-case alignment
- * requirement, and we had better do so too.
- * There isn't any really portable way to determine the worst-case alignment
- * requirement.  This module assumes that the alignment requirement is
- * multiples of sizeof(ALIGN_TYPE).
- * By default, we define ALIGN_TYPE as double.  This is necessary on some
- * workstations (where doubles really do need 8-byte alignment) and will work
- * fine on nearly everything.  If your machine has lesser alignment needs,
- * you can save a few bytes by making ALIGN_TYPE smaller.
- * The only place I know of where this will NOT work is certain Macintosh
- * 680x0 compilers that define double as a 10-byte IEEE extended float.
- * Doing 10-byte alignment is counterproductive because longwords won't be
- * aligned well.  Put "#define ALIGN_TYPE long" in jconfig.h if you have
- * such a compiler.
+ * SIMD Ext: Most SSE/SSE2 instructions require that their memory operands
+ * be aligned to a 16-byte boundary; if not, a general-protection exception
+ * (#GP) is generated.
  */
 
-#ifndef ALIGN_TYPE		/* so can override from jconfig.h */
-#define ALIGN_TYPE  double
-#endif
+#define ALIGN_SIZE  16		/* sizeof SSE/SSE2 register */
 
 
 /*
@@ -81,31 +73,24 @@
  * header with a link to the next pool of the same class.
  * Small and large pool headers are identical except that the latter's
  * link pointer must be FAR on 80x86 machines.
- * Notice that the "real" header fields are union'ed with a dummy ALIGN_TYPE
- * field.  This forces the compiler to make SIZEOF(small_pool_hdr) a multiple
- * of the alignment requirement of ALIGN_TYPE.
  */
 
-typedef union small_pool_struct * small_pool_ptr;
+typedef struct small_pool_struct * small_pool_ptr;
 
-typedef union small_pool_struct {
-  struct {
-    small_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct small_pool_struct {
+  small_pool_ptr next;		/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
+  char dummy[ALIGN_SIZE-1];	/* slack: data starts at (hdr+1) rounded down to ALIGN_SIZE */
 } small_pool_hdr;
 
-typedef union large_pool_struct FAR * large_pool_ptr;
+typedef struct large_pool_struct FAR * large_pool_ptr;
 
-typedef union large_pool_struct {
-  struct {
-    large_pool_ptr next;	/* next in list of pools */
-    size_t bytes_used;		/* how many bytes already used within pool */
-    size_t bytes_left;		/* bytes still available in this pool */
-  } hdr;
-  ALIGN_TYPE dummy;		/* included in union to ensure alignment */
+typedef struct large_pool_struct {
+  large_pool_ptr next;		/* next in list of pools */
+  size_t bytes_used;		/* how many bytes already used within pool */
+  size_t bytes_left;		/* bytes still available in this pool */
+  char dummy[ALIGN_SIZE-1];	/* slack: data starts at (hdr+1) rounded down to ALIGN_SIZE */
 } large_pool_hdr;
 
 
@@ -197,16 +182,16 @@
 	  pool_id, mem->total_space_allocated);
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
-       lhdr_ptr = lhdr_ptr->hdr.next) {
+       lhdr_ptr = lhdr_ptr->next) {
     fprintf(stderr, "  Large chunk used %ld\n",
-	    (long) lhdr_ptr->hdr.bytes_used);
+	    (long) lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
-       shdr_ptr = shdr_ptr->hdr.next) {
+       shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-	    (long) shdr_ptr->hdr.bytes_used,
-	    (long) shdr_ptr->hdr.bytes_left);
+	    (long) shdr_ptr->bytes_used,
+	    (long) shdr_ptr->bytes_left);
   }
 }
 
@@ -266,10 +251,10 @@
   if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(small_pool_hdr)))
     out_of_memory(cinfo, 1);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
+  /* Round up the requested size to a multiple of ALIGN_SIZE */
+  odd_bytes = sizeofobject % ALIGN_SIZE;
   if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
+    sizeofobject += ALIGN_SIZE - odd_bytes;
 
   /* See if space is available in any existing pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
@@ -277,10 +262,10 @@
   prev_hdr_ptr = NULL;
   hdr_ptr = mem->small_list[pool_id];
   while (hdr_ptr != NULL) {
-    if (hdr_ptr->hdr.bytes_left >= sizeofobject)
+    if (hdr_ptr->bytes_left >= sizeofobject)
       break;			/* found pool with enough space */
     prev_hdr_ptr = hdr_ptr;
-    hdr_ptr = hdr_ptr->hdr.next;
+    hdr_ptr = hdr_ptr->next;
   }
 
   /* Time to make a new pool? */
@@ -305,20 +290,20 @@
     }
     mem->total_space_allocated += min_request + slop;
     /* Success, initialize the new pool header and add to end of list */
-    hdr_ptr->hdr.next = NULL;
-    hdr_ptr->hdr.bytes_used = 0;
-    hdr_ptr->hdr.bytes_left = sizeofobject + slop;
+    hdr_ptr->next = NULL;
+    hdr_ptr->bytes_used = 0;
+    hdr_ptr->bytes_left = sizeofobject + slop;
     if (prev_hdr_ptr == NULL)	/* first pool in class? */
       mem->small_list[pool_id] = hdr_ptr;
     else
-      prev_hdr_ptr->hdr.next = hdr_ptr;
+      prev_hdr_ptr->next = hdr_ptr;
   }
 
   /* OK, allocate the object from the current pool */
-  data_ptr = (char *) (hdr_ptr + 1); /* point to first data byte in pool */
-  data_ptr += hdr_ptr->hdr.bytes_used; /* point to place for object */
-  hdr_ptr->hdr.bytes_used += sizeofobject;
-  hdr_ptr->hdr.bytes_left -= sizeofobject;
+  data_ptr = (char *) ((size_t) (hdr_ptr + 1) & -ALIGN_SIZE);
+  data_ptr += hdr_ptr->bytes_used; /* point to place for object */
+  hdr_ptr->bytes_used += sizeofobject;
+  hdr_ptr->bytes_left -= sizeofobject;
 
   return (void *) data_ptr;
 }
@@ -350,10 +335,10 @@
   if (sizeofobject > (size_t) (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)))
     out_of_memory(cinfo, 3);	/* request exceeds malloc's ability */
 
-  /* Round up the requested size to a multiple of SIZEOF(ALIGN_TYPE) */
-  odd_bytes = sizeofobject % SIZEOF(ALIGN_TYPE);
+  /* Round up the requested size to a multiple of ALIGN_SIZE */
+  odd_bytes = sizeofobject % ALIGN_SIZE;
   if (odd_bytes > 0)
-    sizeofobject += SIZEOF(ALIGN_TYPE) - odd_bytes;
+    sizeofobject += ALIGN_SIZE - odd_bytes;
 
   /* Always make a new pool */
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
@@ -366,15 +351,15 @@
   mem->total_space_allocated += sizeofobject + SIZEOF(large_pool_hdr);
 
   /* Success, initialize the new pool header and add to list */
-  hdr_ptr->hdr.next = mem->large_list[pool_id];
+  hdr_ptr->next = mem->large_list[pool_id];
   /* We maintain space counts in each pool header for statistical purposes,
    * even though they are not needed for allocation.
    */
-  hdr_ptr->hdr.bytes_used = sizeofobject;
-  hdr_ptr->hdr.bytes_left = 0;
+  hdr_ptr->bytes_used = sizeofobject;
+  hdr_ptr->bytes_left = 0;
   mem->large_list[pool_id] = hdr_ptr;
 
-  return (void FAR *) (hdr_ptr + 1); /* point to first data byte in pool */
+  return (void FAR *) ((size_t) (hdr_ptr + 1) & -ALIGN_SIZE);
 }
 
 
@@ -401,6 +386,12 @@
   JSAMPROW workspace;
   JDIMENSION rowsperchunk, currow, i;
   long ltemp;
+  JDIMENSION odd_samples;
+
+  /* Round up the row bytes to a multiple of ALIGN_SIZE */
+  odd_samples = samplesperrow % (ALIGN_SIZE / SIZEOF(JSAMPLE));
+  if (odd_samples > 0)
+    samplesperrow += (ALIGN_SIZE / SIZEOF(JSAMPLE)) - odd_samples;
 
   /* Calculate max # of rows allowed in one allocation chunk */
   ltemp = (MAX_ALLOC_CHUNK-SIZEOF(large_pool_hdr)) /
@@ -968,9 +959,9 @@
   mem->large_list[pool_id] = NULL;
 
   while (lhdr_ptr != NULL) {
-    large_pool_ptr next_lhdr_ptr = lhdr_ptr->hdr.next;
-    space_freed = lhdr_ptr->hdr.bytes_used +
-		  lhdr_ptr->hdr.bytes_left +
+    large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
+    space_freed = lhdr_ptr->bytes_used +
+		  lhdr_ptr->bytes_left +
 		  SIZEOF(large_pool_hdr);
     jpeg_free_large(cinfo, (void FAR *) lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -982,9 +973,9 @@
   mem->small_list[pool_id] = NULL;
 
   while (shdr_ptr != NULL) {
-    small_pool_ptr next_shdr_ptr = shdr_ptr->hdr.next;
-    space_freed = shdr_ptr->hdr.bytes_used +
-		  shdr_ptr->hdr.bytes_left +
+    small_pool_ptr next_shdr_ptr = shdr_ptr->next;
+    space_freed = shdr_ptr->bytes_used +
+		  shdr_ptr->bytes_left +
 		  SIZEOF(small_pool_hdr);
     jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
@@ -1035,22 +1026,22 @@
   cinfo->mem = NULL;		/* for safety if init fails */
 
   /* Check for configuration errors.
-   * SIZEOF(ALIGN_TYPE) should be a power of 2; otherwise, it probably
+   * ALIGN_SIZE should be a power of 2; otherwise, it probably
    * doesn't reflect any real hardware alignment requirement.
    * The test is a little tricky: for X>0, X and X-1 have no one-bits
    * in common if and only if X is a power of 2, ie has only one one-bit.
    * Some compilers may give an "unreachable code" warning here; ignore it.
    */
-  if ((SIZEOF(ALIGN_TYPE) & (SIZEOF(ALIGN_TYPE)-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
     ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
   /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
-   * a multiple of SIZEOF(ALIGN_TYPE).
+   * a multiple of ALIGN_SIZE.
    * Again, an "unreachable code" warning may be ignored here.
    * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
    */
   test_mac = (size_t) MAX_ALLOC_CHUNK;
   if ((long) test_mac != MAX_ALLOC_CHUNK ||
-      (MAX_ALLOC_CHUNK % SIZEOF(ALIGN_TYPE)) != 0)
+      (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
     ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
 
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
diff --git a/jmorecfg.h b/jmorecfg.h
index 54a7d1c..b425519 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : March 28, 2005
+ * ---------------------------------------------------------------------
+ *
  * This file contains additional configuration options that customize the
  * JPEG software for special applications or support machine-dependent
  * optimizations.  Most users will not need to touch this file.
@@ -20,7 +27,9 @@
  * We do not support run-time selection of data precision, sorry.
  */
 
-#define BITS_IN_JSAMPLE  8	/* use 8 or 12 */
+/* SIMD Ext: This SIMD code only copes with 8-bit sample values. */
+
+#define BITS_IN_JSAMPLE  8	/* SIMD Ext: cannot be changed! */
 
 
 /*
@@ -157,7 +166,8 @@
 
 /* INT32 must hold at least signed 32-bit values. */
 
-#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+	/* X11/xmd.h and basetsd.h (Win32 SDK) correctly define INT32 */
+#if !defined(XMD_H) && !defined(_BASETSD_H_) && !defined(_BASETSD_H)
 typedef long INT32;
 #endif
 
@@ -180,14 +190,24 @@
  * or code profilers that require it.
  */
 
+#if defined(_MSC_VER) || defined(__BORLANDC__) || \
+    defined(__WATCOMC__) || defined(__MWERKS__) || \
+    defined(__ICC) || defined(__INTEL_COMPILER)
+#define JCDECL  __cdecl	/* keyword spelling of the cdecl convention */
+#elif defined(__GNUC__)
+#define JCDECL  __attribute__((__cdecl__))	/* GNU attribute spelling */
+#else
+#define JCDECL		/* any other compiler: expands to nothing */
+#endif
+
 /* a function called through method pointers: */
-#define METHODDEF(type)		static type
+#define METHODDEF(type)		static type JCDECL
 /* a function used only in its module: */
 #define LOCAL(type)		static type
 /* a function referenced thru EXTERNs: */
-#define GLOBAL(type)		type
+#define GLOBAL(type)		type JCDECL
 /* a reference to a GLOBAL function: */
-#define EXTERN(type)		extern type
+#define EXTERN(type)		extern type JCDECL
 
 
 /* This macro is used to declare a "method", that is, a function pointer.
@@ -197,9 +217,9 @@
  */
 
 #ifdef HAVE_PROTOTYPES
-#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#define JMETHOD(type,methodname,arglist)  type (JCDECL *methodname) arglist
 #else
-#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
+#define JMETHOD(type,methodname,arglist)  type (JCDECL *methodname) ()
 #endif
 
 
@@ -209,11 +229,13 @@
  * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
  */
 
+#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
+#endif /* !FAR */
 
 
 /*
@@ -224,8 +246,14 @@
  */
 
 #ifndef HAVE_BOOLEAN
-typedef int boolean;
+#ifdef TYPEDEF_UCHAR_BOOLEAN
+#ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
 #endif
+#else /* !TYPEDEF_UCHAR_BOOLEAN */
+typedef int boolean;
+#endif /* TYPEDEF_UCHAR_BOOLEAN */
+#endif /* !HAVE_BOOLEAN */
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
 #endif
@@ -290,6 +318,7 @@
 #define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
+#define UPSAMPLE_H1V2_SUPPORTED	    /* Fast/fancy processing for 1h2v? */
 #define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
 #define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
 
@@ -316,6 +345,84 @@
 #define RGB_BLUE	2	/* Offset of Blue */
 #define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
 
+#undef RGBX_FILLER_0XFF 	/* fill dummy bytes with 0xFF in RGBX format */
+
+
+/* SIMD support options: */
+
+#ifndef JSIMD_MMX_NOT_SUPPORTED
+#define JSIMD_ENCODER_MMX_SUPPORTED	/* Use MMX    in encoding process */
+#define JSIMD_DECODER_MMX_SUPPORTED	/* Use MMX    in decoding process */
+#endif
+#ifndef JSIMD_3DNOW_NOT_SUPPORTED
+#define JSIMD_ENCODER_3DNOW_SUPPORTED	/* Use 3DNow! in encoding process */
+#define JSIMD_DECODER_3DNOW_SUPPORTED	/* Use 3DNow! in decoding process */
+#endif
+#ifndef JSIMD_SSE_NOT_SUPPORTED
+#define JSIMD_ENCODER_SSE_SUPPORTED	/* Use SSE    in encoding process */
+#define JSIMD_DECODER_SSE_SUPPORTED	/* Use SSE    in decoding process */
+#endif
+#ifndef JSIMD_SSE2_NOT_SUPPORTED
+#define JSIMD_ENCODER_SSE2_SUPPORTED	/* Use SSE2   in encoding process */
+#define JSIMD_DECODER_SSE2_SUPPORTED	/* Use SSE2   in decoding process */
+#endif
+
+/* (encoder part): */
+
+#undef JFDCT_INT_QUANTIZE_WITH_DIVISION /* Use general quantization method */
+
+#if defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JCCOLOR_RGBYCC_MMX_SUPPORTED	/* RGB->YCC conversion with MMX */
+#define JCSAMPLE_MMX_SUPPORTED		/* downsampling with MMX */
+#define JFDCT_INT_MMX_SUPPORTED		/* forward DCT with MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE2_SUPPORTED)
+#define JCCOLOR_RGBYCC_SSE2_SUPPORTED	/* RGB->YCC conversion with SSE2 */
+#define JCSAMPLE_SSE2_SUPPORTED		/* downsampling with SSE2 */
+#define JFDCT_INT_SSE2_SUPPORTED	/* forward DCT with SSE2 */
+#endif
+#if defined(JSIMD_ENCODER_3DNOW_SUPPORTED) && \
+    defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JFDCT_FLT_3DNOW_MMX_SUPPORTED	/* forward DCT with 3DNow!/MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE_SUPPORTED) && \
+    defined(JSIMD_ENCODER_MMX_SUPPORTED)
+#define JFDCT_FLT_SSE_MMX_SUPPORTED	/* forward DCT with SSE/MMX */
+#endif
+#if defined(JSIMD_ENCODER_SSE_SUPPORTED) && \
+    defined(JSIMD_ENCODER_SSE2_SUPPORTED)
+#define JFDCT_FLT_SSE_SSE2_SUPPORTED	/* forward DCT with SSE/SSE2 */
+#endif
+
+/* (decoder part): */
+
+#if defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JDCOLOR_YCCRGB_MMX_SUPPORTED	/* YCC->RGB conversion with MMX */
+#define JDMERGE_MMX_SUPPORTED		/* merged upsampling with MMX */
+#define JDSAMPLE_FANCY_MMX_SUPPORTED	/* fancy upsampling with MMX */
+#define JDSAMPLE_SIMPLE_MMX_SUPPORTED	/* sloppy upsampling with MMX */
+#define JIDCT_INT_MMX_SUPPORTED		/* inverse DCT with MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE2_SUPPORTED)
+#define JDCOLOR_YCCRGB_SSE2_SUPPORTED	/* YCC->RGB conversion with SSE2 */
+#define JDMERGE_SSE2_SUPPORTED		/* merged upsampling with SSE2 */
+#define JDSAMPLE_FANCY_SSE2_SUPPORTED	/* fancy upsampling with SSE2 */
+#define JDSAMPLE_SIMPLE_SSE2_SUPPORTED	/* sloppy upsampling with SSE2 */
+#define JIDCT_INT_SSE2_SUPPORTED	/* inverse DCT with SSE2 */
+#endif
+#if defined(JSIMD_DECODER_3DNOW_SUPPORTED) && \
+    defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JIDCT_FLT_3DNOW_MMX_SUPPORTED	/* inverse DCT with 3DNow!/MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE_SUPPORTED) && \
+    defined(JSIMD_DECODER_MMX_SUPPORTED)
+#define JIDCT_FLT_SSE_MMX_SUPPORTED	/* inverse DCT with SSE/MMX */
+#endif
+#if defined(JSIMD_DECODER_SSE_SUPPORTED) && \
+    defined(JSIMD_DECODER_SSE2_SUPPORTED)
+#define JIDCT_FLT_SSE_SSE2_SUPPORTED	/* inverse DCT with SSE/SSE2 */
+#endif
+
 
 /* Definitions for speed-related optimizations. */
 
@@ -328,6 +435,9 @@
 #ifdef __GNUC__			/* for instance, GNU C knows about inline */
 #define INLINE __inline__
 #endif
+#ifdef _MSC_VER
+#define INLINE __inline
+#endif
 #ifndef INLINE
 #define INLINE			/* default is to define it as empty */
 #endif
diff --git a/jpegdll.def b/jpegdll.def
new file mode 100644
index 0000000..5a86cd6
--- /dev/null
+++ b/jpegdll.def
@@ -0,0 +1,73 @@
+;
+; jpegdll.def - module definition file for Win32 DLL
+;
+
+; sed -e "/\(jinit\|jpeg_simd_\(cpu\|os\|merged\)\)/d" -e "s/^EXTERN(..*) \([_A-Za-z][_A-Za-z0-9]*\).*/  \1/p" -e d jpeglib.h jpegint.h
+
+EXPORTS
+  ; API functions in jpeglib.h, which are intended
+  ; to be called by the user applications.
+  jpeg_std_error
+  jpeg_CreateCompress
+  jpeg_CreateDecompress
+  jpeg_destroy_compress
+  jpeg_destroy_decompress
+  jpeg_stdio_dest
+  jpeg_stdio_src
+  jpeg_set_defaults
+  jpeg_set_colorspace
+  jpeg_default_colorspace
+  jpeg_set_quality
+  jpeg_set_linear_quality
+  jpeg_add_quant_table
+  jpeg_quality_scaling
+  jpeg_simple_progression
+  jpeg_suppress_tables
+  jpeg_alloc_quant_table
+  jpeg_alloc_huff_table
+  jpeg_start_compress
+  jpeg_write_scanlines
+  jpeg_finish_compress
+  jpeg_write_raw_data
+  jpeg_write_marker
+  jpeg_write_m_header
+  jpeg_write_m_byte
+  jpeg_write_tables
+  jpeg_read_header
+  jpeg_start_decompress
+  jpeg_read_scanlines
+  jpeg_finish_decompress
+  jpeg_read_raw_data
+  jpeg_has_multiple_scans
+  jpeg_start_output
+  jpeg_finish_output
+  jpeg_input_complete
+  jpeg_new_colormap
+  jpeg_consume_input
+  jpeg_calc_output_dimensions
+  jpeg_save_markers
+  jpeg_set_marker_processor
+  jpeg_read_coefficients
+  jpeg_write_coefficients
+  jpeg_copy_critical_parameters
+  jpeg_abort_compress
+  jpeg_abort_decompress
+  jpeg_abort
+  jpeg_destroy
+  jpeg_resync_to_restart
+  ; Functions that are introduced by the SIMD extension.
+  jpeg_simd_support
+  jpeg_simd_mask
+  jpeg_simd_color_converter
+  jpeg_simd_downsampler
+  jpeg_simd_forward_dct
+  jpeg_simd_color_deconverter
+  jpeg_simd_upsampler
+  jpeg_simd_inverse_dct
+  ; Utility functions in jutils.c.
+  ; These are needed by some applications.
+  jdiv_round_up
+  jround_up
+  jcopy_sample_rows
+  jcopy_block_row
+  jzero_far
diff --git a/jpegdll.rc b/jpegdll.rc
new file mode 100644
index 0000000..fb3d327
--- /dev/null
+++ b/jpegdll.rc
@@ -0,0 +1,57 @@
+//
+// jpegdll.rc - version information for Win32 DLL
+//
+
+// from <winver.h>
+#define VS_VERSION_INFO         1
+#define VS_FFI_FILEFLAGSMASK    0x0000003FL
+#define VS_FF_DEBUG             0x00000001L
+#define VOS__WINDOWS32          0x00000004L
+#define VFT_DLL                 0x00000002L
+#define VFT2_UNKNOWN            0x00000000L
+
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION    6,2,1,2
+ PRODUCTVERSION 6,2,1,2
+ FILEFLAGSMASK  VS_FFI_FILEFLAGSMASK
+#ifdef _DEBUG
+ FILEFLAGS      VS_FF_DEBUG
+#else
+ FILEFLAGS      0x00000000L
+#endif
+ FILEOS         VOS__WINDOWS32
+ FILETYPE       VFT_DLL
+ FILESUBTYPE    VFT2_UNKNOWN
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "00000000"
+        BEGIN
+            VALUE "LegalCopyright",  "Copyright (C) 1991-1998 Thomas G. Lane\0"
+            VALUE "FileDescription", "Independent JPEG Group's JPEG Library"
+                                     " with SIMD support\0"
+            VALUE "ProductName", "The Independent JPEG Group's JPEG software"
+                                 " release 6b   with x86 SIMD extension for"
+                                 " IJG JPEG library version 1.02\0"
+            VALUE "Comments", "This is not an official binary from IJG.   "
+                              "The SIMD code in this DLL is copyright (C)"
+                              " 1999-2006 MIYASAKA Masaru.\0"
+            VALUE "FileVersion",      "6.2.1.02\0"
+            VALUE "ProductVersion",   "6.2.1.02\0"
+            VALUE "OriginalFilename", "jpeg62.dll\0"
+            VALUE "InternalName",     "jpeg62\0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x0, 0
+    END
+END
+
+/////////////////////////////////////////////////////////////////////////////
diff --git a/jpegint.h b/jpegint.h
index 95b00d4..511e07c 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : February 4, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file provides common declarations for the various JPEG modules.
  * These declarations are considered internal to the JPEG library; most
  * applications using the library shouldn't need to include this file.
@@ -291,6 +298,19 @@
 #endif
 
 
+/* SIMD Ext: This macro checks whether constants for SSE/SSE2 instructions
+ * are aligned to a 16-byte boundary. Most SSE/SSE2 instructions require
+ * that the memory operand be aligned to a 16-byte boundary; if not,
+ * a general-protection exception (#GP) is generated.
+ */
+ */
+
+#ifdef JSIMD_NO_SSECONST_ALIGNMENT_CHECK
+#define IS_CONST_ALIGNED_16(p)	(1)
+#else
+#define IS_CONST_ALIGNED_16(p)	(((unsigned)(p) & 0x0F) == 0)
+#endif
+
+
 /* Short forms of external names for systems with brain-damaged linkers. */
 
 #ifdef NEED_SHORT_EXTERNAL_NAMES
@@ -327,6 +347,8 @@
 #define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_simd_cpu_support	jSiCpuSupport
+#define jpeg_simd_os_support	jSiOsSupport
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 
@@ -382,6 +404,10 @@
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
+/* SIMD Ext: JSIMD_* bitmask of what the CPU has / what the OS lets run */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+EXTERN(unsigned int) jpeg_simd_os_support JPP((unsigned int simd));
+
 /* Suppress undefined-structure complaints if necessary. */
 
 #ifdef INCOMPLETE_TYPES_BROKEN
diff --git a/jpeglib.h b/jpeglib.h
index d1be8dd..0506316 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified for SIMD extension.
+ * Last Modified : February 4, 2006
+ * ---------------------------------------------------------------------
+ *
  * This file defines the application interface for the JPEG library.
  * Most applications using the library need only include this file,
  * and perhaps jerror.h if they want to know the exact error codes.
@@ -13,6 +20,10 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * First we include the configuration files that record how this
  * installation of the JPEG library is set up.  jconfig.h can be
@@ -33,6 +44,13 @@
 #define JPEG_LIB_VERSION  62	/* Version 6b */
 
 
+/* SIMD Ext: Version ID for the SIMD extension.
+ */
+
+#define JPEG_SIMDEXT_VERSION  102	/* version 1.02 */
+#define JPEG_SIMDEXT_VER_STR  "1.02"
+
+
 /* Various constants determining the sizes of things.
  * All of these are specified by the JPEG standard, so don't change them
  * if you want to be compatible.
@@ -235,6 +253,15 @@
 	JDITHER_FS		/* Floyd-Steinberg error diffusion dither */
 } J_DITHER_MODE;
 
+/* SIMD Ext: bitflags for jpeg_simd_support() and jpeg_simd_mask() */
+
+#define JSIMD_NONE    0x00	/* no SIMD instruction set */
+#define JSIMD_MMX     0x01	/* MMX */
+#define JSIMD_3DNOW   0x02	/* 3DNow! */
+#define JSIMD_SSE     0x04	/* SSE */
+#define JSIMD_SSE2    0x08	/* SSE2 */
+#define JSIMD_ALL     (JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2)
+
 
 /* Common fields between JPEG compression and decompression master structs. */
 
@@ -877,6 +904,18 @@
 #define jpeg_abort		jAbort
 #define jpeg_destroy		jDestroy
 #define jpeg_resync_to_restart	jResyncRestart
+#define jpeg_simd_support	jSiSupport
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+#define jpeg_simd_mask		jSiMask
+#endif
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+#define jpeg_simd_color_converter	jSiCColor
+#define jpeg_simd_downsampler		jSiDownsampler
+#define jpeg_simd_forward_dct		jSiFDCT
+#define jpeg_simd_color_deconverter	jSiDColor
+#define jpeg_simd_upsampler		jSiUpsampler
+#define jpeg_simd_inverse_dct		jSiIDCT
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
 
@@ -1037,6 +1076,24 @@
 EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
 					    int desired));
 
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_support JPP((j_common_ptr cinfo));
+#ifndef JSIMD_MASKFUNC_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_mask
+	JPP((j_common_ptr cinfo, unsigned int remove, unsigned int add));
+#endif
+#ifndef JSIMD_MODEINFO_NOT_SUPPORTED
+EXTERN(unsigned int) jpeg_simd_color_converter JPP((j_compress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_downsampler JPP((j_compress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_forward_dct JPP((j_compress_ptr cinfo,
+						int method));
+EXTERN(unsigned int) jpeg_simd_color_deconverter JPP((j_decompress_ptr cinfo));
+EXTERN(unsigned int) jpeg_simd_upsampler JPP((j_decompress_ptr cinfo,
+					      int do_fancy));
+EXTERN(unsigned int) jpeg_simd_inverse_dct JPP((j_decompress_ptr cinfo,
+						int method));
+#endif /* !JSIMD_MODEINFO_NOT_SUPPORTED */
+
 
 /* These marker codes are exported since applications and data source modules
  * are likely to want to use them.
@@ -1093,4 +1150,8 @@
 #include "jerror.h"		/* fetch error codes too */
 #endif
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* JPEGLIB_H */
diff --git a/jsimdcpu.asm b/jsimdcpu.asm
new file mode 100644
index 0000000..1c851d1
--- /dev/null
+++ b/jsimdcpu.asm
@@ -0,0 +1,112 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : August 23, 2005
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_CONST
+
+	alignz	16
+
+copyright:
+	db	" x86 SIMD ext for IJG lib V", JPEG_SIMDEXT_VER_STR
+	db	" Copyright 2006, MIYASAKA Masaru "
+
+	alignz	16
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+	align	16
+	global	EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+	push	ebx			; callee-saved; CPUID overwrites it
+;	push	ecx		; need not be preserved
+;	push	edx		; need not be preserved
+;	push	esi		; unused
+	push	edi
+
+	xor	edi,edi			; simd support flag
+
+	pushfd
+	pop	eax			; eax = current EFLAGS
+	mov	edx,eax			; edx = copy for the comparison below
+	xor	eax, 1<<21		; flip ID bit in EFLAGS
+	push	eax
+	popfd				; try to write the modified EFLAGS
+	pushfd
+	pop	eax			; read EFLAGS back
+	xor	eax,edx			; eax = bits that actually changed
+	jz	short .return		; CPUID is not supported
+
+	; Check for MMX, SSE and SSE2 instruction support
+	xor	eax,eax
+	cpuid				; CPUID function 0
+	test	eax,eax
+	jz	short .return		; function 1 is not available
+
+	xor	eax,eax
+	inc	eax
+	cpuid				; CPUID function 1
+	mov	eax,edx			; eax = Standard feature flags
+
+	test	eax, 1<<23		; bit23:MMX
+	jz	short .no_mmx
+	or	edi, byte JSIMD_MMX
+.no_mmx:
+	test	eax, 1<<25		; bit25:SSE
+	jz	short .no_sse
+	or	edi, byte JSIMD_SSE
+.no_sse:
+	test	eax, 1<<26		; bit26:SSE2
+	jz	short .no_sse2
+	or	edi, byte JSIMD_SSE2
+.no_sse2:
+
+	; Check for 3DNow! instruction support
+	mov	eax, 0x80000000
+	cpuid				; eax = highest extended function
+	cmp	eax, 0x80000000
+	jbe	short .return		; function 0x80000001 not available
+
+	mov	eax, 0x80000001
+	cpuid
+	mov	eax,edx			; eax = Extended feature flags
+
+	test	eax, 1<<31		; bit31:3DNow!(vendor independent)
+	jz	short .no_3dnow
+	or	edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+	mov	eax,edi			; return value = collected JSIMD_* flags
+
+	pop	edi
+;	pop	esi		; unused
+;	pop	edx		; need not be preserved
+;	pop	ecx		; need not be preserved
+	pop	ebx
+	ret
+
diff --git a/jsimddjg.asm b/jsimddjg.asm
new file mode 100644
index 0000000..02c82e4
--- /dev/null
+++ b/jsimddjg.asm
@@ -0,0 +1,130 @@
+;
+; jsimddjg.asm - SIMD instruction support check (for DJGPP V.2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : September 26, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the OS supports SIMD instructions (DJGPP V.2)
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_os_support (unsigned int simd)
+;
+
+%define EXCEPTION_ILLEGAL_INSTRUCTION	6	; vector number of #UD
+
+%define simd	ebp+8			; unsigned int simd
+%define mxcsr	ebp-4			; unsigned int mxcsr = 0x1F80
+
+	align	16
+	global	EXTN(jpeg_simd_os_support)
+
+EXTN(jpeg_simd_os_support):
+	push	ebp
+	mov	ebp,esp
+	push	dword 0x1F80		; default value of MXCSR register
+	push	ebx
+
+	push	DWORD [simd]	; simd_flags - modified from exception_handler
+
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0202	; Get Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+	push	ecx		; selector of old exception handler
+	push	edx		; offset   of old exception handler
+
+	mov	ecx,cs		; selector of new exception handler
+	mov	edx, exception_handler	; offset of new exception handler
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0203	; Set Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+
+	mov	eax, DWORD [simd]	; eax = flags requested by the caller
+
+	; If floating point emulation is enabled (CR0.EM = 1),
+	; executing an MMX/3DNow! instruction generates invalid
+	; opcode exception (#UD).
+
+	push	byte (.mmx_1 - .mmx_0)		; inst_bytes
+	push	byte (JSIMD_MMX | JSIMD_3DNOW)	; test_flags
+	test	eax, DWORD [esp]	; is any of test_flags requested?
+	jz	short .mmx_1		; no: skip the probe
+.mmx_0:	emms				; executing MMX instruction
+.mmx_1:	add	esp, byte 8		; discard test_flags and inst_bytes
+
+	push	byte (.sse_1 - .sse_0)		; inst_bytes
+	push	byte (JSIMD_SSE | JSIMD_SSE2)	; test_flags
+	test	eax, DWORD [esp]	; is any of test_flags requested?
+	jz	short .sse_1		; no: skip the probe
+.sse_0:	ldmxcsr	DWORD [mxcsr]		; executing SSE instruction
+.sse_1:	add	esp, byte 8		; discard test_flags and inst_bytes
+
+	pop	edx		; offset   of old exception handler
+	pop	ecx		; selector of old exception handler
+	mov	bl, EXCEPTION_ILLEGAL_INSTRUCTION
+	mov	ax, 0x0203	; Set Processor Exception Handler Vector
+	int	0x31		; DPMI function call
+
+	pop	eax		; return simd_flags
+	and	eax, byte JSIMD_ALL
+
+	pop	ebx
+	mov	esp,ebp		; also discards the mxcsr slot
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; LOCAL(void) far
+; exception_handler (unsigned long error_code,
+;                    void * context_eip, unsigned short context_cs,
+;                    unsigned long context_eflags,
+;                    void * context_esp, unsigned short context_ss);
+;
+
+%define error_code	esp+12+8	; unsigned long error_code
+%define context_eip	esp+12+12	; void * context_eip
+%define context_cs	esp+12+16	; unsigned short context_cs
+%define context_eflags	esp+12+20	; unsigned long context_eflags
+%define context_esp	esp+12+24	; void * context_esp
+%define context_ss	esp+12+28	; unsigned short context_ss
+
+%define test_flags(b)	(b)+0
+%define inst_bytes(b)	(b)+4
+%define simd_flags(b)	(b)+16
+
+	align	16
+
+exception_handler:
+	push	eax
+	push	ecx
+	push	edx			; 3 pushes = the "12" in the offsets above
+
+	mov	eax, POINTER [context_esp]	; eax = esp at the fault
+	mov	ecx, DWORD [test_flags(eax)]	; flags being probed
+	mov	edx, DWORD [inst_bytes(eax)]	; size of the probe instruction
+	not	ecx
+	add	POINTER [context_eip], edx	; next instruction
+	and	DWORD [simd_flags(eax)], ecx	; turn off flag
+
+	pop	edx
+	pop	ecx
+	pop	eax
+	retf
+
diff --git a/jsimdext.inc b/jsimdext.inc
new file mode 100644
index 0000000..a502c07
--- /dev/null
+++ b/jsimdext.inc
@@ -0,0 +1,347 @@
+;
+; jsimdext.inc - common declarations
+;
+; x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+;
+; Last Modified : February 4, 2006
+;
+; [TAB8]
+
+%ifndef JSIMDCFG_INCLUDED	; in case jsimdcfg.inc already did
+%include "jsimdcfg.inc"		; configuration declarations
+%endif
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .rdata align=16 public use32 class=CONST
+
+%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  align=16 public use32 class=CODE
+%define SEG_CONST   .data  align=16 public use32 class=DATA
+
+%elifdef ELF	; ----(nasm -felf -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
+%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
+%define EXTN(name)  name			; foo() -> foo
+
+%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
+
+%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
+%define SEG_CONST   .rodata align=16
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
+
+%else		; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT    .text
+%define SEG_CONST   .data
+
+%endif	; ----------------------------------------------
+
+; ==========================================================================
+
+; ---- jpeglib.h -----------------------------------------------------------
+
+%define DCTSIZE		8	; The basic DCT block is 8x8 samples
+%define DCTSIZE2	64	; DCTSIZE squared; # of elements in a block
+
+%define JSIMD_NONE	0x00	; bitflags for jpeg_simd_*_support()
+%define JSIMD_MMX	0x01
+%define JSIMD_3DNOW	0x02
+%define JSIMD_SSE	0x04
+%define JSIMD_SSE2	0x08
+%define JSIMD_ALL	(JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2)
+
+; ---- jpegint.h -----------------------------------------------------------
+
+; Short forms of external names for systems with brain-damaged linkers.
+;
+%ifdef NEED_SHORT_EXTERNAL_NAMES
+%define jpeg_simd_cpu_support	jSiCpuSupport
+%define jpeg_simd_os_support	jSiOsSupport
+%endif ; NEED_SHORT_EXTERNAL_NAMES
+
+; ---- jmorecfg.h ----------------------------------------------------------
+;
+; BITS_IN_JSAMPLE==8 (8-bit sample values) is the only valid setting
+; on this SIMD implementation.
+;
+%define BITS_IN_JSAMPLE	8	; Caution: Cannot be changed
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+%define JSAMPLE		byte		; unsigned char
+%define SIZEOF_JSAMPLE	SIZEOF_BYTE	; sizeof(JSAMPLE)
+%define MAXJSAMPLE	255
+%define CENTERJSAMPLE	128
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF		word		; short
+%define SIZEOF_JCOEF	SIZEOF_WORD	; sizeof(JCOEF)
+
+; INT32 must hold at least signed 32-bit values.
+; On this SIMD implementation, this must be 'long'.
+;
+%define INT32		dword		; long
+%define SIZEOF_INT32	SIZEOF_DWORD	; sizeof(INT32)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION		dword		; unsigned int
+%define SIZEOF_JDIMENSION	SIZEOF_DWORD	; sizeof(JDIMENSION)
+
+; --------------------------------------------------------------------------
+
+%define JSAMPROW		POINTER		; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY		POINTER		; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE		POINTER		; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR		POINTER		; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW		SIZEOF_POINTER	; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY	SIZEOF_POINTER	; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE	SIZEOF_POINTER	; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR		SIZEOF_POINTER	; sizeof(JCOEFPTR)
+
+%define POINTER			dword		; general pointer type
+%define SIZEOF_POINTER		SIZEOF_DWORD	; sizeof(POINTER)
+%define POINTER_BIT		DWORD_BIT	; sizeof(POINTER)*BYTE_BIT
+
+%define INT			dword		; signed integer type
+%define SIZEOF_INT		SIZEOF_DWORD	; sizeof(INT)
+%define INT_BIT			DWORD_BIT	; sizeof(INT)*BYTE_BIT
+
+%define FP32			dword		; IEEE754 single
+%define SIZEOF_FP32		SIZEOF_DWORD	; sizeof(FP32)
+%define FP32_BIT		DWORD_BIT	; sizeof(FP32)*BYTE_BIT
+
+%define FP64			qword		; IEEE754 double
+%define SIZEOF_FP64		SIZEOF_QWORD	; sizeof(FP64)
+%define FP64_BIT		QWORD_BIT	; sizeof(FP64)*BYTE_BIT
+
+%define FP80			tword		; IEEE754 double-extended(x86)
+%define SIZEOF_FP80		SIZEOF_TWORD	; sizeof(FP80)
+%define FP80_BIT		TWORD_BIT	; sizeof(FP80)*BYTE_BIT
+
+%define MMWORD			qword		; int64  (MMX register)
+%define SIZEOF_MMWORD		SIZEOF_QWORD	; sizeof(MMWORD)
+%define MMWORD_BIT		QWORD_BIT	; sizeof(MMWORD)*BYTE_BIT
+
+%define XMMWORD			dqword		; int128 (SSE register)
+%define SIZEOF_XMMWORD		SIZEOF_DQWORD	; sizeof(XMMWORD)
+%define XMMWORD_BIT		DQWORD_BIT	; sizeof(XMMWORD)*BYTE_BIT
+
+%define SIZEOF_BYTE		1		; sizeof(BYTE)
+%define SIZEOF_WORD		2		; sizeof(WORD)
+%define SIZEOF_DWORD		4		; sizeof(DWORD)
+%define SIZEOF_QWORD		8		; sizeof(QWORD)
+%define SIZEOF_TBYTE		10		; sizeof(TBYTE)
+%define SIZEOF_TWORD		10		; sizeof(TWORD)
+%define SIZEOF_DQWORD		16		; sizeof(DQWORD)
+
+%define BYTE_BIT		8		; CHAR_BIT in C
+%define WORD_BIT		16		; sizeof(WORD)*BYTE_BIT
+%define DWORD_BIT		32		; sizeof(DWORD)*BYTE_BIT
+%define QWORD_BIT		64		; sizeof(QWORD)*BYTE_BIT
+%define TBYTE_BIT		80		; sizeof(TBYTE)*BYTE_BIT
+%define TWORD_BIT		80		; sizeof(TWORD)*BYTE_BIT
+%define DQWORD_BIT		128		; sizeof(DQWORD)*BYTE_BIT
+
+%idefine TBYTE	TWORD	; NASM uses the keyword 'TWORD' instead of 'TBYTE'
+%idefine DQWORD		; currently not supported by NASM
+%idefine _MMWORD	;
+%idefine _DWORD		;
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)   _ %+ name		; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC ; -------------------------------------------
+
+%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+	SECTION	SEG_CONST
+const_base:
+
+%define GOTOFF(got,sym) (got) + (sym) - const_base
+
+%imacro get_GOT	1
+	; NOTE: this macro destroys ecx register.
+	call	%%geteip		; ecx = address of the next instruction
+	add	ecx, byte (%%ref - $)
+	jmp	short %%adjust
+%%geteip:
+	mov	ecx, POINTER [esp]	; fetch the return address
+	ret
+%%adjust:
+	push	ebp
+	xor	ebp,ebp		; ebp = 0
+%ifidni %1,ebx	; (%1 == ebx)
+	; db 0x8D,0x9C + jmp near const_base =
+	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+	db	0x8D,0x9C		; 8D,9C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+	; db 0x8D,0x8C + jmp near const_base =
+	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+	db	0x8D,0x8C		; 8D,8C
+	jmp	near const_base		; E9,(const_base-%%ref)
+%%ref:	mov	%1, ecx
+%endif ; (%1 == ebx)
+	pop	ebp
+%endmacro
+
+%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT	1
+	extern	GOT_SYMBOL
+	call	%%geteip
+	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+	jmp	short %%done
+%%geteip:
+	mov	%1, POINTER [esp]
+	ret
+%%done:
+%endmacro
+
+%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic	1.nolist
+	push	%1
+%endmacro
+%imacro poppic	1.nolist
+	pop	%1
+%endmacro
+%imacro movpic	2.nolist
+	mov	%1,%2
+%endmacro
+
+%else	; !PIC -----------------------------------------
+
+%define GOTOFF(got,sym) (sym)
+
+%imacro get_GOT	1.nolist
+%endmacro
+%imacro pushpic	1.nolist
+%endmacro
+%imacro poppic	1.nolist
+%endmacro
+%imacro movpic	2.nolist
+%endmacro
+
+%endif	;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary, like
+;  ".balign n,,m" in GNU as.  MSKLE(x,y) is nonzero only if x <= y;
+;  FILLB(b,n) = bytes from b to the next n-byte boundary of the section.
+%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b,n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
+	       db 0x90                               ; nop
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
+	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
+	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
+	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
+	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
+	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
+	       db 0x8B,0xED                          ; mov ebp,ebp
+	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
+	       db 0x90                               ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+	align %1, db 0		; filling zeros
+%endmacro
+
+; --------------------------------------------------------------------------
diff --git a/jsimdgcc.c b/jsimdgcc.c
new file mode 100644
index 0000000..d6ad75b
--- /dev/null
+++ b/jsimdgcc.c
@@ -0,0 +1,95 @@
+/*
+ * jsimdgcc.c - SIMD instruction support check (gcc)
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * Last Modified : January 24, 2006
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#include <setjmp.h>
+#include <signal.h>
+
+
+static volatile int lockf /* = 0 */;
+static jmp_buf jmpbuf;
+
+
+/*
+ * SIGILL handler: reset SIGILL to SIG_DFL, then longjmp() back to jmpbuf
+ */
+
+LOCAL(void)
+exception_handler (int sig)
+{
+  signal(SIGILL, SIG_DFL);	/* restore the default SIGILL action */
+  longjmp(jmpbuf, 1);		/* setjmp(jmpbuf) now returns 1 */
+}
+
+
+/*
+ * Check if the OS supports SIMD instructions.  Returns 'simd' with the
+ * flags of every family whose probe instruction raised SIGILL cleared.
+ */
+
+GLOBAL(unsigned int)
+jpeg_simd_os_support (unsigned int simd)
+{
+#ifdef __GNUC__		/* gcc (i386) */
+  unsigned int mxcsr = 0x1F80;
+  void (*old_handler) (int);	/* SIGILL disposition to put back */
+
+  /* enter critical section (numeric labels: safe if this asm is duplicated) */
+  __asm__ __volatile__ (
+  "1:                         \n\t"
+    "movl  $1,%%eax           \n\t"
+    "xchgl %0,%%eax           \n\t"	/* try to get lock */
+    "cmpl  $0,%%eax           \n\t"	/* test if successful */
+    "je    3f                 \n"
+  "2:                         \n\t"
+  /*".byte 0xF3,0x90          \n\t"*/	/* "pause" on P4 (short delay) */
+    "cmpl  $0,%0              \n\t"	/* check if lock is free */
+    "jne   2b                 \n\t"
+    "jmp   1b                 \n"
+  "3:                         \n\t"
+     : "=m" (lockf) : "m" (lockf) : "%eax", "cc", "memory"
+  );
+
+  /* If floating point emulation is enabled (CR0.EM = 1), executing an
+   * MMX/3DNow! instruction generates invalid opcode exception (#UD).
+   * The SIGILL handler returned by signal() is put back after each probe.
+   */
+  if (simd & (JSIMD_MMX | JSIMD_3DNOW)) {
+    old_handler = signal(SIGILL, exception_handler);
+    if (!setjmp(jmpbuf)) {
+      __asm__ __volatile__ (
+        ".byte 0x0F,0x77"		/* emms */
+      );
+    } else {
+      simd &= ~(JSIMD_MMX | JSIMD_3DNOW);
+    }
+    signal(SIGILL, old_handler);
+  }
+  if (simd & (JSIMD_SSE | JSIMD_SSE2)) {
+    old_handler = signal(SIGILL, exception_handler);
+    if (!setjmp(jmpbuf)) {
+      __asm__ __volatile__ (
+        "leal  %0,%%eax        \n\t"
+        ".byte 0x0F,0xAE,0x10  \n\t"	/* ldmxcsr [eax] */
+         : : "m" (mxcsr) : "%eax"
+      );
+    } else {
+      simd &= ~(JSIMD_SSE | JSIMD_SSE2);
+    }
+    signal(SIGILL, old_handler);
+  }
+
+  /* leave critical section */
+  lockf = 0;	/* release lock */
+#endif	/* __GNUC__ */
+  return simd;
+}
diff --git a/jsimdw32.asm b/jsimdw32.asm
new file mode 100644
index 0000000..7f2cdbc
--- /dev/null
+++ b/jsimdw32.asm
@@ -0,0 +1,121 @@
+;
+; jsimdw32.asm - SIMD instruction support check (for Win32)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; Last Modified : September 26, 2004
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+	SECTION	SEG_TEXT
+	BITS	32
+;
+; Check if the OS supports SIMD instructions (Win32)
+;
+; Reference: "Win32 Exception handling for assembler programmers"
+;               http://www.jorgon.freeserve.co.uk/Except/Except.htm
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_os_support (unsigned int simd)
+;
+
+%define simd	ebp+8			; unsigned int simd
+%define mxcsr	ebp-4			; unsigned int mxcsr = 0x1F80
+
+	align	16
+	global	EXTN(jpeg_simd_os_support)
+
+EXTN(jpeg_simd_os_support):
+	push	ebp
+	mov	ebp,esp
+	push	dword 0x1F80		; default value of MXCSR register
+	push	exception_handler	; handler field of the new record
+	push	POINTER [fs:0]		; prev_record_ptr
+	mov	POINTER [fs:0], esp	; this_record_ptr
+
+	mov	eax, DWORD [simd]
+	and	eax, byte JSIMD_ALL	; eax = result; handler clears bits in it
+	xor	ecx,ecx
+	xor	edx,edx
+
+	; If floating point emulation is enabled (CR0.EM = 1),
+	; executing an MMX/3DNow! instruction generates invalid
+	; opcode exception (#UD).
+
+	mov	cl, (JSIMD_MMX | JSIMD_3DNOW)	; ecx = flags under test
+	mov	dl, (.mmx_1 - .mmx_0)		; edx = size of the probe
+	test	al,cl
+	jz	short .mmx_1			; not requested: skip the probe
+.mmx_0:	emms				; executing MMX instruction
+.mmx_1:
+	mov	cl, (JSIMD_SSE | JSIMD_SSE2)	; ecx = flags under test
+	mov	dl, (.sse_1 - .sse_0)		; edx = size of the probe
+	test	al,cl
+	jz	short .sse_1			; not requested: skip the probe
+.sse_0:	ldmxcsr	DWORD [mxcsr]		; executing SSE instruction
+.sse_1:
+
+	pop	POINTER [fs:0]		; prev_record_ptr
+	mov	esp,ebp			; drops the handler and mxcsr slots
+	pop	ebp
+	ret
+
+; --------------------------------------------------------------------------
+;
+; LOCAL(EXCEPTION_DISPOSITION)
+; exception_handler (struct _EXCEPTION_RECORD * ExceptionRecord,
+;                    void * EstablisherFrame, struct _CONTEXT * ContextRecord,
+;                    void * DispatcherContext);
+;
+
+%define ExceptionContinueExecution  0	; from <excpt.h>
+%define ExceptionContinueSearch     1	; typedef enum _EXCEPTION_DISPOSITION {
+%define ExceptionNestedException    2	;   ...
+%define ExceptionCollidedUnwind     3	; } EXCEPTION_DISPOSITION
+
+%define EXCEPTION_ILLEGAL_INSTRUCTION   0xC000001D	; from <winbase.h>
+
+%define ExceptionRecord     esp+4	; struct _EXCEPTION_RECORD *
+%define EstablisherFrame    esp+8	; void * EstablisherFrame
+%define ContextRecord       esp+12	; struct _CONTEXT * ContextRecord
+%define DispatcherContext   esp+16	; void * DispatcherContext
+
+%define ExceptionCode(b)    (b)+0	; ExceptionRecord->ExceptionCode
+%define ExceptionFlags(b)   (b)+4	; ExceptionRecord->ExceptionFlags
+%define Context_Edx(b)      (b)+168	; ContextRecord->Edx
+%define Context_Ecx(b)      (b)+172	; ContextRecord->Ecx
+%define Context_Eax(b)      (b)+176	; ContextRecord->Eax
+%define Context_Eip(b)      (b)+184	; ContextRecord->Eip
+
+	align	16
+
+exception_handler:
+	mov	edx, POINTER [ExceptionRecord]
+	mov	eax, ExceptionContinueSearch	; default return value
+
+	cmp	DWORD [ExceptionFlags(edx)], byte 0
+	jne	short .return			; noncontinuable exception
+	cmp	DWORD [ExceptionCode(edx)], EXCEPTION_ILLEGAL_INSTRUCTION
+	jne	short .return			; not a #UD exception
+
+	mov	eax, POINTER [ContextRecord]
+	mov	ecx, DWORD [Context_Ecx(eax)]	; flags under test
+	mov	edx, DWORD [Context_Edx(eax)]	; size of the probe instruction
+	not	ecx
+	add	DWORD [Context_Eip(eax)], edx	; next instruction
+	and	DWORD [Context_Eax(eax)], ecx	; turn off flag
+	mov	eax, ExceptionContinueExecution
+.return:
+	ret
+
diff --git a/libjpeg.spec b/libjpeg.spec
new file mode 100644
index 0000000..2c9c224
--- /dev/null
+++ b/libjpeg.spec
@@ -0,0 +1,234 @@
+%define LIBVER 62.1.0
+Summary: A library for manipulating JPEG image format files (with SIMD support)
+Summary(ja): JPEG 形式画像ファイルを扱う為のライブラリ (x86 SIMD 対応版)
+Name: libjpeg
+Version: 6bx1.02
+Release: 1
+License: distributable
+Group: System Environment/Libraries
+Source0: http://cetus.sakura.ne.jp/softlab/jpeg-x86simd/sources/jpegsrc-6b-x86simd-1.02.tar.bz2
+Buildroot: %{_tmppath}/%{name}-%{version}-root
+ExclusiveArch: %{ix86}
+BuildPrereq: nasm >= 0.98.25
+
+%package devel
+Summary: Development tools for programs which will use the libjpeg library.
+Summary(ja): libjpeg ライブラリを使うプログラム向け開発ツール
+Group: Development/Libraries
+Requires: libjpeg = %{version}-%{release}
+
+%description
+The libjpeg package contains a library of functions for manipulating
+JPEG images, as well as simple client programs for accessing the
+libjpeg functions.  Libjpeg client programs include cjpeg, djpeg,
+jpegtran, rdjpgcom and wrjpgcom.  Cjpeg compresses an image file into
+JPEG format.  Djpeg decompresses a JPEG file into a regular image
+file.  Jpegtran can perform various useful transformations on JPEG
+files.  Rdjpgcom displays any text comments included in a JPEG file.
+Wrjpgcom inserts text comments into a JPEG file.
+
+The libjpeg library in this package uses SIMD instructions if available.
+On a processor that supports SIMD instructions (MMX, SSE, etc),
+it runs 2-3 times faster than the original version of libjpeg.
+
+%description -l ja
+libjpeg パッケージには JPEG 画像を扱う為に必要なライブラリと，
+libjpeg 関数にアクセスする為の簡単なクライアントプログラムが
+収められています．libjpeg クライアントプログラムには cjpeg, djpeg,
+jpegtran, rdjpgcom, wrjpgcom があります．cjpeg は画像ファイルを
+JPEG 形式に圧縮します．djpeg は JPEG ファイルを通常の画像ファイルに
+展開します．jpegtran は JPEG ファイルに様々な変換を施すことが出来ます．
+rdjpgcom は JPEG ファイルに含まれているテキスト形式のコメントを表示し，
+wrjpgcom は JPEG ファイルにテキスト形式のコメントを追加します．
+
+このパッケージに収められている libjpeg ライブラリは、x86 SIMD 対応版です。
+MMX や SSE などの SIMD 演算機能を装備しているプロセッサ上で動作させると、
+オリジナル版の libjpeg ライブラリと比較して 2〜3倍程度の速度で動作します。
+
+%description devel
+The libjpeg-devel package includes the header files and static libraries
+necessary for developing programs which will manipulate JPEG files using
+the libjpeg library.
+
+If you are going to develop programs which will manipulate JPEG images,
+you should install libjpeg-devel.  You'll also need to have the libjpeg
+package installed.
+
+%description devel -l ja
+libjpeg-devel パッケージには，libjpeg ライブラリを使って JPEG ファイルを
+扱うプログラムを開発するのに必要なヘッダファイルとスタティックライブラリが
+収められています．
+
+JPEG 画像を扱うプログラムを開発する際には，libjpeg-devel を
+インストールして下さい．同時に libjpeg パッケージもインストールする
+必要があります．
+
+%prep
+%setup -q -n jpeg-6bx
+# suppress "libtoolize --copy --force"
+mv configure.in configure.in_
+
+%build
+%configure --enable-shared --enable-static
+
+make libdir=%{_libdir} %{?_smp_mflags}
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD make test
+
+%install
+rm -rf $RPM_BUILD_ROOT
+
+%makeinstall
+#strip -R .comment $RPM_BUILD_ROOT/usr/bin/* || :
+#/sbin/ldconfig -n $RPM_BUILD_ROOT/%{_libdir}
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%doc usage.doc wizard.doc README
+%{_libdir}/libjpeg.so.*
+%{_bindir}/*
+%{_mandir}/*/*
+
+%files devel
+%defattr(-,root,root)
+%doc libjpeg.doc coderules.doc structure.doc example.c
+%doc simd_*.txt
+%{_libdir}/*.a
+%{_libdir}/*.la
+%{_libdir}/*.so
+/usr/include/*.h
+
+%changelog
+* Sat Feb 04 2006 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.02-1
+- upgraded to 6bx1.02
+
+* Thu Jan 26 2006 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.01-1
+- upgraded to 6bx1.01
+
+* Thu Mar 24 2005 MIYASAKA Masaru <alkaid@coral.ocn.ne.jp> - 6bx1.0-1
+- based on 6b-33 from Fedora Core 3 and modified for SIMD-extended libjpeg
+- added Japanese summary and description, which are derived from Vine Linux
+- moved wizard.doc to main package
+
+* Thu Oct  7 2004 Matthias Clasen <mclasen@redhat.com> - 6b-33
+- Add URL.  (#134791)
+
+* Tue Jun 15 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Tue Mar 02 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Fri Feb 13 2004 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Thu Sep 25 2003 Jeremy Katz <katzj@redhat.com> 6b-30
+- rebuild to fix gzipped file md5sums (#91211)
+
+* Tue Sep 23 2003 Florian La Roche <Florian.LaRoche@redhat.de>
+- do not set rpath
+
+* Wed Jun 04 2003 Elliot Lee <sopwith@redhat.com>
+- rebuilt
+
+* Thu Feb 13 2003 Elliot Lee <sopwith@redhat.com> 6b-27
+- Add libjpeg-shared.patch to fix shlibs on powerpc
+
+* Tue Feb 04 2003 Florian La Roche <Florian.LaRoche@redhat.de>
+- add symlink to shared lib
+
+* Wed Jan 22 2003 Tim Powers <timp@redhat.com>
+- rebuilt
+
+* Mon Jan  6 2003 Jonathan Blandford <jrb@redhat.com>
+- add docs, #76508
+
+* Fri Dec 13 2002 Elliot Lee <sopwith@redhat.com> 6b-23
+- Merge in multilib changes
+- _smp_mflags
+
+* Tue Sep 10 2002 Than Ngo <than@redhat.com> 6b-22
+- use %%_libdir
+
+* Fri Jun 21 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Thu May 23 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Thu Jan 31 2002 Bernhard Rosenkraenzer <bero@redhat.com> 6b-19
+- Fix bug #59011
+
+* Mon Jan 28 2002 Bernhard Rosenkraenzer <bero@redhat.com> 6b-18
+- Fix bug #58982
+
+* Wed Jan 09 2002 Tim Powers <timp@redhat.com>
+- automated rebuild
+
+* Tue Jul 24 2001 Bill Nottingham <notting@redhat.com>
+- require libjpeg = %%{version}
+
+* Sun Jun 24 2001 Elliot Lee <sopwith@redhat.com>
+- Bump release + rebuild.
+
+* Mon Dec 11 2000 Than Ngo <than@redhat.com>
+- rebuilt with the fixed fileutils
+- use %%{_tmppath}
+
+* Wed Nov  8 2000 Bernhard Rosenkraenzer <bero@redhat.com>
+- fix a typo (strip -R .comment, not .comments)
+
+* Thu Jul 13 2000 Prospector <bugzilla@redhat.com>
+- automatic rebuild
+
+* Sat Jun 17 2000 Bernhard Rosenkraenzer <bero@redhat.com>
+- FHSify
+- add some C++ tweaks to the headers (as suggested by bug #9822)
+
+* Fri May  5 2000 Bill Nottingham <notting@redhat.com>
+- configure tweaks for ia64; remove alpha patch (it's pointless)
+
+* Sat Feb  5 2000 Bernhard Rosenkränzer <bero@redhat.com>
+- rebuild to get compressed man pages
+- fix description
+- some minor tweaks to the spec file
+- add docs
+- fix build on alpha (alphaev6 stuff)
+
+* Sun Mar 21 1999 Cristian Gafton <gafton@redhat.com> 
+- auto rebuild in the new build environment (release 9)
+
+* Wed Jan 13 1999 Cristian Gafton <gafton@redhat.com>
+- patch to build on arm
+- build for glibc 2.1
+
+* Mon Oct 12 1998 Cristian Gafton <gafton@redhat.com>
+- strip binaries
+
+* Mon Aug  3 1998 Jeff Johnson <jbj@redhat.com>
+- fix buildroot problem.
+
+* Tue Jun 09 1998 Prospector System <bugs@redhat.com>
+- translations modified for de
+
+* Thu Jun 04 1998 Marc Ewing <marc@redhat.com>
+- up to release 4
+- remove patch that set (improper) soname - libjpeg now does it itself
+
+* Thu May 07 1998 Prospector System <bugs@redhat.com>
+- translations modified for de, fr, tr
+
+* Fri May 01 1998 Cristian Gafton <gafton@redhat.com>
+- fixed build on manhattan
+
+* Wed Apr 08 1998 Cristian Gafton <gafton@redhat.com>
+- upgraded to version 6b
+
+* Wed Oct 08 1997 Donnie Barnes <djb@redhat.com>
+- new package to remove jpeg stuff from libgr and put in its own package
diff --git a/ltconfig b/ltconfig
deleted file mode 100755
index 2347e69..0000000
--- a/ltconfig
+++ /dev/null
@@ -1,1512 +0,0 @@
-#! /bin/sh
-
-# ltconfig - Create a system-specific libtool.
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# A lot of this script is taken from autoconf-2.10.
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-echo=echo
-if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
-else
-  # The Solaris and AIX default echo program unquotes backslashes.
-  # This makes it impossible to quote backslashes using
-  #   echo "$something" | sed 's/\\/\\\\/g'
-  # So, we emulate echo with printf '%s\n'
-  echo="printf %s\\n"
-  if test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then :
-  else
-    # Oops.  We have no working printf.  Try to find a not-so-buggy echo.
-    echo=echo
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    for dir in $PATH /usr/ucb; do
-      if test -f $dir/echo && test "X`$dir/echo '\t'`" = 'X\t'; then
-        echo="$dir/echo"
-        break
-      fi
-    done
-    IFS="$save_ifs"
-  fi
-fi
-
-# Sed substitution that helps us do robust quoting.  It backslashifies
-# metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
-sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
-
-# Same as above, but do not quote variable references.
-double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
-
-# The name of this program.
-progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
-
-# Constants:
-PROGRAM=ltconfig
-PACKAGE=libtool
-VERSION=1.2
-ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.c 1>&5'
-ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.c $LIBS 1>&5'
-rm="rm -f"
-
-help="Try \`$progname --help' for more information."
-
-# Global variables:
-can_build_shared=yes
-enable_shared=yes
-# All known linkers require a `.a' archive for static linking.
-enable_static=yes
-ltmain=
-silent=
-srcdir=
-ac_config_guess=
-ac_config_sub=
-host=
-nonopt=
-verify_host=yes
-with_gcc=no
-with_gnu_ld=no
-
-old_AR="$AR"
-old_CC="$CC"
-old_CFLAGS="$CFLAGS"
-old_CPPFLAGS="$CPPFLAGS"
-old_LD="$LD"
-old_LN_S="$LN_S"
-old_NM="$NM"
-old_RANLIB="$RANLIB"
-
-# Parse the command line options.
-args=
-prev=
-for option
-do
-  case "$option" in
-  -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
-  *) optarg= ;;
-  esac
-
-  # If the previous option needs an argument, assign it.
-  if test -n "$prev"; then
-    eval "$prev=\$option"
-    prev=
-    continue
-  fi
-
-  case "$option" in
-  --help) cat <<EOM
-Usage: $progname [OPTION]... LTMAIN [HOST]
-
-Generate a system-specific libtool script.
-
-    --disable-shared       do not build shared libraries
-    --disable-static       do not build static libraries
-    --help                 display this help and exit
-    --no-verify            do not verify that HOST is a valid host type
-    --quiet                same as \`--silent'
-    --silent               do not print informational messages
-    --srcdir=DIR           find \`config.guess' in DIR
-    --version              output version information and exit
-    --with-gcc             assume that the GNU C compiler will be used
-    --with-gnu-ld          assume that the C compiler uses the GNU linker
-
-LTMAIN is the \`ltmain.sh' shell script fragment that provides basic libtool
-functionality.
-
-HOST is the canonical host system name [default=guessed].
-EOM
-  exit 0
-  ;;
-
-  --disable-shared) enable_shared=no ;;
-
-  --disable-static) enable_static=no ;;
-
-  --quiet | --silent) silent=yes ;;
-
-  --srcdir) prev=srcdir ;;
-  --srcdir=*) srcdir="$optarg" ;;
-
-  --no-verify) verify_host=no ;;
-
-  --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION"; exit 0 ;;
-
-  --with-gcc) with_gcc=yes ;;
-  --with-gnu-ld) with_gnu_ld=yes ;;
-
-  -*)
-    echo "$progname: unrecognized option \`$option'" 1>&2
-    echo "$help" 1>&2
-    exit 1
-    ;;
-
-  *)
-    if test -z "$ltmain"; then
-      ltmain="$option"
-    elif test -z "$host"; then
-# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
-#      if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
-#        echo "$progname: warning \`$option' is not a valid host type" 1>&2
-#      fi
-      host="$option"
-    else
-      echo "$progname: too many arguments" 1>&2
-      echo "$help" 1>&2
-      exit 1
-    fi ;;
-  esac
-done
-
-if test -z "$ltmain"; then
-  echo "$progname: you must specify a LTMAIN file" 1>&2
-  echo "$help" 1>&2
-  exit 1
-fi
-
-if test -f "$ltmain"; then :
-else
-  echo "$progname: \`$ltmain' does not exist" 1>&2
-  echo "$help" 1>&2
-  exit 1
-fi
-
-# Quote any args containing shell metacharacters.
-ltconfig_args=
-for arg
-do
-  case "$arg" in
-  *" "*|*"	"*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
-  ltconfig_args="$ltconfig_args '$arg'" ;;
-  *) ltconfig_args="$ltconfig_args $arg" ;;
-  esac
-done
-
-# A relevant subset of AC_INIT.
-
-# File descriptor usage:
-# 0 standard input
-# 1 file creation
-# 2 errors and warnings
-# 3 some systems may open it to /dev/tty
-# 4 used on the Kubota Titan
-# 5 compiler messages saved in config.log
-# 6 checking for... messages and results
-if test "$silent" = yes; then
-  exec 6>/dev/null
-else
-  exec 6>&1
-fi
-exec 5>>./config.log
-
-# NLS nuisances.
-# Only set LANG and LC_ALL to C if already set.
-# These must not be set unconditionally because not all systems understand
-# e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}"   = set; then LANG=C;   export LANG;   fi
-
-if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
-  # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
-  if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
-    ac_n= ac_c='
-' ac_t='	'
-  else
-    ac_n=-n ac_c= ac_t=
-  fi
-else
-  ac_n= ac_c='\c' ac_t=
-fi
-
-if test -z "$srcdir"; then
-  # Assume the source directory is the same one as the path to ltmain.sh.
-  srcdir=`$echo "$ltmain" | $Xsed -e 's%/[^/]*$%%'`
-  test "$srcdir" = "$ltmain" && srcdir=.
-fi
-
-trap "$rm conftest*; exit 1" 1 2 15
-if test "$verify_host" = yes; then
-  # Check for config.guess and config.sub.
-  ac_aux_dir=
-  for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
-    if test -f $ac_dir/config.guess; then
-      ac_aux_dir=$ac_dir
-      break
-    fi
-  done
-  if test -z "$ac_aux_dir"; then
-    echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
-    echo "$help" 1>&2
-    exit 1
-  fi
-  ac_config_guess=$ac_aux_dir/config.guess
-  ac_config_sub=$ac_aux_dir/config.sub
-
-  # Make sure we can run config.sub.
-  if $ac_config_sub sun4 >/dev/null 2>&1; then :
-  else
-    echo "$progname: cannot run $ac_config_sub" 1>&2
-    echo "$help" 1>&2
-    exit 1
-  fi
-
-  echo $ac_n "checking host system type""... $ac_c" 1>&6
-
-  host_alias=$host
-  case "$host_alias" in
-  "")
-    if host_alias=`$ac_config_guess`; then :
-    else
-      echo "$progname: cannot guess host type; you must specify one" 1>&2
-      echo "$help" 1>&2
-      exit 1
-    fi ;;
-  esac
-  host=`$ac_config_sub $host_alias`
-  echo "$ac_t$host" 1>&6
-
-  # Make sure the host verified.
-  test -z "$host" && exit 1
-
-elif test -z "$host"; then
-  echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
-  echo "$help" 1>&2
-  exit 1
-else
-  host_alias=$host
-fi
-
-# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
-case "$host_os" in
-linux-gnu*) ;;
-linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
-esac
-
-host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
-host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
-host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
-
-case "$host_os" in
-aix3*)
-  # AIX sometimes has problems with the GCC collect2 program.  For some
-  # reason, if we set the COLLECT_NAMES environment variable, the problems
-  # vanish in a puff of smoke.
-  if test "${COLLECT_NAMES+set}" != set; then
-    COLLECT_NAMES=
-    export COLLECT_NAMES
-  fi
-  ;;
-esac
-
-# Determine commands to create old-style static archives.
-old_archive_cmds='$AR cru $oldlib$oldobjs'
-old_postinstall_cmds='chmod 644 $oldlib'
-old_postuninstall_cmds=
-
-# Set a sane default for `AR'.
-test -z "$AR" && AR=ar
-
-# If RANLIB is not set, then run the test.
-if test "${RANLIB+set}" != "set"; then
-  result=no
-
-  echo $ac_n "checking for ranlib... $ac_c" 1>&6
-  IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-  for dir in $PATH; do
-    test -z "$dir" && dir=.
-    if test -f $dir/ranlib; then
-      RANLIB="ranlib"
-      result="ranlib"
-      break
-    fi
-  done
-  IFS="$save_ifs"
-
-  echo "$ac_t$result" 1>&6
-fi
-
-if test -n "$RANLIB"; then
-  old_archive_cmds="$old_archive_cmds;\$RANLIB \$oldlib"
-  old_postinstall_cmds="\$RANLIB \$oldlib;$old_postinstall_cmds"
-fi
-
-# Check to see if we are using GCC.
-if test "$with_gcc" != yes || test -z "$CC"; then
-  # If CC is not set, then try to find GCC or a usable CC.
-  if test -z "$CC"; then
-    echo $ac_n "checking for gcc... $ac_c" 1>&6
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    for dir in $PATH; do
-      IFS="$save_ifs"
-      test -z "$dir" && dir=.
-      if test -f $dir/gcc; then
-	CC="gcc"
-	break
-      fi
-    done
-    IFS="$save_ifs"
-
-    if test -n "$CC"; then
-      echo "$ac_t$CC" 1>&6
-    else
-      echo "$ac_t"no 1>&6
-    fi
-  fi
-
-  # Not "gcc", so try "cc", rejecting "/usr/ucb/cc".
-  if test -z "$CC"; then
-    echo $ac_n "checking for cc... $ac_c" 1>&6
-    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS="${IFS}:"
-    cc_rejected=no
-    for dir in $PATH; do
-      test -z "$dir" && dir=.
-      if test -f $dir/cc; then
-	if test "$dir/cc" = "/usr/ucb/cc"; then
-	  cc_rejected=yes
-	  continue
-	fi
-	CC="cc"
-	break
-      fi
-    done
-    IFS="$save_ifs"
-    if test $cc_rejected = yes; then
-      # We found a bogon in the path, so make sure we never use it.
-      set dummy $CC
-      shift
-      if test $# -gt 0; then
-	# We chose a different compiler from the bogus one.
-	# However, it has the same name, so the bogon will be chosen
-	# first if we set CC to just the name; use the full file name.
-	shift
-	set dummy "$dir/cc" "$@"
-	shift
-	CC="$@"
-      fi
-    fi
-
-    if test -n "$CC"; then
-      echo "$ac_t$CC" 1>&6
-    else
-      echo "$ac_t"no 1>&6
-    fi
-
-    if test -z "$CC"; then
-      echo "$progname: error: no acceptable cc found in \$PATH" 1>&2
-      exit 1
-    fi
-  fi
-
-  # Now see if the compiler is really GCC.
-  with_gcc=no
-  echo $ac_n "checking whether we are using GNU C... $ac_c" 1>&6
-  echo "$progname:424: checking whether we are using GNU C" >&5
-
-  $rm conftest.c
-  cat > conftest.c <<EOF
-#ifdef __GNUC__
-  yes;
-#endif
-EOF
-  if { ac_try='${CC-cc} -E conftest.c'; { (eval echo $progname:432: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
-    with_gcc=yes
-  fi
-  $rm conftest.c
-  echo "$ac_t$with_gcc" 1>&6
-fi
-
-# Allow CC to be a program name with arguments.
-set dummy $CC
-compiler="$2"
-
-echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
-pic_flag=
-special_shlib_compile_flags=
-wl=
-link_static_flag=
-no_builtin_flag=
-
-if test "$with_gcc" = yes; then
-  wl='-Wl,'
-  link_static_flag='-static'
-  no_builtin_flag=' -fno-builtin'
-
-  case "$host_os" in
-  aix3* | aix4* | irix5* | irix6* | osf3* | osf4*)
-    # PIC is the default for these OSes.
-    ;;
-  os2*)
-    # We can build DLLs from non-PIC.
-    ;;
-  amigaos*)
-    # FIXME: we need at least 68020 code to build shared libraries, but
-    # adding the `-m68020' flag to GCC prevents building anything better,
-    # like `-m68040'.
-    pic_flag='-m68020 -resident32 -malways-restore-a4'
-    ;;
-  *)
-    pic_flag='-fPIC'
-    ;;
-  esac
-else
-  # PORTME Check for PIC flags for the system compiler.
-  case "$host_os" in
-  aix3* | aix4*)
-    # All AIX code is PIC.
-    link_static_flag='-bnso -bI:/lib/syscalls.exp'
-    ;;
-
-  hpux9* | hpux10*)
-    # Is there a better link_static_flag that works with the bundled CC?
-    wl='-Wl,'
-    link_static_flag="${wl}-a ${wl}archive"
-    pic_flag='+Z'
-    ;;
-
-  irix5* | irix6*)
-    wl='-Wl,'
-    link_static_flag='-non_shared'
-    # PIC (with -KPIC) is the default.
-    ;;
-
-  os2*)
-    # We can build DLLs from non-PIC.
-    ;;
-
-  osf3* | osf4*)
-    # All OSF/1 code is PIC.
-    wl='-Wl,'
-    link_static_flag='-non_shared'
-    ;;
-
-  sco3.2v5*)
-    pic_flag='-Kpic'
-    link_static_flag='-dn'
-    special_shlib_compile_flags='-belf'
-    ;;
-
-  solaris2*)
-    pic_flag='-KPIC'
-    link_static_flag='-Bstatic'
-    wl='-Wl,'
-    ;;
-
-  sunos4*)
-    pic_flag='-PIC'
-    link_static_flag='-Bstatic'
-    wl='-Qoption ld '
-    ;;
-
-  sysv4.2uw2*)
-    pic_flag='-KPIC'
-    link_static_flag='-Bstatic'
-    wl='-Wl,'
-    ;;
-
-  uts4*)
-    pic_flag='-pic'
-    link_static_flag='-Bstatic'
-    ;;
-
-  *)
-    can_build_shared=no
-    ;;
-  esac
-fi
-
-if test -n "$pic_flag"; then
-  echo "$ac_t$pic_flag" 1>&6
-
-  # Check to make sure the pic_flag actually works.
-  echo $ac_n "checking if $compiler PIC flag $pic_flag works... $ac_c" 1>&6
-  $rm conftest*
-  echo > conftest.c
-  save_CFLAGS="$CFLAGS"
-  CFLAGS="$CFLAGS $pic_flag -DPIC"
-  echo "$progname:547: checking if $compiler PIC flag $pic_flag works" >&5
-  if { (eval echo $progname:548: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then
-    # Append any warnings to the config.log.
-    cat conftest.err 1>&5
-
-    # On HP-UX, both CC and GCC only warn that PIC is supported... then they
-    # create non-PIC objects.  So, if there were any warnings, we assume that
-    # PIC is not supported.
-    if test -s conftest.err; then
-      echo "$ac_t"no 1>&6
-      can_build_shared=no
-      pic_flag=
-    else
-      echo "$ac_t"yes 1>&6
-      pic_flag=" $pic_flag"
-    fi
-  else
-    # Append any errors to the config.log.
-    cat conftest.err 1>&5
-    can_build_shared=no
-    pic_flag=
-    echo "$ac_t"no 1>&6
-  fi
-  CFLAGS="$save_CFLAGS"
-  $rm conftest*
-else
-  echo "$ac_t"none 1>&6
-fi
-
-# Check for any special shared library compilation flags.
-if test -n "$special_shlib_compile_flags"; then
-  echo "$progname: warning: \`$CC' requires \`$special_shlib_compile_flags' to build shared libraries" 1>&2
-  if echo "$old_CC $old_CFLAGS " | egrep -e "[ 	]$special_shlib_compile_flags[ 	]" >/dev/null; then :
-  else
-    echo "$progname: add \`$special_shlib_compile_flags' to the CC or CFLAGS env variable and reconfigure" 1>&2
-    can_build_shared=no
-  fi
-fi
-
-echo $ac_n "checking if $compiler static flag $link_static_flag works... $ac_c" 1>&6
-$rm conftest*
-echo 'main(){return(0);}' > conftest.c
-save_LDFLAGS="$LDFLAGS"
-LDFLAGS="$LDFLAGS $link_static_flag"
-echo "$progname:591: checking if $compiler static flag $link_static_flag works" >&5
-if { (eval echo $progname:592: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-  echo "$ac_t$link_static_flag" 1>&6
-else
-  echo "$ac_t"none 1>&6
-  link_static_flag=
-fi
-LDFLAGS="$save_LDFLAGS"
-$rm conftest*
-
-if test -z "$LN_S"; then
-  # Check to see if we can use ln -s, or we need hard links.
-  echo $ac_n "checking whether ln -s works... $ac_c" 1>&6
-  $rm conftestdata
-  if ln -s X conftestdata 2>/dev/null; then
-    $rm conftestdata
-    LN_S="ln -s"
-  else
-    LN_S=ln
-  fi
-  if test "$LN_S" = "ln -s"; then
-    echo "$ac_t"yes 1>&6
-  else
-    echo "$ac_t"no 1>&6
-  fi
-fi
-
-# Make sure LD is an absolute path.
-if test -z "$LD"; then
-  ac_prog=ld
-  if test "$with_gcc" = yes; then
-    # Check if gcc -print-prog-name=ld gives a path.
-    echo $ac_n "checking for ld used by GCC... $ac_c" 1>&6
-    echo "$progname:624: checking for ld used by GCC" >&5
-    ac_prog=`($CC -print-prog-name=ld) 2>&5`
-    case "$ac_prog" in
-    # Accept absolute paths.
-    /* | [A-Za-z]:\\*)
-      test -z "$LD" && LD="$ac_prog"
-      ;;
-    "")
-      # If it fails, then pretend we are not using GCC.
-      ac_prog=ld
-      ;;
-    *)
-      # If it is relative, then search for the first ld in PATH.
-      with_gnu_ld=unknown
-      ;;
-    esac
-  elif test "$with_gnu_ld" = yes; then
-    echo $ac_n "checking for GNU ld... $ac_c" 1>&6
-    echo "$progname:642: checking for GNU ld" >&5
-  else
-    echo $ac_n "checking for non-GNU ld""... $ac_c" 1>&6
-    echo "$progname:645: checking for non-GNU ld" >&5
-  fi
-
-  if test -z "$LD"; then
-    IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-    for ac_dir in $PATH; do
-      test -z "$ac_dir" && ac_dir=.
-      if test -f "$ac_dir/$ac_prog"; then
-	LD="$ac_dir/$ac_prog"
-	# Check to see if the program is GNU ld.  I'd rather use --version,
-	# but apparently some GNU ld's only accept -v.
-	# Break only if it was the GNU/non-GNU ld that we prefer.
-	if "$LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
-	  test "$with_gnu_ld" != no && break
-	else
-	  test "$with_gnu_ld" != yes && break
-	fi
-      fi
-    done
-    IFS="$ac_save_ifs"
-  fi
-
-  if test -n "$LD"; then
-    echo "$ac_t$LD" 1>&6
-  else
-    echo "$ac_t"no 1>&6
-  fi
-
-  if test -z "$LD"; then
-    echo "$progname: error: no acceptable ld found in \$PATH" 1>&2
-    exit 1
-  fi
-fi
-
-# Check to see if it really is or is not GNU ld.
-echo $ac_n "checking if the linker ($LD) is GNU ld... $ac_c" 1>&6
-# I'd rather use --version here, but apparently some GNU ld's only accept -v.
-if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
-  with_gnu_ld=yes
-else
-  with_gnu_ld=no
-fi
-echo "$ac_t$with_gnu_ld" 1>&6
-
-# See if the linker supports building shared libraries.
-echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
-
-allow_undefined_flag=
-no_undefined_flag=
-archive_cmds=
-old_archive_from_new_cmds=
-export_dynamic_flag_spec=
-hardcode_libdir_flag_spec=
-hardcode_libdir_separator=
-hardcode_direct=no
-hardcode_minus_L=no
-hardcode_shlibpath_var=unsupported
-runpath_var=
-
-case "$host_os" in
-amigaos* | sunos4*)
-  # On these operating systems, we should treat GNU ld like the system ld.
-  gnu_ld_acts_native=yes
-  ;;
-*)
-  gnu_ld_acts_native=no
-  ;;
-esac
-
-ld_shlibs=yes
-if test "$with_gnu_ld" = yes && test "$gnu_ld_acts_native" != yes; then
-
-  # See if GNU ld supports shared libraries.
-  if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
-    archive_cmds='$CC -shared ${wl}-soname $wl$soname -o $lib$libobjs'
-    runpath_var=LD_RUN_PATH
-    ld_shlibs=yes
-  else
-    ld_shlibs=no
-  fi
-
-  if test "$ld_shlibs" = yes; then
-    hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
-    export_dynamic_flag_spec='${wl}--export-dynamic'
-  fi
-else
-  # PORTME fill in a description of your system's linker (not GNU ld)
-  case "$host_os" in
-  aix3*)
-    allow_undefined_flag=unsupported
-    archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$LD -o $objdir/$soname$libobjs -bE:$lib.exp -T512 -H512 -bM:SRE;$AR cru $lib $objdir/$soname'
-    # Note: this linker hardcodes the directories in LIBPATH if there
-    # are no directories specified by -L.
-    hardcode_minus_L=yes
-    if test "$with_gcc" = yes && test -z "$link_static_flag"; then
-      # Neither direct hardcoding nor static linking is supported with a
-      # broken collect2.
-      hardcode_direct=unsupported
-    fi
-    ;;
-
-  aix4*)
-    allow_undefined_flag=unsupported
-    archive_cmds='$NM$libobjs | $global_symbol_pipe | sed '\''s/.* //'\'' > $lib.exp;$CC -o $objdir/$soname$libobjs ${wl}-bE:$lib.exp ${wl}-bM:SRE ${wl}-bnoentry;$AR cru $lib $objdir/$soname'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    ;;
-
-  amigaos*)
-    archive_cmds='$rm $objdir/a2ixlibrary.data;$echo "#define NAME $libname" > $objdir/a2ixlibrary.data;$echo "#define LIBRARY_ID 1" >> $objdir/a2ixlibrary.data;$echo "#define VERSION $major" >> $objdir/a2ixlibrary.data;$echo "#define REVISION $revision" >> $objdir/a2ixlibrary.data;$AR cru $lib$libobjs;$RANLIB $lib;(cd $objdir && a2ixlibrary -32)'
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_minus_L=yes
-    ;;
-
-  # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
-  # support.  Future versions do this automatically, but an explicit c++rt0.o
-  # does not break anything, and helps significantly (at the cost of a little
-  # extra space).
-  freebsd2.2*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs /usr/lib/c++rt0.o'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  # Unfortunately, older versions of FreeBSD 2 do not have this feature.
-  freebsd2*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  # FreeBSD 3, at last, uses gcc -shared to do shared libraries.
-  freebsd3*)
-    archive_cmds='$CC -shared -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  hpux9*)
-    archive_cmds='$rm $objdir/$soname;$LD -b +s +b $install_libdir -o $objdir/$soname$libobjs;mv $objdir/$soname $lib'
-    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    export_dynamic_flag_spec='${wl}-E'
-    ;;
-
-  hpux10*)
-    archive_cmds='$LD -b +h $soname +s +b $install_libdir -o $lib$libobjs'
-    hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    export_dynamic_flag_spec='${wl}-E'
-    ;;
-
-  irix5* | irix6*)
-    archive_cmds='$LD -shared -o $lib -soname $soname -set_version $verstring$libobjs'
-    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
-    ;;
-
-  netbsd*)
-    # Tested with NetBSD 1.2 ld
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  openbsd*)
-    archive_cmds='$LD -Bshareable -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_direct=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  os2*)
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_minus_L=yes
-    allow_undefined_flag=unsupported
-    archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $objdir/$libname.def;$echo "DESCRIPTION \"$libname\"" >> $objdir/$libname.def;$echo DATA >> $objdir/$libname.def;$echo " SINGLE NONSHARED" >> $objdir/$libname.def;$echo EXPORTS >> $objdir/$libname.def;emxexp$libobjs >> $objdir/$libname.def;$CC -Zdll -Zcrtdll -o $lib$libobjs $objdir/$libname.def'
-    old_archive_from_new_cmds='emximp -o $objdir/$libname.a $objdir/$libname.def'
-    ;;
-
-  osf3* | osf4*)
-    allow_undefined_flag=' -expect_unresolved \*'
-    archive_cmds='$LD -shared${allow_undefined_flag} -o $lib -soname $soname -set_version $verstring$libobjs$deplibs'
-    hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
-    hardcode_libdir_separator=:
-    ;;
-
-  sco3.2v5*)
-    archive_cmds='$LD -G -o $lib$libobjs'
-    hardcode_direct=yes
-    ;;
-
-  solaris2*)
-    no_undefined_flag=' -z text'
-    archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-R$libdir'
-    hardcode_shlibpath_var=no
-
-    # Solaris 2 before 2.5 hardcodes -L paths.
-    case "$host_os" in
-    solaris2.[0-4]*)
-      hardcode_minus_L=yes
-      ;;
-    esac
-    ;;
-
-  sunos4*)
-    if test "$with_gcc" = yes; then
-      archive_cmds='$CC -shared -o $lib$libobjs'
-    else
-      archive_cmds='$LD -assert pure-text -Bstatic -o $lib$libobjs'
-    fi
-
-    if test "$with_gnu_ld" = yes; then
-      export_dynamic_flag_spec='${wl}-export-dynamic'
-    fi
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_direct=yes
-    hardcode_minus_L=yes
-    hardcode_shlibpath_var=no
-    ;;
-
-  uts4*)
-    archive_cmds='$LD -G -h $soname -o $lib$libobjs'
-    hardcode_libdir_flag_spec='-L$libdir'
-    hardcode_direct=no
-    hardcode_minus_L=no
-    hardcode_shlibpath_var=no
-    ;;
-
-  *)
-    ld_shlibs=no
-    can_build_shared=no
-    ;;
-  esac
-fi
-echo "$ac_t$ld_shlibs" 1>&6
-
-if test -z "$NM"; then
-  echo $ac_n "checking for BSD-compatible nm... $ac_c" 1>&6
-  case "$NM" in
-  /* | [A-Za-z]:\\*) ;; # Let the user override the test with a path.
-  *)
-    IFS="${IFS= 	}"; ac_save_ifs="$IFS"; IFS="${IFS}:"
-    for ac_dir in /usr/ucb /usr/ccs/bin $PATH /bin; do
-      test -z "$ac_dir" && ac_dir=.
-      if test -f $ac_dir/nm; then
-        # Check to see if the nm accepts a BSD-compat flag.
-        # Adding the `sed 1q' prevents false positives on HP-UX, which says:
-        #   nm: unknown option "B" ignored
-        if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
-          NM="$ac_dir/nm -B"
-        elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
-          NM="$ac_dir/nm -p"
-	else
-          NM="$ac_dir/nm"
-	fi
-        break
-      fi
-    done
-    IFS="$ac_save_ifs"
-    test -z "$NM" && NM=nm
-    ;;
-  esac
-  echo "$ac_t$NM" 1>&6
-fi
-
-# Check for command to grab the raw symbol name followed by C symbol from nm.
-echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
-
-# These are sane defaults that work on at least a few old systems.
-# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
-
-# Character class describing NM global symbol codes.
-symcode='[BCDEGRSTU]'
-
-# Regexp to match symbols that can be accessed directly from C.
-sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
-
-# Transform the above into a raw symbol and a C symbol.
-symxfrm='\1 \1'
-
-# Define system-specific variables.
-case "$host_os" in
-aix*)
-  symcode='[BCDTU]'
-  ;;
-irix*)
-  # Cannot use undefined symbols on IRIX because inlined functions mess us up.
-  symcode='[BCDEGRST]'
-  ;;
-solaris2*)
-  symcode='[BDTU]'
-  ;;
-esac
-
-# If we're using GNU nm, then use its standard symbol codes.
-if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
-  symcode='[ABCDGISTUW]'
-fi
-
-# Write the raw and C identifiers.
-global_symbol_pipe="sed -n -e 's/^.* $symcode $sympat$/$symxfrm/p'"
-
-# Check to see that the pipe works correctly.
-pipe_works=no
-$rm conftest*
-cat > conftest.c <<EOF
-#ifdef __cplusplus
-extern "C" {
-#endif
-char nm_test_var;
-void nm_test_func(){}
-#ifdef __cplusplus
-}
-#endif
-main(){nm_test_var='a';nm_test_func();return(0);}
-EOF
-
-echo "$progname:971: checking if global_symbol_pipe works" >&5
-if { (eval echo $progname:972: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.o; then
-  # Now try to grab the symbols.
-  nlist=conftest.nm
-  if { echo "$progname:975: eval \"$NM conftest.o | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.o | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
-
-    # Try sorting and uniquifying the output.
-    if sort "$nlist" | uniq > "$nlist"T; then
-      mv -f "$nlist"T "$nlist"
-      wcout=`wc "$nlist" 2>/dev/null`
-      count=`$echo "X$wcout" | $Xsed -e 's/^[ 	]*\([0-9][0-9]*\).*$/\1/'`
-      (test "$count" -ge 0) 2>/dev/null || count=-1
-    else
-      rm -f "$nlist"T
-      count=-1
-    fi
-
-    # Make sure that we snagged all the symbols we need.
-    if egrep ' nm_test_var$' "$nlist" >/dev/null; then
-      if egrep ' nm_test_func$' "$nlist" >/dev/null; then
-	cat <<EOF > conftest.c
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-EOF
-        # Now generate the symbol file.
-        sed 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> conftest.c
-
-	cat <<EOF >> conftest.c
-#if defined (__STDC__) && __STDC__
-# define __ptr_t void *
-#else
-# define __ptr_t char *
-#endif
-
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
-/* The mapping between symbol names and symbols. */
-struct {
-  char *name;
-  __ptr_t address;
-}
-dld_preloaded_symbols[] =
-{
-EOF
-        sed 's/^\(.*\) \(.*\)$/  {"\1", (__ptr_t) \&\2},/' < "$nlist" >> conftest.c
-        cat <<\EOF >> conftest.c
-  {0, (__ptr_t) 0}
-};
-
-#ifdef __cplusplus
-}
-#endif
-EOF
-        # Now try linking the two files.
-        mv conftest.o conftestm.o
-	save_LIBS="$LIBS"
-	save_CFLAGS="$CFLAGS"
-        LIBS='conftestm.o'
-	CFLAGS="$CFLAGS$no_builtin_flag"
-        if { (eval echo $progname:1033: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
-          pipe_works=yes
-        else
-          echo "$progname: failed program was:" >&5
-          cat conftest.c >&5
-        fi
-        LIBS="$save_LIBS"
-      else
-        echo "cannot find nm_test_func in $nlist" >&5
-      fi
-    else
-      echo "cannot find nm_test_var in $nlist" >&5
-    fi
-  else
-    echo "cannot run $global_symbol_pipe" >&5
-  fi
-else
-  echo "$progname: failed program was:" >&5
-  cat conftest.c >&5
-fi
-$rm conftest*
-
-# Do not use the global_symbol_pipe unless it works.
-echo "$ac_t$pipe_works" 1>&6
-test "$pipe_works" = yes || global_symbol_pipe=
-
-# Check hardcoding attributes.
-echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
-hardcode_action=
-if test -n "$hardcode_libdir_flag_spec" || \
-   test -n "$runpath_var"; then
-
-  # We can hardcode non-existant directories.
-  if test "$hardcode_direct" != no && \
-     test "$hardcode_minus_L" != no && \
-     test "$hardcode_shlibpath_var" != no; then
-
-    # Linking always hardcodes the temporary library directory.
-    hardcode_action=relink
-  else
-    # We can link without hardcoding, and we can hardcode nonexisting dirs.
-    hardcode_action=immediate
-  fi
-elif test "$hardcode_direct" != yes && \
-     test "$hardcode_minus_L" != yes && \
-     test "$hardcode_shlibpath_var" != yes; then
-  # We cannot hardcode anything.
-  hardcode_action=unsupported
-else
-  # We can only hardcode existing directories.
-  hardcode_action=relink
-fi
-echo "$ac_t$hardcode_action" 1>&6
-test "$hardcode_action" = unsupported && can_build_shared=no
-
-
-reload_flag=
-reload_cmds='$LD$reload_flag -o $output$reload_objs'
-echo $ac_n "checking for $LD option to reload object files... $ac_c" 1>&6
-# PORTME Some linker may need a different reload flag.
-reload_flag='-r'
-echo "$ac_t$reload_flag"
-test -n "$reload_flag" && reload_flag=" $reload_flag"
-
-# PORTME Fill in your ld.so characteristics
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-version_type=none
-dynamic_linker="$host_os ld.so"
-
-echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
-case "$host_os" in
-aix3* | aix4*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix $libname.a'
-  shlibpath_var=LIBPATH
-
-  # AIX has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}.so.$major'
-  ;;
-
-amigaos*)
-  library_names_spec='$libname.ixlibrary $libname.a'
-  # Create ${libname}_ixlibrary.a entries in /sys/libs.
-  finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
-  ;;
-
-freebsd2* | freebsd3*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-gnu*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-hpux9* | hpux10*)
-  # Give a soname corresponding to the major version so that dld.sl refuses to
-  # link against other versions.
-  dynamic_linker="$host_os dld.sl"
-  version_type=sunos
-  shlibpath_var=SHLIB_PATH
-  library_names_spec='${libname}${release}.sl.$versuffix ${libname}${release}.sl.$major $libname.sl'
-  soname_spec='${libname}${release}.sl.$major'
-  # HP-UX runs *really* slowly unless shared libraries are mode 555.
-  postinstall_cmds='chmod 555 $lib'
-  ;;
-
-irix5* | irix6*)
-  version_type=osf
-  soname_spec='${libname}${release}.so'
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-# No shared lib support for Linux oldld, aout, or coff.
-linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
-  dynamic_linker=no
-  ;;
-
-# This must be Linux ELF.
-linux-gnu*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -n $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-
-  if test -f /lib/ld.so.1; then
-    dynamic_linker='GNU ld.so'
-  else
-    # Only the GNU ld.so supports shared libraries on MkLinux.
-    case "$host_cpu" in
-    powerpc*) dynamic_linker=no ;;
-    *) dynamic_linker='Linux ld.so' ;;
-    esac
-  fi
-  ;;
-
-netbsd* | openbsd*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  finish_cmds='PATH="$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-os2*)
-  libname_spec='$name'
-  library_names_spec='$libname.dll $libname.a'
-  dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
-  ;;
-
-osf3* | osf4*)
-  version_type=osf
-  soname_spec='${libname}${release}.so'
-  library_names_spec='${libname}${release}.so.$versuffix $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sco3.2v5*)
-  version_type=osf
-  soname_spec='${libname}${release}.so.$major'
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-solaris2*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sunos4*)
-  version_type=sunos
-  library_names_spec='${libname}${release}.so.$versuffix'
-  finish_cmds='PATH="$PATH:/usr/etc" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-sysv4.2uw2*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-uts4*)
-  version_type=linux
-  library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major $libname.so'
-  soname_spec='${libname}${release}.so.$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-*)
-  dynamic_linker=no
-  ;;
-esac
-echo "$ac_t$dynamic_linker"
-test "$dynamic_linker" = no && can_build_shared=no
-
-# Report the final consequences.
-echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
-
-echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
-test "$can_build_shared" = "no" && enable_shared=no
-
-# On AIX, shared libraries and static libraries use the same namespace, and
-# are all built from PIC.
-case "$host_os" in
-aix*)
-  test "$enable_shared" = yes && enable_static=no
-  if test -n "$RANLIB"; then
-    archive_cmds="$archive_cmds;\$RANLIB \$lib"
-    postinstall_cmds='$RANLIB $lib'
-  fi
-  ;;
-esac
-
-echo "$ac_t$enable_shared" 1>&6
-
-# Make sure either enable_shared or enable_static is yes.
-test "$enable_shared" = yes || enable_static=yes
-
-echo "checking whether to build static libraries... $enable_static" 1>&6
-
-echo $ac_n "checking for objdir... $ac_c" 1>&6
-rm -f .libs 2>/dev/null
-mkdir .libs 2>/dev/null
-if test -d .libs; then
-  objdir=.libs
-else
-  # MS-DOS does not allow filenames that begin with a dot.
-  objdir=_libs
-fi
-rmdir .libs 2>/dev/null
-echo "$ac_t$objdir" 1>&6
-
-# Copy echo and quote the copy, instead of the original, because it is
-# used later.
-ltecho="$echo"
-
-# Now quote all the things that may contain metacharacters.
-for var in ltecho old_CC old_CFLAGS old_CPPFLAGS old_LD old_NM old_RANLIB \
-  old_LN_S AR CC LD LN_S NM reload_flag reload_cmds wl pic_flag \
-  link_static_flag no_builtin_flag export_dynamic_flag_spec \
-  libname_spec library_names_spec soname_spec RANLIB \
-  old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
-  old_postuninstall_cmds archive_cmds postinstall_cmds postuninstall_cmds \
-  allow_undefined_flag no_undefined_flag \
-  finish_cmds finish_eval global_symbol_pipe \
-  hardcode_libdir_flag_spec hardcode_libdir_separator; do
-
-  case "$var" in
-  reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
-  old_postinstall_cmds | old_postuninstall_cmds | archive_cmds | \
-  postinstall_cmds | postuninstall_cmds | finish_cmds)
-    # Double-quote double-evaled strings.
-    eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\"\`"
-    ;;
-  *)
-    eval "$var=\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`"
-    ;;
-  esac
-done
-
-ofile=libtool
-trap "$rm $ofile; exit 1" 1 2 15
-echo creating $ofile
-$rm $ofile
-cat <<EOF > $ofile
-#! /bin/sh
-
-# libtool - Provide generalized library-building support services.
-# Generated automatically by $PROGRAM - GNU $PACKAGE $VERSION
-# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
-#
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This program was configured as follows,
-# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-#
-# CC="$old_CC" CFLAGS="$old_CFLAGS" CPPFLAGS="$old_CPPFLAGS" \\
-# LD="$old_LD" NM="$old_NM" RANLIB="$old_RANLIB" LN_S="$old_LN_S" \\
-#   $0$ltconfig_args
-#
-# Compiler and other test output produced by $progname, useful for
-# debugging $progname, is in ./config.log if it exists.
-
-# Sed that helps us avoid accidentally triggering echo(1) options like -n.
-Xsed="sed -e s/^X//"
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-if test "\${CDPATH+set}" = set; then CDPATH=; export CDPATH; fi
-
-# An echo program that does not interpret backslashes.
-echo="$ltecho"
-
-# The version of $progname that generated this script.
-LTCONFIG_VERSION="$VERSION"
-
-# Shell to use when invoking shell scripts.
-SHELL=${CONFIG_SHELL-/bin/sh}
-
-# Whether or not to build libtool libraries.
-build_libtool_libs=$enable_shared
-
-# Whether or not to build old-style libraries.
-build_old_libs=$enable_static
-
-# The host system.
-host_alias="$host_alias"
-host="$host"
-
-# The archiver.
-AR="$AR"
-
-# The default C compiler.
-CC="$CC"
-
-# The linker used to build libraries.
-LD="$LD"
-
-# Whether we need hard or soft links.
-LN_S="$LN_S"
-
-# A BSD-compatible nm program.
-NM="$NM"
-
-# The name of the directory that contains temporary libtool files.
-objdir="$objdir"
-
-# How to create reloadable object files.
-reload_flag="$reload_flag"
-reload_cmds="$reload_cmds"
-
-# How to pass a linker flag through the compiler.
-wl="$wl"
-
-# Additional compiler flags for building library objects.
-pic_flag="$pic_flag"
-
-# Compiler flag to prevent dynamic linking.
-link_static_flag="$link_static_flag"
-
-# Compiler flag to turn off builtin functions.
-no_builtin_flag="$no_builtin_flag"
-
-# Compiler flag to allow reflexive dlopens.
-export_dynamic_flag_spec="$export_dynamic_flag_spec"
-
-# Library versioning type.
-version_type=$version_type
-
-# Format of library name prefix.
-libname_spec="$libname_spec"
-
-# List of archive names.  First name is the real one, the rest are links.
-# The last name is the one that the linker finds with -lNAME.
-library_names_spec="$library_names_spec"
-
-# The coded name of the library, if different from the real name.
-soname_spec="$soname_spec"
-
-# Commands used to build and install an old-style archive.
-RANLIB="$RANLIB"
-old_archive_cmds="$old_archive_cmds"
-old_postinstall_cmds="$old_postinstall_cmds"
-old_postuninstall_cmds="$old_postuninstall_cmds"
-
-# Create an old-style archive from a shared archive.
-old_archive_from_new_cmds="$old_archive_from_new_cmds"
-
-# Commands used to build and install a shared archive.
-archive_cmds="$archive_cmds"
-postinstall_cmds="$postinstall_cmds"
-postuninstall_cmds="$postuninstall_cmds"
-
-# Flag that allows shared libraries with undefined symbols to be built.
-allow_undefined_flag="$allow_undefined_flag"
-
-# Flag that forces no undefined symbols.
-no_undefined_flag="$no_undefined_flag"
-
-# Commands used to finish a libtool library installation in a directory.
-finish_cmds="$finish_cmds"
-
-# Same as above, but a single script fragment to be evaled but not shown.
-finish_eval="$finish_eval"
-
-# Take the output of nm and produce a listing of raw symbols and C names.
-global_symbol_pipe="$global_symbol_pipe"
-
-# This is the shared library runtime path variable.
-runpath_var=$runpath_var
-
-# This is the shared library path variable.
-shlibpath_var=$shlibpath_var
-
-# How to hardcode a shared library path into an executable.
-hardcode_action=$hardcode_action
-
-# Flag to hardcode \$libdir into a binary during linking.
-# This must work even if \$libdir does not exist.
-hardcode_libdir_flag_spec="$hardcode_libdir_flag_spec"
-
-# Whether we need a single -rpath flag with a separated argument.
-hardcode_libdir_separator="$hardcode_libdir_separator"
-
-# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
-# resulting binary.
-hardcode_direct=$hardcode_direct
-
-# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
-# resulting binary.
-hardcode_minus_L=$hardcode_minus_L
-
-# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
-# the resulting binary.
-hardcode_shlibpath_var=$hardcode_shlibpath_var
-
-EOF
-
-case "$host_os" in
-aix3*)
-  cat <<\EOF >> $ofile
-# AIX sometimes has problems with the GCC collect2 program.  For some
-# reason, if we set the COLLECT_NAMES environment variable, the problems
-# vanish in a puff of smoke.
-if test "${COLLECT_NAMES+set}" != set; then
-  COLLECT_NAMES=
-  export COLLECT_NAMES
-fi
-
-EOF
-  ;;
-esac
-
-# Append the ltmain.sh script.
-cat "$ltmain" >> $ofile || (rm -f $ofile; exit 1)
-
-chmod +x $ofile
-exit 0
-
-# Local Variables:
-# mode:shell-script
-# sh-indentation:2
-# End:
diff --git a/ltmain.sh b/ltmain.sh
index e9350b3..0dbca1e 100644
--- a/ltmain.sh
+++ b/ltmain.sh
@@ -1,8 +1,9 @@
 # ltmain.sh - Provide generalized library-building support services.
-# NOTE: Changing this file will not affect anything until you rerun ltconfig.
+# NOTE: Changing this file will not affect anything until you rerun configure.
 #
-# Copyright (C) 1996-1998 Free Software Foundation, Inc.
-# Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001
+# Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -23,14 +24,42 @@
 # configuration script generated by Autoconf, you may include it under
 # the same distribution terms that you use for the rest of that program.
 
+# Check that we have a working $echo.
+if test "X$1" = X--no-reexec; then
+  # Discard the --no-reexec flag, and continue.
+  shift
+elif test "X$1" = X--fallback-echo; then
+  # Avoid a here-document at this point; it may be left over.
+  :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+  # Yippee, $echo works!
+  :
+else
+  # Restart under the correct shell, and then maybe $echo will work.
+  exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+  # Used as the fallback echo: print the remaining arguments and exit.
+  shift
+  cat <<EOF
+$*
+EOF
+  exit 0
+fi
+
+# Define SED for historic ltconfig scripts generated by Libtool 1.3.
+test -z "$SED" && SED=sed
+
 # The name of this program.
-progname=`$echo "$0" | sed 's%^.*/%%'`
+progname=`$echo "$0" | ${SED} 's%^.*/%%'`
 modename="$progname"
 
 # Constants.
 PROGRAM=ltmain.sh
 PACKAGE=libtool
-VERSION=1.2
+VERSION=1.4.3
+TIMESTAMP=" (1.922.2.110 2002/10/23 01:39:54)"
 
 default_mode=
 help="Try \`$progname --help' for more information."
@@ -41,21 +70,34 @@
 
 # Sed substitution that helps us do robust quoting.  It backslashifies
 # metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
+Xsed="${SED}"' -e 1s/^X//'
 sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
+# Detect whether this is an EBCDIC- or ASCII-based system.
+case `echo A|od -x` in                                                         
+ *[Cc]1*) # EBCDIC based system                                                
+  SP2NL="tr '\100' '\n'"                                                       
+  NL2SP="tr '\r\n' '\100\100'"                                                 
+  ;;                                                                           
+ *) # Assume ASCII based system                                                
+  SP2NL="tr '\040' '\012'"                                                     
+  NL2SP="tr '\015\012' '\040\040'"                                             
+  ;;                                                                           
+esac                                                                           
 
 # NLS nuisances.
 # Only set LANG and LC_ALL to C if already set.
 # These must not be set unconditionally because not all systems understand
 # e.g. LANG=C (notably SCO).
-if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
-if test "${LANG+set}"   = set; then LANG=C;   export LANG;   fi
-
-if test "$LTCONFIG_VERSION" != "$VERSION"; then
-  echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
-  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
-  exit 1
+# We save the old values to restore during execute mode.
+if test "${LC_ALL+set}" = set; then
+  save_LC_ALL="$LC_ALL"; LC_ALL=C; export LC_ALL
 fi
+if test "${LANG+set}" = set; then
+  save_LANG="$LANG"; LANG=C; export LANG
+fi
+
+# Make sure IFS has a sensible default
+: ${IFS=" 	"}
 
 if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
   echo "$modename: not configured to build any kind of library" 1>&2
@@ -72,6 +114,8 @@
 show="$echo"
 show_help=
 execute_dlfiles=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
 
 # Parse our command line options once, thoroughly.
 while test $# -gt 0
@@ -79,16 +123,16 @@
   arg="$1"
   shift
 
-  case "$arg" in
+  case $arg in
   -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
   *) optarg= ;;
   esac
 
   # If the previous option needs an argument, assign it.
   if test -n "$prev"; then
-    case "$prev" in
+    case $prev in
     execute_dlfiles)
-      eval "$prev=\"\$$prev \$arg\""
+      execute_dlfiles="$execute_dlfiles $arg"
       ;;
     *)
       eval "$prev=\$arg"
@@ -101,16 +145,26 @@
   fi
 
   # Have we seen a non-optional argument yet?
-  case "$arg" in
+  case $arg in
   --help)
     show_help=yes
     ;;
 
   --version)
-    echo "$PROGRAM (GNU $PACKAGE) $VERSION"
+    echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"
     exit 0
     ;;
 
+  --config)
+    ${SED} -e '1,/^# ### BEGIN LIBTOOL CONFIG/d' -e '/^# ### END LIBTOOL CONFIG/,$d' $0
+    exit 0
+    ;;
+
+  --debug)
+    echo "$progname: enabling shell trace mode"
+    set -x
+    ;;
+
   --dry-run | -n)
     run=:
     ;;
@@ -135,6 +189,8 @@
   --mode) prevopt="--mode" prev=mode ;;
   --mode=*) mode="$optarg" ;;
 
+  --preserve-dup-deps) duplicate_deps="yes" ;;
+
   --quiet | --silent)
     show=:
     ;;
@@ -163,24 +219,29 @@
   exit 1
 fi
 
+# If this variable is set in any of the actions, the command in it
+# will be execed at the end.  This prevents here-documents from being
+# left over by shells.
+exec_cmd=
+
 if test -z "$show_help"; then
 
   # Infer the operation mode.
   if test -z "$mode"; then
-    case "$nonopt" in
-    *cc | *++ | gcc* | *-gcc*)
+    case $nonopt in
+    *cc | *++ | gcc* | *-gcc* | xlc*)
       mode=link
       for arg
       do
-        case "$arg" in
-        -c)
-           mode=compile
-           break
-           ;;
-        esac
+	case $arg in
+	-c)
+	   mode=compile
+	   break
+	   ;;
+	esac
       done
       ;;
-    *db | *dbx)
+    *db | *dbx | *strace | *truss)
       mode=execute
       ;;
     *install*|cp|mv)
@@ -195,11 +256,11 @@
 
       # Just use the default operation mode.
       if test -z "$mode"; then
-        if test -n "$nonopt"; then
-          $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
-        else
-          $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
-        fi
+	if test -n "$nonopt"; then
+	  $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
+	else
+	  $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
+	fi
       fi
       ;;
     esac
@@ -217,31 +278,118 @@
   help="Try \`$modename --help --mode=$mode' for more information."
 
   # These modes are in order of execution frequency so that they run quickly.
-  case "$mode" in
+  case $mode in
   # libtool compile mode
   compile)
     modename="$modename: compile"
     # Get the compilation command and the source file.
     base_compile=
+    prev=
     lastarg=
     srcfile="$nonopt"
     suppress_output=
 
+    user_target=no
     for arg
     do
+      case $prev in
+      "") ;;
+      xcompiler)
+	# Aesthetically quote the argument that followed -Xcompiler.
+	prev=
+	lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+
+	case $arg in
+	# Double-quote args containing other shell metacharacters.
+	# Many Bourne shells cannot handle close brackets correctly
+	# in scan sets, so we specify it separately.
+	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+	  lastarg="\"$lastarg\""
+	  ;;
+	esac
+
+	# Add the previous argument to base_compile.
+	if test -z "$base_compile"; then
+	  base_compile="$lastarg"
+	else
+	  base_compile="$base_compile $lastarg"
+	fi
+	continue
+	;;
+      esac
+
       # Accept any command-line options.
-      case "$arg" in
+      case $arg in
       -o)
-	$echo "$modename: you cannot specify the output filename with \`-o'" 1>&2
-	$echo "$help" 1>&2
-	exit 1
+	if test "$user_target" != "no"; then
+	  $echo "$modename: you cannot specify \`-o' more than once" 1>&2
+	  exit 1
+	fi
+	user_target=next
 	;;
 
       -static)
-	build_libtool_libs=no
 	build_old_libs=yes
 	continue
 	;;
+
+      -prefer-pic)
+	pic_mode=yes
+	continue
+	;;
+
+      -prefer-non-pic)
+	pic_mode=no
+	continue
+	;;
+
+      -Xcompiler)
+	prev=xcompiler
+	continue
+	;;
+
+      -Wc,*)
+	args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"`
+	lastarg=
+	save_ifs="$IFS"; IFS=','
+	for arg in $args; do
+	  IFS="$save_ifs"
+
+	  # Double-quote args containing other shell metacharacters.
+	  # Many Bourne shells cannot handle close brackets correctly
+	  # in scan sets, so we specify it separately.
+	  case $arg in
+	    *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+	    arg="\"$arg\""
+	    ;;
+	  esac
+	  lastarg="$lastarg $arg"
+	done
+	IFS="$save_ifs"
+	lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"`
+
+	# Add the arguments to base_compile.
+	if test -z "$base_compile"; then
+	  base_compile="$lastarg"
+	else
+	  base_compile="$base_compile $lastarg"
+	fi
+	continue
+	;;
+      esac
+
+      case $user_target in
+      next)
+	# The next one is the -o target name
+	user_target=yes
+	continue
+	;;
+      yes)
+	# We got the output file
+	user_target=set
+	libobj="$arg"
+	continue
+	;;
       esac
 
       # Accept the current argument as the source file.
@@ -256,10 +404,10 @@
       lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
 
       # Double-quote args containing other shell metacharacters.
-      # Many Bourne shells cannot handle close brackets correctly in scan
-      # sets, so we specify it separately.
-      case "$lastarg" in
-      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
+      # Many Bourne shells cannot handle close brackets correctly
+      # in scan sets, so we specify it separately.
+      case $lastarg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
 	lastarg="\"$lastarg\""
 	;;
       esac
@@ -272,12 +420,23 @@
       fi
     done
 
-    # Get the name of the library object.
-    libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+    case $user_target in
+    set)
+      ;;
+    no)
+      # Get the name of the library object.
+      libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+      ;;
+    *)
+      $echo "$modename: you must specify a target with \`-o'" 1>&2
+      exit 1
+      ;;
+    esac
 
     # Recognize several different file suffixes.
-    xform='[cCFSfms]'
-    case "$libobj" in
+    # If the user specifies -o file.o, it is replaced with file.lo
+    xform='[cCFSfmso]'
+    case $libobj in
     *.ada) xform=ada ;;
     *.adb) xform=adb ;;
     *.ads) xform=ads ;;
@@ -292,10 +451,10 @@
 
     libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
 
-    case "$libobj" in
-    *.lo) obj=`$echo "X$libobj" | $Xsed -e 's/\.lo$/.o/'` ;;
+    case $libobj in
+    *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;;
     *)
-      $echo "$modename: cannot determine name of library object from \`$srcfile'" 1>&2
+      $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2
       exit 1
       ;;
     esac
@@ -308,11 +467,65 @@
 
     # Delete any leftover library objects.
     if test "$build_old_libs" = yes; then
-      $run $rm $obj $libobj
-      trap "$run $rm $obj $libobj; exit 1" 1 2 15
+      removelist="$obj $libobj"
     else
-      $run $rm $libobj
-      trap "$run $rm $libobj; exit 1" 1 2 15
+      removelist="$libobj"
+    fi
+
+    $run $rm $removelist
+    trap "$run $rm $removelist; exit 1" 1 2 15
+
+    # On Cygwin there's no "real" PIC flag so we must build both object types
+    case $host_os in
+    cygwin* | mingw* | pw32* | os2*)
+      pic_mode=default
+      ;;
+    esac
+    if test "$pic_mode" = no && test "$deplibs_check_method" != pass_all; then
+      # non-PIC code in shared libraries is not supported
+      pic_mode=default
+    fi
+
+    # Calculate the filename of the output object if compiler does
+    # not support -o with -c
+    if test "$compiler_c_o" = no; then
+      output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.${objext}
+      lockfile="$output_obj.lock"
+      removelist="$removelist $output_obj $lockfile"
+      trap "$run $rm $removelist; exit 1" 1 2 15
+    else
+      need_locks=no
+      lockfile=
+    fi
+
+    # Lock this critical section if it is needed
+    # We hard-link this script file as the lock; that avoids creating a new file
+    if test "$need_locks" = yes; then
+      until $run ln "$0" "$lockfile" 2>/dev/null; do
+	$show "Waiting for $lockfile to be removed"
+	sleep 2
+      done
+    elif test "$need_locks" = warn; then
+      if test -f "$lockfile"; then
+	echo "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$run $rm $removelist
+	exit 1
+      fi
+      echo $srcfile > "$lockfile"
+    fi
+
+    if test -n "$fix_srcfile_path"; then
+      eval srcfile=\"$fix_srcfile_path\"
     fi
 
     # Only build a PIC object if we are building libtool libraries.
@@ -320,24 +533,121 @@
       # Without this assignment, base_compile gets emptied.
       fbsd_hideous_sh_bug=$base_compile
 
-      # All platforms use -DPIC, to notify preprocessed assembler code.
-      $show "$base_compile$pic_flag -DPIC $srcfile"
-      if $run eval "$base_compile\$pic_flag -DPIC \$srcfile"; then :
+      if test "$pic_mode" != no; then
+	# All platforms use -DPIC, to notify preprocessed assembler code.
+	command="$base_compile $srcfile $pic_flag -DPIC"
       else
-        test -n "$obj" && $run $rm $obj
-        exit 1
+	# Don't build PIC code
+	command="$base_compile $srcfile"
+      fi
+      if test "$build_old_libs" = yes; then
+	lo_libobj="$libobj"
+	dir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+	if test "X$dir" = "X$libobj"; then
+	  dir="$objdir"
+	else
+	  dir="$dir/$objdir"
+	fi
+	libobj="$dir/"`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+
+	if test -d "$dir"; then
+	  $show "$rm $libobj"
+	  $run $rm $libobj
+	else
+	  $show "$mkdir $dir"
+	  $run $mkdir $dir
+	  status=$?
+	  if test $status -ne 0 && test ! -d $dir; then
+	    exit $status
+	  fi
+	fi
+      fi
+      if test "$compiler_o_lo" = yes; then
+	output_obj="$libobj"
+	command="$command -o $output_obj"
+      elif test "$compiler_c_o" = yes; then
+	output_obj="$obj"
+	command="$command -o $output_obj"
+      fi
+
+      $run $rm "$output_obj"
+      $show "$command"
+      if $run eval "$command"; then :
+      else
+	test -n "$output_obj" && $run $rm $removelist
+	exit 1
+      fi
+
+      if test "$need_locks" = warn &&
+	 test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+	echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$run $rm $removelist
+	exit 1
+      fi
+
+      # Just move the object if needed, then go on to compile the next one
+      if test x"$output_obj" != x"$libobj"; then
+	$show "$mv $output_obj $libobj"
+	if $run $mv $output_obj $libobj; then :
+	else
+	  error=$?
+	  $run $rm $removelist
+	  exit $error
+	fi
       fi
 
       # If we have no pic_flag, then copy the object into place and finish.
-      if test -z "$pic_flag"; then
-        $show "$LN_S $obj $libobj"
-        $run $LN_S $obj $libobj
-        exit $?
-      fi
+      if (test -z "$pic_flag" || test "$pic_mode" != default) &&
+	 test "$build_old_libs" = yes; then
+	# Rename the .lo from within objdir to obj
+	if test -f $obj; then
+	  $show $rm $obj
+	  $run $rm $obj
+	fi
 
-      # Just move the object, then go on to compile the next one
-      $show "$mv $obj $libobj"
-      $run $mv $obj $libobj || exit 1
+	$show "$mv $libobj $obj"
+	if $run $mv $libobj $obj; then :
+	else
+	  error=$?
+	  $run $rm $removelist
+	  exit $error
+	fi
+
+	xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+	if test "X$xdir" = "X$obj"; then
+	  xdir="."
+	else
+	  xdir="$xdir"
+	fi
+	baseobj=`$echo "X$obj" | $Xsed -e "s%.*/%%"`
+	libobj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+	# Now arrange that obj and lo_libobj become the same file
+	$show "(cd $xdir && $LN_S $baseobj $libobj)"
+	if $run eval '(cd $xdir && $LN_S $baseobj $libobj)'; then
+	  # Unlock the critical section if it was locked
+	  if test "$need_locks" != no; then
+	    $run $rm "$lockfile"
+	  fi
+	  exit 0
+	else
+	  error=$?
+	  $run $rm $removelist
+	  exit $error
+	fi
+      fi
 
       # Allow error messages only from the first compilation.
       suppress_output=' >/dev/null 2>&1'
@@ -345,64 +655,170 @@
 
     # Only build a position-dependent object if we build old libraries.
     if test "$build_old_libs" = yes; then
-      # Suppress compiler output if we already did a PIC compilation.
-      $show "$base_compile $srcfile$suppress_output"
-      if $run eval "$base_compile \$srcfile$suppress_output"; then :
+      if test "$pic_mode" != yes; then
+	# Don't build PIC code
+	command="$base_compile $srcfile"
       else
-        $run $rm $obj $libobj
-        exit 1
+	# All platforms use -DPIC, to notify preprocessed assembler code.
+	command="$base_compile $srcfile $pic_flag -DPIC"
+      fi
+      if test "$compiler_c_o" = yes; then
+	command="$command -o $obj"
+	output_obj="$obj"
+      fi
+
+      # Suppress compiler output if we already did a PIC compilation.
+      command="$command$suppress_output"
+      $run $rm "$output_obj"
+      $show "$command"
+      if $run eval "$command"; then :
+      else
+	$run $rm $removelist
+	exit 1
+      fi
+
+      if test "$need_locks" = warn &&
+	 test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+	echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together.  If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+	$run $rm $removelist
+	exit 1
+      fi
+
+      # Just move the object if needed
+      if test x"$output_obj" != x"$obj"; then
+	$show "$mv $output_obj $obj"
+	if $run $mv $output_obj $obj; then :
+	else
+	  error=$?
+	  $run $rm $removelist
+	  exit $error
+	fi
+      fi
+
+      # Create an invalid libtool object if no PIC, so that we do not
+      # accidentally link it into a program.
+      if test "$build_libtool_libs" != yes; then
+	$show "echo timestamp > $libobj"
+	$run eval "echo timestamp > \$libobj" || exit $?
+      else
+	# Move the .lo from within objdir
+	$show "$mv $libobj $lo_libobj"
+	if $run $mv $libobj $lo_libobj; then :
+	else
+	  error=$?
+	  $run $rm $removelist
+	  exit $error
+	fi
       fi
     fi
 
-    # Create an invalid libtool object if no PIC, so that we do not
-    # accidentally link it into a program.
-    if test "$build_libtool_libs" != yes; then
-      $show "echo timestamp > $libobj"
-      $run eval "echo timestamp > \$libobj" || exit $?
+    # Unlock the critical section if it was locked
+    if test "$need_locks" != no; then
+      $run $rm "$lockfile"
     fi
 
     exit 0
     ;;
 
   # libtool link mode
-  link)
+  link | relink)
     modename="$modename: link"
-    CC="$nonopt"
-    allow_undefined=yes
-    compile_command="$CC"
-    finalize_command="$CC"
+    case $host in
+    *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+      # It is impossible to link a dll without this setting, and
+      # we shouldn't force the makefile maintainer to figure out
+      # which system we are compiling for in order to pass an extra
+      # flag for every libtool invocation.
+      # allow_undefined=no
 
+      # FIXME: Unfortunately, there are problems with the above when trying
+      # to make a dll which has undefined symbols, in which case not
+      # even a static library is built.  For now, we need to specify
+      # -no-undefined on the libtool link line when we can be certain
+      # that all symbols are satisfied, otherwise we get a static library.
+      allow_undefined=yes
+      ;;
+    *)
+      allow_undefined=yes
+      ;;
+    esac
+    libtool_args="$nonopt"
+    compile_command="$nonopt"
+    finalize_command="$nonopt"
+
+    compile_rpath=
+    finalize_rpath=
     compile_shlibpath=
     finalize_shlibpath=
+    convenience=
+    old_convenience=
     deplibs=
+    old_deplibs=
+    compiler_flags=
+    linker_flags=
+    dllsearchpath=
+    lib_search_path=`pwd`
+
+    avoid_version=no
     dlfiles=
     dlprefiles=
+    dlself=no
     export_dynamic=no
-    hardcode_libdirs=
+    export_symbols=
+    export_symbols_regex=
+    generated=
     libobjs=
-    link_against_libtool_libs=
     ltlibs=
+    module=no
+    no_install=no
     objs=
+    prefer_static_libs=no
+    preload=no
     prev=
     prevarg=
     release=
     rpath=
+    xrpath=
     perm_rpath=
     temp_rpath=
+    thread_safe=no
     vinfo=
 
     # We need to know -static, to get the right output filenames.
     for arg
     do
-      case "$arg" in
+      case $arg in
       -all-static | -static)
-        if test "X$arg" = "X-all-static" && test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+	if test "X$arg" = "X-all-static"; then
+	  if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
 	    $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
-        fi
-        build_libtool_libs=no
+	  fi
+	  if test -n "$link_static_flag"; then
+	    dlopen_self=$dlopen_self_static
+	  fi
+	else
+	  if test -z "$pic_flag" && test -n "$link_static_flag"; then
+	    dlopen_self=$dlopen_self_static
+	  fi
+	fi
+	build_libtool_libs=no
 	build_old_libs=yes
-        break
-        ;;
+	prefer_static_libs=yes
+	break
+	;;
       esac
     done
 
@@ -410,55 +826,141 @@
     test -n "$old_archive_from_new_cmds" && build_old_libs=yes
 
     # Go through the arguments, transforming them on the way.
-    for arg
-    do
+    while test $# -gt 0; do
+      arg="$1"
+      shift
+      case $arg in
+      *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+	qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test
+	;;
+      *) qarg=$arg ;;
+      esac
+      libtool_args="$libtool_args $qarg"
+
       # If the previous option needs an argument, assign it.
       if test -n "$prev"; then
-        case "$prev" in
-        output)
-          compile_command="$compile_command @OUTPUT@"
-          finalize_command="$finalize_command @OUTPUT@"
-          ;;
-        esac
+	case $prev in
+	output)
+	  compile_command="$compile_command @OUTPUT@"
+	  finalize_command="$finalize_command @OUTPUT@"
+	  ;;
+	esac
 
-        case "$prev" in
-        dlfiles|dlprefiles)
-          case "$arg" in
-          *.la | *.lo) ;;  # We handle these cases below.
-          *)
-            dlprefiles="$dlprefiles $arg"
-            test "$prev" = dlfiles && dlfiles="$dlfiles $arg"
-            prev=
-            ;;
-          esac
-          ;;
+	case $prev in
+	dlfiles|dlprefiles)
+	  if test "$preload" = no; then
+	    # Add the symbol object into the linking commands.
+	    compile_command="$compile_command @SYMFILE@"
+	    finalize_command="$finalize_command @SYMFILE@"
+	    preload=yes
+	  fi
+	  case $arg in
+	  *.la | *.lo) ;;  # We handle these cases below.
+	  force)
+	    if test "$dlself" = no; then
+	      dlself=needless
+	      export_dynamic=yes
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  self)
+	    if test "$prev" = dlprefiles; then
+	      dlself=yes
+	    elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+	      dlself=yes
+	    else
+	      dlself=needless
+	      export_dynamic=yes
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  *)
+	    if test "$prev" = dlfiles; then
+	      dlfiles="$dlfiles $arg"
+	    else
+	      dlprefiles="$dlprefiles $arg"
+	    fi
+	    prev=
+	    continue
+	    ;;
+	  esac
+	  ;;
+	expsyms)
+	  export_symbols="$arg"
+	  if test ! -f "$arg"; then
+	    $echo "$modename: symbol file \`$arg' does not exist"
+	    exit 1
+	  fi
+	  prev=
+	  continue
+	  ;;
+	expsyms_regex)
+	  export_symbols_regex="$arg"
+	  prev=
+	  continue
+	  ;;
 	release)
 	  release="-$arg"
 	  prev=
 	  continue
 	  ;;
-        rpath)
-          rpath="$rpath $arg"
+	rpath | xrpath)
+	  # We need an absolute path.
+	  case $arg in
+	  [\\/]* | [A-Za-z]:[\\/]*) ;;
+	  *)
+	    $echo "$modename: only absolute run-paths are allowed" 1>&2
+	    exit 1
+	    ;;
+	  esac
+	  if test "$prev" = rpath; then
+	    case "$rpath " in
+	    *" $arg "*) ;;
+	    *) rpath="$rpath $arg" ;;
+	    esac
+	  else
+	    case "$xrpath " in
+	    *" $arg "*) ;;
+	    *) xrpath="$xrpath $arg" ;;
+	    esac
+	  fi
 	  prev=
 	  continue
 	  ;;
-        *)
-          eval "$prev=\"\$arg\""
-          prev=
-          continue
-          ;;
-        esac
-      fi
+	xcompiler)
+	  compiler_flags="$compiler_flags $qarg"
+	  prev=
+	  compile_command="$compile_command $qarg"
+	  finalize_command="$finalize_command $qarg"
+	  continue
+	  ;;
+	xlinker)
+	  linker_flags="$linker_flags $qarg"
+	  compiler_flags="$compiler_flags $wl$qarg"
+	  prev=
+	  compile_command="$compile_command $wl$qarg"
+	  finalize_command="$finalize_command $wl$qarg"
+	  continue
+	  ;;
+	*)
+	  eval "$prev=\"\$arg\""
+	  prev=
+	  continue
+	  ;;
+	esac
+      fi # test -n $prev
 
       prevarg="$arg"
 
-      case "$arg" in
+      case $arg in
       -all-static)
 	if test -n "$link_static_flag"; then
-          compile_command="$compile_command $link_static_flag"
+	  compile_command="$compile_command $link_static_flag"
 	  finalize_command="$finalize_command $link_static_flag"
-        fi
-        continue
+	fi
+	continue
 	;;
 
       -allow-undefined)
@@ -467,46 +969,134 @@
 	continue
 	;;
 
+      -avoid-version)
+	avoid_version=yes
+	continue
+	;;
+
       -dlopen)
-        prev=dlfiles
-        continue
-        ;;
+	prev=dlfiles
+	continue
+	;;
 
       -dlpreopen)
-        prev=dlprefiles
-        continue
-        ;;
+	prev=dlprefiles
+	continue
+	;;
 
       -export-dynamic)
-        if test "$export_dynamic" != yes; then
-          export_dynamic=yes
-	  if test -n "$export_dynamic_flag_spec"; then
-	    eval arg=\"$export_dynamic_flag_spec\"
-	  else
-	    arg=
-	  fi
+	export_dynamic=yes
+	continue
+	;;
 
-          # Add the symbol object into the linking commands.
-	  compile_command="$compile_command @SYMFILE@"
-	  finalize_command="$finalize_command @SYMFILE@"
-        fi
-        ;;
+      -export-symbols | -export-symbols-regex)
+	if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+	  $echo "$modename: more than one -exported-symbols argument is not allowed"
+	  exit 1
+	fi
+	if test "X$arg" = "X-export-symbols"; then
+	  prev=expsyms
+	else
+	  prev=expsyms_regex
+	fi
+	continue
+	;;
+
+      # The native IRIX linker understands -LANG:*, -LIST:* and -LNO:*,
+      # so if we see these flags, be careful not to treat them like -L.
+      -L[A-Z][A-Z]*:*)
+	case $with_gcc/$host in
+	no/*-*-irix* | no/*-*-nonstopux*)
+	  compile_command="$compile_command $arg"
+	  finalize_command="$finalize_command $arg"
+	  ;;
+	esac
+	continue
+	;;
 
       -L*)
-        dir=`$echo "X$arg" | $Xsed -e 's%^-L\(.*\)$%\1%'`
-        case "$dir" in
-        /* | [A-Za-z]:\\*)
-	  # Add the corresponding hardcode_libdir_flag, if it is not identical.
-          ;;
-        *)
-          $echo "$modename: \`-L$dir' cannot specify a relative directory" 1>&2
-          exit 1
-          ;;
-        esac
-        deplibs="$deplibs $arg"
-        ;;
+	dir=`$echo "X$arg" | $Xsed -e 's/^-L//'`
+	# We need an absolute path.
+	case $dir in
+	[\\/]* | [A-Za-z]:[\\/]*) ;;
+	*)
+	  absdir=`cd "$dir" && pwd`
+	  if test -z "$absdir"; then
+	    $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
+	    exit 1
+	  fi
+	  dir="$absdir"
+	  ;;
+	esac
+	case "$deplibs " in
+	*" -L$dir "*) ;;
+	*)
+	  deplibs="$deplibs -L$dir"
+	  lib_search_path="$lib_search_path $dir"
+	  ;;
+	esac
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+	  case :$dllsearchpath: in
+	  *":$dir:"*) ;;
+	  *) dllsearchpath="$dllsearchpath:$dir";;
+	  esac
+	  ;;
+	esac
+	continue
+	;;
 
-      -l*) deplibs="$deplibs $arg" ;;
+      -l*)
+	if test "X$arg" = "X-lc" || test "X$arg" = "X-lm"; then
+	  case $host in
+	  *-*-cygwin* | *-*-pw32* | *-*-beos*)
+	    # These systems don't actually have a C or math library (as such)
+	    continue
+	    ;;
+	  *-*-mingw* | *-*-os2*)
+	    # These systems don't actually have a C library (as such)
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  *-*-openbsd* | *-*-freebsd*)
+	    # Do not include libc due to us having libc/libc_r.
+	    test "X$arg" = "X-lc" && continue
+	    ;;
+	  esac
+	 elif test "X$arg" = "X-lc_r"; then
+	  case $host in
+	 *-*-openbsd* | *-*-freebsd*)
+	    # Do not include libc_r directly, use -pthread flag.
+	    continue
+	    ;;
+	  esac
+	fi
+	deplibs="$deplibs $arg"
+	continue
+	;;
+
+      -module)
+	module=yes
+	continue
+	;;
+
+      -no-fast-install)
+	fast_install=no
+	continue
+	;;
+
+      -no-install)
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+	  # The PATH hackery in wrapper scripts is required on Windows
+	  # in order for the loader to find any dlls it needs.
+	  $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2
+	  $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2
+	  fast_install=no
+	  ;;
+	*) no_install=yes ;;
+	esac
+	continue
+	;;
 
       -no-undefined)
 	allow_undefined=no
@@ -521,46 +1111,116 @@
 	;;
 
       -rpath)
-        prev=rpath
-        continue
-        ;;
+	prev=rpath
+	continue
+	;;
+
+      -R)
+	prev=xrpath
+	continue
+	;;
+
+      -R*)
+	dir=`$echo "X$arg" | $Xsed -e 's/^-R//'`
+	# We need an absolute path.
+	case $dir in
+	[\\/]* | [A-Za-z]:[\\/]*) ;;
+	*)
+	  $echo "$modename: only absolute run-paths are allowed" 1>&2
+	  exit 1
+	  ;;
+	esac
+	case "$xrpath " in
+	*" $dir "*) ;;
+	*) xrpath="$xrpath $dir" ;;
+	esac
+	continue
+	;;
 
       -static)
-	# If we have no pic_flag, then this is the same as -all-static.
-	if test -z "$pic_flag" && test -n "$link_static_flag"; then
-          compile_command="$compile_command $link_static_flag"
-	  finalize_command="$finalize_command $link_static_flag"
-        fi
+	# The effects of -static are defined in a previous loop.
+	# We used to do the same as -all-static on platforms that
+	# didn't have a PIC flag, but the assumption that the effects
+	# would be equivalent was wrong.  It would break on at least
+	# Digital Unix and AIX.
+	continue
+	;;
+
+      -thread-safe)
+	thread_safe=yes
 	continue
 	;;
 
       -version-info)
-        prev=vinfo
-        continue
-        ;;
+	prev=vinfo
+	continue
+	;;
+
+      -Wc,*)
+	args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'`
+	arg=
+	save_ifs="$IFS"; IFS=','
+	for flag in $args; do
+	  IFS="$save_ifs"
+	  case $flag in
+	    *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+	    flag="\"$flag\""
+	    ;;
+	  esac
+	  arg="$arg $wl$flag"
+	  compiler_flags="$compiler_flags $flag"
+	done
+	IFS="$save_ifs"
+	arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+	;;
+
+      -Wl,*)
+	args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'`
+	arg=
+	save_ifs="$IFS"; IFS=','
+	for flag in $args; do
+	  IFS="$save_ifs"
+	  case $flag in
+	    *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
+	    flag="\"$flag\""
+	    ;;
+	  esac
+	  arg="$arg $wl$flag"
+	  compiler_flags="$compiler_flags $wl$flag"
+	  linker_flags="$linker_flags $flag"
+	done
+	IFS="$save_ifs"
+	arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+	;;
+
+      -Xcompiler)
+	prev=xcompiler
+	continue
+	;;
+
+      -Xlinker)
+	prev=xlinker
+	continue
+	;;
 
       # Some other compiler flag.
       -* | +*)
 	# Unknown arguments in both finalize_command and compile_command need
 	# to be aesthetically quoted because they are evaled later.
 	arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-	case "$arg" in
-	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
+	case $arg in
+	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
 	  arg="\"$arg\""
 	  ;;
 	esac
-        ;;
+	;;
 
-      *.o | *.a)
-        # A standard object.
-        objs="$objs $arg"
-        ;;
-
-      *.lo)
-        # A library object.
+      *.lo | *.$objext)
+	# A library or standard object.
 	if test "$prev" = dlfiles; then
-	  dlfiles="$dlfiles $arg"
-	  if test "$build_libtool_libs" = yes; then
+	  # This file was specified with -dlopen.
+	  if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+	    dlfiles="$dlfiles $arg"
 	    prev=
 	    continue
 	  else
@@ -571,230 +1231,59 @@
 
 	if test "$prev" = dlprefiles; then
 	  # Preload the old-style object.
-	  dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e 's/\.lo$/\.o/'`
+	  dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e "$lo2o"`
 	  prev=
+	else
+	  case $arg in
+	  *.lo) libobjs="$libobjs $arg" ;;
+	  *) objs="$objs $arg" ;;
+	  esac
 	fi
-	libobjs="$libobjs $arg"
-        ;;
+	;;
+
+      *.$libext)
+	# An archive.
+	deplibs="$deplibs $arg"
+	old_deplibs="$old_deplibs $arg"
+	continue
+	;;
 
       *.la)
-        # A libtool-controlled library.
+	# A libtool-controlled library.
 
-        dlname=
-        libdir=
-        library_names=
-        old_library=
-
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $arg | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$arg' is not a valid libtool archive" 1>&2
-          exit 1
-        fi
-
-        # If there is no directory component, then add one.
-        case "$arg" in
-        */* | *\\*) . $arg ;;
-        *) . ./$arg ;;
-        esac
-
-        if test -z "$libdir"; then
-          $echo "$modename: \`$arg' contains no -rpath information" 1>&2
-          exit 1
-        fi
-
-        # Get the name of the library we link against.
-        linklib=
-        for l in $old_library $library_names; do
-          linklib="$l"
-        done
-
-        if test -z "$linklib"; then
-          $echo "$modename: cannot find name of link library for \`$arg'" 1>&2
-          exit 1
-        fi
-
-        # Find the relevant object directory and library name.
-        name=`$echo "X$arg" | $Xsed -e 's%^.*/%%' -e 's/\.la$//' -e 's/^lib//'`
-        dir=`$echo "X$arg" | $Xsed -e 's%/[^/]*$%%'`
-        if test "X$dir" = "X$arg"; then
-          dir="$objdir"
-        else
-          dir="$dir/$objdir"
-        fi
-
-        # This library was specified with -dlopen.
-        if test "$prev" = dlfiles; then
-          dlfiles="$dlfiles $arg"
-          if test -z "$dlname"; then
-            # If there is no dlname, we need to preload.
-            prev=dlprefiles
-          else
-            # We should not create a dependency on this library, but we
-	    # may need any libraries it requires.
-	    compile_command="$compile_command$dependency_libs"
-	    finalize_command="$finalize_command$dependency_libs"
-            prev=
-            continue
-          fi
-        fi
-
-        # The library was specified with -dlpreopen.
-        if test "$prev" = dlprefiles; then
-          # Prefer using a static library (so that no silly _DYNAMIC symbols
-          # are required to link).
-          if test -n "$old_library"; then
-            dlprefiles="$dlprefiles $dir/$old_library"
-          else
-            dlprefiles="$dlprefiles $dir/$linklib"
-          fi
-          prev=
-        fi
-
-        if test "$build_libtool_libs" = yes && test -n "$library_names"; then
-          link_against_libtool_libs="$link_against_libtool_libs $arg"
-          if test -n "$shlibpath_var"; then
-            # Make sure the rpath contains only unique directories.
-            case "$temp_rpath " in
-            *" $dir "*) ;;
-            *) temp_rpath="$temp_rpath $dir" ;;
-            esac
-          fi
-
-	  # This is the magic to use -rpath.
-          if test -n "$hardcode_libdir_flag_spec"; then
-            if test -n "$hardcode_libdir_separator"; then
-              if test -z "$hardcode_libdirs"; then
-                # Put the magic libdir with the hardcode flag.
-                hardcode_libdirs="$libdir"
-                libdir="@HARDCODE_LIBDIRS@"
-              else
-                # Just accumulate the unique libdirs.
-		case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
-		*"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
-		  ;;
-		*)
-		  hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
-		  ;;
-		esac
-                libdir=
-              fi
-            fi
-
-            if test -n "$libdir"; then
-              eval flag=\"$hardcode_libdir_flag_spec\"
-
-              compile_command="$compile_command $flag"
-              finalize_command="$finalize_command $flag"
-            fi
-          elif test -n "$runpath_var"; then
-            # Do the same for the permanent run path.
-            case "$perm_rpath " in
-            *" $libdir "*) ;;
-            *) perm_rpath="$perm_rpath $libdir" ;;
-            esac
-          fi
-
-
-          case "$hardcode_action" in
-          immediate)
-            if test "$hardcode_direct" = no; then
-              compile_command="$compile_command $dir/$linklib"
-            elif test "$hardcode_minus_L" = no; then
-              compile_command="$compile_command -L$dir -l$name"
-            elif test "$hardcode_shlibpath_var" = no; then
-              compile_shlibpath="$compile_shlibpath$dir:"
-              compile_command="$compile_command -l$name"
-            fi
-            ;;
-
-          relink)
-            # We need an absolute path.
-            case "$dir" in
-            /* | [A-Za-z]:\\*) ;;
-            *)
-              absdir=`cd "$dir" && pwd`
-              if test -z "$absdir"; then
-                $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
-                exit 1
-              fi
-              dir="$absdir"
-              ;;
-            esac
-
-            if test "$hardcode_direct" = yes; then
-              compile_command="$compile_command $dir/$linklib"
-            elif test "$hardcode_minus_L" = yes; then
-              compile_command="$compile_command -L$dir -l$name"
-            elif test "$hardcode_shlibpath_var" = yes; then
-              compile_shlibpath="$compile_shlibpath$dir:"
-              compile_command="$compile_command -l$name"
-            fi
-            ;;
-
-          *)
-            $echo "$modename: \`$hardcode_action' is an unknown hardcode action" 1>&2
-            exit 1
-            ;;
-          esac
-
-          # Finalize command for both is simple: just hardcode it.
-          if test "$hardcode_direct" = yes; then
-            finalize_command="$finalize_command $libdir/$linklib"
-          elif test "$hardcode_minus_L" = yes; then
-            finalize_command="$finalize_command -L$libdir -l$name"
-          elif test "$hardcode_shlibpath_var" = yes; then
-            finalize_shlibpath="$finalize_shlibpath$libdir:"
-            finalize_command="$finalize_command -l$name"
-          else
-            # We cannot seem to hardcode it, guess we'll fake it.
-            finalize_command="$finalize_command -L$libdir -l$name"
-          fi
-        else
-          # Transform directly to old archives if we don't build new libraries.
-          if test -n "$pic_flag" && test -z "$old_library"; then
-            $echo "$modename: cannot find static library for \`$arg'" 1>&2
-            exit 1
-          fi
-
-	  # Here we assume that one of hardcode_direct or hardcode_minus_L
-	  # is not unsupported.  This is valid on all known static and
-	  # shared platforms.
-	  if test "$hardcode_direct" != unsupported; then
-	    test -n "$old_library" && linklib="$old_library"
-	    compile_command="$compile_command $dir/$linklib"
-	    finalize_command="$finalize_command $dir/$linklib"
-	  else
-	    compile_command="$compile_command -L$dir -l$name"
-	    finalize_command="$finalize_command -L$dir -l$name"
-	  fi
-        fi
-
-	# Add in any libraries that this one depends upon.
-	compile_command="$compile_command$dependency_libs"
-	finalize_command="$finalize_command$dependency_libs"
+	if test "$prev" = dlfiles; then
+	  # This library was specified with -dlopen.
+	  dlfiles="$dlfiles $arg"
+	  prev=
+	elif test "$prev" = dlprefiles; then
+	  # The library was specified with -dlpreopen.
+	  dlprefiles="$dlprefiles $arg"
+	  prev=
+	else
+	  deplibs="$deplibs $arg"
+	fi
 	continue
-        ;;
+	;;
 
       # Some other compiler argument.
       *)
 	# Unknown arguments in both finalize_command and compile_command need
 	# to be aesthetically quoted because they are evaled later.
 	arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-	case "$arg" in
-	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
+	case $arg in
+	*[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*|"")
 	  arg="\"$arg\""
 	  ;;
 	esac
-        ;;
-      esac
+	;;
+      esac # arg
 
       # Now actually substitute the argument into the commands.
       if test -n "$arg"; then
 	compile_command="$compile_command $arg"
 	finalize_command="$finalize_command $arg"
       fi
-    done
+    done # argument parsing loop
 
     if test -n "$prev"; then
       $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
@@ -802,79 +1291,910 @@
       exit 1
     fi
 
-    if test -n "$vinfo" && test -n "$release"; then
-      $echo "$modename: you cannot specify both \`-version-info' and \`-release'" 1>&2
-      $echo "$help" 1>&2
-      exit 1
+    if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+      eval arg=\"$export_dynamic_flag_spec\"
+      compile_command="$compile_command $arg"
+      finalize_command="$finalize_command $arg"
     fi
 
-    oldlib=
-    oldobjs=
-    case "$output" in
+    # calculate the name of the file, without its directory
+    outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'`
+    libobjs_save="$libobjs"
+
+    if test -n "$shlibpath_var"; then
+      # get the directories listed in $shlibpath_var
+      eval shlib_search_path=\`\$echo \"X\${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\`
+    else
+      shlib_search_path=
+    fi
+    eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+    eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+
+    output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'`
+    if test "X$output_objdir" = "X$output"; then
+      output_objdir="$objdir"
+    else
+      output_objdir="$output_objdir/$objdir"
+    fi
+    # Create the object directory (path quoted so it is always passed as one word).
+    if test ! -d "$output_objdir"; then
+      $show "$mkdir $output_objdir"
+      $run $mkdir "$output_objdir"
+      status=$?
+      if test $status -ne 0 && test ! -d "$output_objdir"; then # tolerate failure if the directory exists anyway
+	exit $status
+      fi
+    fi
+
+    # Determine the type of output
+    case $output in
     "")
       $echo "$modename: you must specify an output file" 1>&2
       $echo "$help" 1>&2
       exit 1
       ;;
+    *.$libext) linkmode=oldlib ;;
+    *.lo | *.$objext) linkmode=obj ;;
+    *.la) linkmode=lib ;;
+    *) linkmode=prog ;; # Anything else should be a program.
+    esac
 
-    */* | *\\*)
-      $echo "$modename: output file \`$output' must have no directory components" 1>&2
-      exit 1
-      ;;
+    specialdeplibs=
+    libs=
+    # Find all interdependent deplibs by searching for libraries
+    # that are linked more than once (e.g. -la -lb -la)
+    for deplib in $deplibs; do
+      if test "X$duplicate_deps" = "Xyes" ; then
+	case "$libs " in
+	*" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+	esac
+      fi
+      libs="$libs $deplib"
+    done
+    deplibs=
+    newdependency_libs=
+    newlib_search_path=
+    need_relink=no # whether we're linking any uninstalled libtool libraries
+    notinst_deplibs= # not-installed libtool libraries
+    notinst_path= # paths that contain not-installed libtool libraries
+    case $linkmode in # choose which passes the deplib loop below will run
+    lib)
+	passes="conv link"
+	for file in $dlfiles $dlprefiles; do
+	  case $file in
+	  *.la) ;; # libtool archives are accepted
+	  *)
+	    $echo "$modename: libraries can \`-dlopen' only libtool libraries: $file" 1>&2
+	    exit 1
+	    ;;
+	  esac
+	done
+	;;
+    prog)
+	compile_deplibs=
+	finalize_deplibs=
+	alldeplibs=no # switched to yes when the %DEPLIBS% marker is reached in the link pass
+	newdlfiles=
+	newdlprefiles=
+	passes="conv scan dlopen dlpreopen link"
+	;;
+    *)  passes="conv" # remaining modes (oldlib, obj) only run the conv pass
+	;;
+    esac
+    for pass in $passes; do
+      if test $linkmode = prog; then
+	# Determine which files to process
+	case $pass in
+	dlopen)
+	  libs="$dlfiles"
+	  save_deplibs="$deplibs" # Collect dlpreopened libraries
+	  deplibs=
+	  ;;
+	dlpreopen) libs="$dlprefiles" ;;
+	link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+	esac
+      fi
+      for deplib in $libs; do
+	lib=
+	found=no
+	case $deplib in
+	-l*)
+	  if test $linkmode = oldlib || test $linkmode = obj; then # either mode ignores -l (was &&, never true)
+	    $echo "$modename: warning: \`-l' is ignored for archives/objects: $deplib" 1>&2
+	    continue
+	  fi
+	  if test $pass = conv; then
+	    deplibs="$deplib $deplibs"
+	    continue
+	  fi
+	  name=`$echo "X$deplib" | $Xsed -e 's/^-l//'`
+	  for searchdir in $newlib_search_path $lib_search_path $sys_lib_search_path $shlib_search_path; do
+	    # Search the libtool library
+	    lib="$searchdir/lib${name}.la"
+	    if test -f "$lib"; then
+	      found=yes
+	      break
+	    fi
+	  done
+	  if test "$found" != yes; then
+	    # deplib doesn't seem to be a libtool library
+	    if test "$linkmode,$pass" = "prog,link"; then
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    else
+	      deplibs="$deplib $deplibs"
+	      test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs"
+	    fi
+	    continue
+	  fi
+	  ;; # -l
+	-L*)
+	  case $linkmode in
+	  lib)
+	    deplibs="$deplib $deplibs"
+	    test $pass = conv && continue
+	    newdependency_libs="$deplib $newdependency_libs"
+	    newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+	    ;;
+	  prog)
+	    if test $pass = conv; then
+	      deplibs="$deplib $deplibs"
+	      continue
+	    fi
+	    if test $pass = scan; then
+	      deplibs="$deplib $deplibs"
+	      newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+	    else
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    fi
+	    ;;
+	  *)
+	    $echo "$modename: warning: \`-L' is ignored for archives/objects: $deplib" 1>&2
+	    ;;
+	  esac # linkmode
+	  continue
+	  ;; # -L
+	-R*)
+	  if test $pass = link; then
+	    dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'`
+	    # Make sure the xrpath contains only unique directories.
+	    case "$xrpath " in
+	    *" $dir "*) ;;
+	    *) xrpath="$xrpath $dir" ;;
+	    esac
+	  fi
+	  deplibs="$deplib $deplibs"
+	  continue
+	  ;;
+	*.la) lib="$deplib" ;;
+	*.$libext)
+	  if test $pass = conv; then
+	    deplibs="$deplib $deplibs"
+	    continue
+	  fi
+	  case $linkmode in
+	  lib)
+	    if test "$deplibs_check_method" != pass_all; then
+	      echo
+	      echo "*** Warning: Trying to link with static lib archive $deplib."
+	      echo "*** I have the capability to make that library automatically link in when"
+	      echo "*** you link to this library.  But I can only do this if you have a"
+	      echo "*** shared version of the library, which you do not appear to have"
+	      echo "*** because the file extensions .$libext of this argument makes me believe"
+	      echo "*** that it is just a static archive that I should not used here."
+	    else
+	      echo
+	      echo "*** Warning: Linking the shared library $output against the"
+	      echo "*** static library $deplib is not portable!"
+	      deplibs="$deplib $deplibs"
+	    fi
+	    continue
+	    ;;
+	  prog)
+	    if test $pass != link; then
+	      deplibs="$deplib $deplibs"
+	    else
+	      compile_deplibs="$deplib $compile_deplibs"
+	      finalize_deplibs="$deplib $finalize_deplibs"
+	    fi
+	    continue
+	    ;;
+	  esac # linkmode
+	  ;; # *.$libext
+	*.lo | *.$objext)
+	  if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+	    # If there is no dlopen support or we're linking statically,
+	    # we need to preload.
+	    newdlprefiles="$newdlprefiles $deplib"
+	    compile_deplibs="$deplib $compile_deplibs"
+	    finalize_deplibs="$deplib $finalize_deplibs"
+	  else
+	    newdlfiles="$newdlfiles $deplib"
+	  fi
+	  continue
+	  ;;
+	%DEPLIBS%)
+	  alldeplibs=yes
+	  continue
+	  ;;
+	esac # case $deplib
+	if test $found = yes || test -f "$lib"; then :
+	else
+	  $echo "$modename: cannot find the library \`$lib'" 1>&2
+	  exit 1
+	fi
 
-    *.a)
+	# Check to see that this really is a libtool archive.
+	if (${SED} -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+	else
+	  $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+	  exit 1
+	fi
+
+	ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'`
+	test "X$ladir" = "X$lib" && ladir="."
+
+	dlname=
+	dlopen=
+	dlpreopen=
+	libdir=
+	library_names=
+	old_library=
+	# If the library was installed with an old release of libtool,
+	# it will not redefine variable installed.
+	installed=yes
+
+	# Read the .la file
+	case $lib in
+	*/* | *\\*) . $lib ;;
+	*) . ./$lib ;;
+	esac
+
+	if test "$linkmode,$pass" = "lib,link" ||
+	   test "$linkmode,$pass" = "prog,scan" ||
+	   { test $linkmode = oldlib || test $linkmode = obj; }; then
+	  # Add dl[pre]opened files of deplib; archives and objects only run the conv pass
+	  test -n "$dlopen" && dlfiles="$dlfiles $dlopen"
+	  test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen"
+	fi
+
+	if test $pass = conv; then
+	  # Only check for convenience libraries
+	  deplibs="$lib $deplibs"
+	  if test -z "$libdir"; then
+	    if test -z "$old_library"; then
+	      $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+	      exit 1
+	    fi
+	    # It is a libtool convenience library, so add in its objects.
+	    convenience="$convenience $ladir/$objdir/$old_library"
+	    old_convenience="$old_convenience $ladir/$objdir/$old_library"
+	    tmp_libs=
+	    for deplib in $dependency_libs; do
+	      deplibs="$deplib $deplibs"
+              if test "X$duplicate_deps" = "Xyes" ; then
+	        case "$tmp_libs " in
+	        *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+	        esac
+              fi
+	      tmp_libs="$tmp_libs $deplib"
+	    done
+	  elif test $linkmode != prog && test $linkmode != lib; then
+	    $echo "$modename: \`$lib' is not a convenience library" 1>&2
+	    exit 1
+	  fi
+	  continue
+	fi # $pass = conv
+
+	# Get the name of the library we link against.
+	linklib=
+	for l in $old_library $library_names; do
+	  linklib="$l" # each word overwrites the previous one, so the last word listed wins
+	done
+	if test -z "$linklib"; then # both $old_library and $library_names were empty
+	  $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+	  exit 1
+	fi
+
+	# This library was specified with -dlopen.
+	if test $pass = dlopen; then
+	  if test -z "$libdir"; then
+	    $echo "$modename: cannot -dlopen a convenience library: \`$lib'" 1>&2
+	    exit 1
+	  fi
+	  if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+	    # If there is no dlname, no dlopen support or we're linking
+	    # statically, we need to preload.
+	    dlprefiles="$dlprefiles $lib"
+	  else
+	    newdlfiles="$newdlfiles $lib"
+	  fi
+	  continue
+	fi # $pass = dlopen
+
+	# We need an absolute path.
+	case $ladir in
+	[\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+	*)
+	  abs_ladir=`cd "$ladir" && pwd`
+	  if test -z "$abs_ladir"; then
+	    $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2
+	    $echo "$modename: passing it literally to the linker, although it might fail" 1>&2
+	    abs_ladir="$ladir"
+	  fi
+	  ;;
+	esac
+	laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+
+	# Find the relevant object directory and library name.
+	if test "X$installed" = Xyes; then
+	  if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+	    $echo "$modename: warning: library \`$lib' was moved." 1>&2
+	    dir="$ladir"
+	    absdir="$abs_ladir"
+	    libdir="$abs_ladir"
+	  else
+	    dir="$libdir"
+	    absdir="$libdir"
+	  fi
+	else
+	  dir="$ladir/$objdir"
+	  absdir="$abs_ladir/$objdir"
+	  # Remove this search path later
+	  notinst_path="$notinst_path $abs_ladir"
+	fi # $installed = yes
+	name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+
+	# This library was specified with -dlpreopen.
+	if test $pass = dlpreopen; then
+	  if test -z "$libdir"; then
+	    $echo "$modename: cannot -dlpreopen a convenience library: \`$lib'" 1>&2
+	    exit 1
+	  fi
+	  # Prefer using a static library (so that no silly _DYNAMIC symbols
+	  # are required to link).
+	  if test -n "$old_library"; then
+	    newdlprefiles="$newdlprefiles $dir/$old_library"
+	  # Otherwise, use the dlname, so that lt_dlopen finds it.
+	  elif test -n "$dlname"; then
+	    newdlprefiles="$newdlprefiles $dir/$dlname"
+	  else
+	    newdlprefiles="$newdlprefiles $dir/$linklib"
+	  fi
+	fi # $pass = dlpreopen
+
+	if test -z "$libdir"; then
+	  # Link the convenience library
+	  if test $linkmode = lib; then
+	    deplibs="$dir/$old_library $deplibs"
+	  elif test "$linkmode,$pass" = "prog,link"; then
+	    compile_deplibs="$dir/$old_library $compile_deplibs"
+	    finalize_deplibs="$dir/$old_library $finalize_deplibs"
+	  else
+	    deplibs="$lib $deplibs"
+	  fi
+	  continue
+	fi
+
+	if test $linkmode = prog && test $pass != link; then
+	  newlib_search_path="$newlib_search_path $ladir"
+	  deplibs="$lib $deplibs"
+
+	  linkalldeplibs=no
+	  if test "$link_all_deplibs" != no || test -z "$library_names" ||
+	     test "$build_libtool_libs" = no; then
+	    linkalldeplibs=yes
+	  fi
+
+	  tmp_libs=
+	  for deplib in $dependency_libs; do
+	    case $deplib in
+	    -L*) newlib_search_path="$newlib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test
+	    esac
+	    # Need to link against all dependency_libs?
+	    if test $linkalldeplibs = yes; then
+	      deplibs="$deplib $deplibs"
+	    else
+	      # Need to hardcode shared library paths
+	      # or/and link against static libraries
+	      newdependency_libs="$deplib $newdependency_libs"
+	    fi
+	    if test "X$duplicate_deps" = "Xyes" ; then
+	      case "$tmp_libs " in
+	      *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+	      esac
+	    fi
+	    tmp_libs="$tmp_libs $deplib"
+	  done # for deplib
+	  continue
+	fi # $linkmode = prog...
+
+	link_static=no # Whether the deplib will be linked statically
+	if test -n "$library_names" &&
+	   { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+	  # Link against this shared library
+
+	  if test "$linkmode,$pass" = "prog,link" ||
+	   { test $linkmode = lib && test $hardcode_into_libs = yes; }; then
+	    # Hardcode the library path.
+	    # Skip directories that are in the system default run-time
+	    # search path.
+	    case " $sys_lib_dlsearch_path " in
+	    *" $absdir "*) ;;
+	    *)
+	      case "$compile_rpath " in
+	      *" $absdir "*) ;;
+	      *) compile_rpath="$compile_rpath $absdir"
+	      esac
+	      ;;
+	    esac
+	    case " $sys_lib_dlsearch_path " in
+	    *" $libdir "*) ;;
+	    *)
+	      case "$finalize_rpath " in
+	      *" $libdir "*) ;;
+	      *) finalize_rpath="$finalize_rpath $libdir"
+	      esac
+	      ;;
+	    esac
+	    if test $linkmode = prog; then
+	      # We need to hardcode the library path
+	      if test -n "$shlibpath_var"; then
+		# Make sure the rpath contains only unique directories.
+		case "$temp_rpath " in
+		*" $dir "*) ;;
+		*" $absdir "*) ;;
+		*) temp_rpath="$temp_rpath $dir" ;;
+		esac
+	      fi
+	    fi
+	  fi # $linkmode,$pass = prog,link...
+
+	  if test "$alldeplibs" = yes &&
+	     { test "$deplibs_check_method" = pass_all ||
+	       { test "$build_libtool_libs" = yes &&
+		 test -n "$library_names"; }; }; then
+	    # We only need to search for static libraries
+	    continue
+	  fi
+
+	  if test "$installed" = no; then
+	    notinst_deplibs="$notinst_deplibs $lib"
+	    need_relink=yes
+	  fi
+
+	  if test -n "$old_archive_from_expsyms_cmds"; then
+	    # figure out the soname
+	    set dummy $library_names
+	    realname="$2"
+	    shift; shift
+	    libname=`eval \\$echo \"$libname_spec\"`
+	    # use dlname if we got it. it's perfectly good, no?
+	    if test -n "$dlname"; then
+	      soname="$dlname"
+	    elif test -n "$soname_spec"; then
+	      # bleh windows
+	      case $host in
+	      *cygwin*)
+		major=`expr $current - $age`
+		versuffix="-$major"
+		;;
+	      esac
+	      eval soname=\"$soname_spec\"
+	    else
+	      soname="$realname"
+	    fi
+
+	    # Make a new name for the extract_expsyms_cmds to use
+	    soroot="$soname"
+	    soname=`echo $soroot | ${SED} -e 's/^.*\///'`
+	    newlib="libimp-`echo $soname | ${SED} 's/^lib//;s/\.dll$//'`.a"
+
+	    # If the library has no export list, then create one now
+	    if test -f "$output_objdir/$soname-def"; then :
+	    else
+	      $show "extracting exported symbol list from \`$soname'"
+	      save_ifs="$IFS"; IFS='~'
+	      eval cmds=\"$extract_expsyms_cmds\"
+	      for cmd in $cmds; do
+		IFS="$save_ifs"
+		$show "$cmd"
+		$run eval "$cmd" || exit $?
+	      done
+	      IFS="$save_ifs"
+	    fi
+
+	    # Create $newlib
+	    if test -f "$output_objdir/$newlib"; then :; else
+	      $show "generating import library for \`$soname'"
+	      save_ifs="$IFS"; IFS='~'
+	      eval cmds=\"$old_archive_from_expsyms_cmds\"
+	      for cmd in $cmds; do
+		IFS="$save_ifs"
+		$show "$cmd"
+		$run eval "$cmd" || exit $?
+	      done
+	      IFS="$save_ifs"
+	    fi
+	    # make sure the library variables are pointing to the new library
+	    dir=$output_objdir
+	    linklib=$newlib
+	  fi # test -n $old_archive_from_expsyms_cmds
+
+	  if test $linkmode = prog || test "$mode" != relink; then
+	    add_shlibpath=
+	    add_dir=
+	    add=
+	    lib_linked=yes
+	    case $hardcode_action in
+	    immediate | unsupported)
+	      if test "$hardcode_direct" = no; then
+		add="$dir/$linklib"
+	      elif test "$hardcode_minus_L" = no; then
+		case $host in
+		*-*-sunos*) add_shlibpath="$dir" ;;
+		esac
+		add_dir="-L$dir"
+		add="-l$name"
+	      elif test "$hardcode_shlibpath_var" = no; then
+		add_shlibpath="$dir"
+		add="-l$name"
+	      else
+		lib_linked=no
+	      fi
+	      ;;
+	    relink)
+	      if test "$hardcode_direct" = yes; then
+		add="$dir/$linklib"
+	      elif test "$hardcode_minus_L" = yes; then
+		add_dir="-L$dir"
+		add="-l$name"
+	      elif test "$hardcode_shlibpath_var" = yes; then
+		add_shlibpath="$dir"
+		add="-l$name"
+	      else
+		lib_linked=no
+	      fi
+	      ;;
+	    *) lib_linked=no ;;
+	    esac
+
+	    if test "$lib_linked" != yes; then # none of the hardcode methods above applied
+	      $echo "$modename: configuration error: unsupported hardcode properties" 1>&2
+	      exit 1
+	    fi
+
+	    if test -n "$add_shlibpath"; then
+	      case :$compile_shlibpath: in
+	      *":$add_shlibpath:"*) ;;
+	      *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;;
+	      esac
+	    fi
+	    if test $linkmode = prog; then
+	      test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+	      test -n "$add" && compile_deplibs="$add $compile_deplibs"
+	    else
+	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
+	      test -n "$add" && deplibs="$add $deplibs"
+	      if test "$hardcode_direct" != yes && \
+		 test "$hardcode_minus_L" != yes && \
+		 test "$hardcode_shlibpath_var" = yes; then
+		case :$finalize_shlibpath: in
+		*":$libdir:"*) ;;
+		*) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+		esac
+	      fi
+	    fi
+	  fi
+
+	  if test $linkmode = prog || test "$mode" = relink; then
+	    add_shlibpath=
+	    add_dir=
+	    add=
+	    # Finalize command for both is simple: just hardcode it.
+	    if test "$hardcode_direct" = yes; then
+	      add="$libdir/$linklib"
+	    elif test "$hardcode_minus_L" = yes; then
+	      add_dir="-L$libdir"
+	      add="-l$name"
+	    elif test "$hardcode_shlibpath_var" = yes; then
+	      case :$finalize_shlibpath: in
+	      *":$libdir:"*) ;;
+	      *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+	      esac
+	      add="-l$name"
+	    else
+	      # We cannot seem to hardcode it, guess we'll fake it.
+	      add_dir="-L$libdir"
+	      add="-l$name"
+	    fi
+
+	    if test $linkmode = prog; then
+	      test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+	      test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+	    else
+	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
+	      test -n "$add" && deplibs="$add $deplibs"
+	    fi
+	  fi
+	elif test $linkmode = prog; then
+	  if test "$alldeplibs" = yes &&
+	     { test "$deplibs_check_method" = pass_all ||
+	       { test "$build_libtool_libs" = yes &&
+		 test -n "$library_names"; }; }; then
+	    # We only need to search for static libraries
+	    continue
+	  fi
+
+	  # Try to link the static library
+	  # Here we assume that one of hardcode_direct or hardcode_minus_L
+	  # is not unsupported.  This is valid on all known static and
+	  # shared platforms.
+	  if test "$hardcode_direct" != unsupported; then
+	    test -n "$old_library" && linklib="$old_library"
+	    compile_deplibs="$dir/$linklib $compile_deplibs"
+	    finalize_deplibs="$dir/$linklib $finalize_deplibs"
+	  else
+	    compile_deplibs="-l$name -L$dir $compile_deplibs"
+	    finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+	  fi
+	elif test "$build_libtool_libs" = yes; then
+	  # Not a shared library
+	  if test "$deplibs_check_method" != pass_all; then
+	    # We're trying to link a shared library against a static one
+	    # but the system doesn't support it.
+
+	    # Just print a warning and add the library to dependency_libs so
+	    # that the program can be linked against the static library.
+	    echo
+	    echo "*** Warning: This system can not link to static lib archive $lib."
+	    echo "*** I have the capability to make that library automatically link in when"
+	    echo "*** you link to this library.  But I can only do this if you have a"
+	    echo "*** shared version of the library, which you do not appear to have."
+	    if test "$module" = yes; then
+	      echo "*** But as you try to build a module library, libtool will still create "
+	      echo "*** a static module, that should work as long as the dlopening application"
+	      echo "*** is linked with the -dlopen flag to resolve symbols at runtime."
+	      if test -z "$global_symbol_pipe"; then
+		echo
+		echo "*** However, this would only work if libtool was able to extract symbol"
+		echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+		echo "*** not find such a program.  So, this module is probably useless."
+		echo "*** \`nm' from GNU binutils and a full rebuild may help."
+	      fi
+	      if test "$build_old_libs" = no; then
+		build_libtool_libs=module
+		build_old_libs=yes
+	      else
+		build_libtool_libs=no
+	      fi
+	    fi
+	  else
+	    convenience="$convenience $dir/$old_library"
+	    old_convenience="$old_convenience $dir/$old_library"
+	    deplibs="$dir/$old_library $deplibs"
+	    link_static=yes
+	  fi
+	fi # link shared/static library?
+
+	if test $linkmode = lib; then
+	  if test -n "$dependency_libs" &&
+	     { test $hardcode_into_libs != yes || test $build_old_libs = yes ||
+	       test $link_static = yes; }; then
+	    # Extract -R from dependency_libs
+	    temp_deplibs=
+	    for libdir in $dependency_libs; do
+	      case $libdir in
+	      -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'`
+		   case " $xrpath " in
+		   *" $temp_xrpath "*) ;;
+		   *) xrpath="$xrpath $temp_xrpath";;
+		   esac;;
+	      *) temp_deplibs="$temp_deplibs $libdir";;
+	      esac
+	    done
+	    dependency_libs="$temp_deplibs"
+	  fi
+
+	  newlib_search_path="$newlib_search_path $absdir"
+	  # Link against this library
+	  test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+	  # ... and its dependency_libs
+	  tmp_libs=
+	  for deplib in $dependency_libs; do
+	    newdependency_libs="$deplib $newdependency_libs"
+	    if test "X$duplicate_deps" = "Xyes" ; then
+	      case "$tmp_libs " in
+	      *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+	      esac
+	    fi
+	    tmp_libs="$tmp_libs $deplib"
+	  done
+
+	  if test $link_all_deplibs != no; then
+	    # Add the search paths of all dependency libraries
+	    for deplib in $dependency_libs; do
+	      case $deplib in
+	      -L*) path="$deplib" ;;
+	      *.la)
+		dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'`
+		test "X$dir" = "X$deplib" && dir="."
+		# We need an absolute path.
+		case $dir in
+		[\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+		*)
+		  absdir=`cd "$dir" && pwd`
+		  if test -z "$absdir"; then
+		    $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2
+		    absdir="$dir"
+		  fi
+		  ;;
+		esac
+		if grep "^installed=no" $deplib > /dev/null; then
+		  path="-L$absdir/$objdir"
+		else
+		  eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+		  if test -z "$libdir"; then
+		    $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+		    exit 1
+		  fi
+		  if test "$absdir" != "$libdir"; then
+		    $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2
+		  fi
+		  path="-L$absdir"
+		fi
+		;;
+	      *) continue ;;
+	      esac
+	      case " $deplibs " in
+	      *" $path "*) ;;
+	      *) deplibs="$deplibs $path" ;;
+	      esac
+	    done
+	  fi # link_all_deplibs != no
+	fi # linkmode = lib
+      done # for deplib in $libs
+      if test $pass = dlpreopen; then
+	# Link the dlpreopened libraries before other libraries
+	for deplib in $save_deplibs; do
+	  deplibs="$deplib $deplibs"
+	done
+      fi
+      if test $pass != dlopen; then
+	test $pass != scan && dependency_libs="$newdependency_libs"
+	if test $pass != conv; then
+	  # Make sure lib_search_path contains only unique directories.
+	  lib_search_path=
+	  for dir in $newlib_search_path; do
+	    case "$lib_search_path " in
+	    *" $dir "*) ;;
+	    *) lib_search_path="$lib_search_path $dir" ;;
+	    esac
+	  done
+	  newlib_search_path=
+	fi
+
+	if test "$linkmode,$pass" != "prog,link"; then
+	  vars="deplibs"
+	else
+	  vars="compile_deplibs finalize_deplibs"
+	fi
+	for var in $vars dependency_libs; do
+	  # Reverse the list held in $var, dropping repeated entries along the way
+	  eval tmp_libs=\"\$$var\" # read the value of the variable whose name is in $var
+	  new_libs=
+	  for deplib in $tmp_libs; do
+	    case $deplib in
+	    -L*) new_libs="$deplib $new_libs" ;; # -L entries are all kept here; thinned out in the second loop
+	    *)
+	      case " $specialdeplibs " in
+	      *" $deplib "*) new_libs="$deplib $new_libs" ;; # listed in specialdeplibs: keep every occurrence
+	      *)
+		case " $new_libs " in
+		*" $deplib "*) ;; # already collected: drop this repeat
+		*) new_libs="$deplib $new_libs" ;;
+		esac
+		;;
+	      esac
+	      ;;
+	    esac
+	  done
+	  tmp_libs=
+	  for deplib in $new_libs; do # new_libs is reversed; appending here yields the reverse of the original order
+	    case $deplib in
+	    -L*)
+	      case " $tmp_libs " in
+	      *" $deplib "*) ;; # repeated -L entry: drop
+	      *) tmp_libs="$tmp_libs $deplib" ;;
+	      esac
+	      ;;
+	    *) tmp_libs="$tmp_libs $deplib" ;;
+	    esac
+	  done
+	  eval $var=\"$tmp_libs\" # store the result back into the variable named by $var
+	done # for var
+      fi
+      if test "$pass" = "conv" &&
+       { test "$linkmode" = "lib" || test "$linkmode" = "prog"; }; then
+	libs="$deplibs" # reset libs
+	deplibs=
+      fi
+    done # for pass
+    if test $linkmode = prog; then
+      dlfiles="$newdlfiles"
+      dlprefiles="$newdlprefiles"
+    fi
+
+    case $linkmode in
+    oldlib)
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+	$echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$rpath"; then
+	$echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$xrpath"; then
+	$echo "$modename: warning: \`-R' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$vinfo"; then
+	$echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$release"; then
+	$echo "$modename: warning: \`-release' is ignored for archives" 1>&2
+      fi
+
+      if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+	$echo "$modename: warning: \`-export-symbols' is ignored for archives" 1>&2
+      fi
+
       # Now set the variables for building old libraries.
       build_libtool_libs=no
-      build_old_libs=yes
-      oldlib="$output"
-      $show "$rm $oldlib"
-      $run $rm $oldlib
+      oldlibs="$output"
+      objs="$objs$old_deplibs"
       ;;
 
-    *.la)
+    lib)
       # Make sure we only generate libraries of the form `libNAME.la'.
-      case "$output" in
-      lib*) ;;
+      case $outputname in
+      lib*)
+	name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+	eval libname=\"$libname_spec\"
+	;;
       *)
-	$echo "$modename: libtool library \`$arg' must begin with \`lib'" 1>&2
-	$echo "$help" 1>&2
-	exit 1
+	if test "$module" = no; then
+	  $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	fi
+	if test "$need_lib_prefix" != no; then
+	  # Add the "lib" prefix for modules if required
+	  name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+	  eval libname=\"$libname_spec\"
+	else
+	  libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+	fi
 	;;
       esac
 
-      name=`$echo "X$output" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
-      eval libname=\"$libname_spec\"
-
-      # All the library-specific variables (install_libdir is set above).
-      library_names=
-      old_library=
-      dlname=
-      current=0
-      revision=0
-      age=0
-
       if test -n "$objs"; then
-        $echo "$modename: cannot build libtool library \`$output' from non-libtool objects:$objs" 2>&1
-        exit 1
+	if test "$deplibs_check_method" != pass_all; then
+	  $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 1>&2
+	  exit 1
+	else
+	  echo
+	  echo "*** Warning: Linking the shared library $output against the non-libtool"
+	  echo "*** objects $objs is not portable!"
+	  libobjs="$libobjs $objs" # accept the plain objects alongside the libtool objects
+	fi
       fi
 
-      # How the heck are we supposed to write a wrapper for a shared library?
-      if test -n "$link_against_libtool_libs"; then
-        $echo "$modename: libtool library \`$output' may not depend on uninstalled libraries:$link_against_libtool_libs" 1>&2
-        exit 1
-      fi
-
-      if test -n "$dlfiles$dlprefiles"; then
-        $echo "$modename: warning: \`-dlopen' is ignored while creating libtool libraries" 1>&2
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
-      fi
-
-      if test -z "$rpath"; then
-        $echo "$modename: you must specify an installation directory with \`-rpath'" 1>&2
-	$echo "$help" 1>&2
-        exit 1
+      if test "$dlself" != no; then
+	$echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2
       fi
 
       set dummy $rpath
@@ -883,641 +2203,1605 @@
       fi
       install_libdir="$2"
 
-      # Parse the version information argument.
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=':'
-      set dummy $vinfo
-      IFS="$save_ifs"
+      oldlibs=
+      if test -z "$rpath"; then
+	if test "$build_libtool_libs" = yes; then
+	  # Building a libtool convenience library.
+	  libext=al
+	  oldlibs="$output_objdir/$libname.$libext $oldlibs"
+	  build_libtool_libs=convenience
+	  build_old_libs=yes
+	fi
 
-      if test -n "$5"; then
-        $echo "$modename: too many parameters to \`-version-info'" 1>&2
-        $echo "$help" 1>&2
-        exit 1
-      fi
+	if test -n "$vinfo"; then
+	  $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2
+	fi
 
-      test -n "$2" && current="$2"
-      test -n "$3" && revision="$3"
-      test -n "$4" && age="$4"
-
-      # Check that each of the things are valid numbers.
-      case "$current" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      case "$revision" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      case "$age" in
-      0 | [1-9] | [1-9][0-9]*) ;;
-      *)
-        $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-        ;;
-      esac
-
-      if test $age -gt $current; then
-        $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
-        $echo "$modename: \`$vinfo' is not valid version information" 1>&2
-        exit 1
-      fi
-
-      # Calculate the version variables.
-      version_vars="version_type current age revision"
-      case "$version_type" in
-      none) ;;
-
-      linux)
-        version_vars="$version_vars major versuffix"
-        major=`expr $current - $age`
-        versuffix="$major.$age.$revision"
-        ;;
-
-      osf)
-        version_vars="$version_vars versuffix verstring"
-        major=`expr $current - $age`
-        versuffix="$current.$age.$revision"
-        verstring="$versuffix"
-
-        # Add in all the interfaces that we are compatible with.
-        loop=$age
-        while test $loop != 0; do
-          iface=`expr $current - $loop`
-          loop=`expr $loop - 1`
-          verstring="$verstring:${iface}.0"
-        done
-
-        # Make executables depend on our current version.
-        verstring="$verstring:${current}.0"
-        ;;
-
-      sunos)
-        version_vars="$version_vars major versuffix"
-        major="$current"
-        versuffix="$current.$revision"
-        ;;
-
-      *)
-        $echo "$modename: unknown library version type \`$version_type'" 1>&2
-        echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
-        exit 1
-        ;;
-      esac
-
-      # Create the output directory, or remove our outputs if we need to.
-      if test -d $objdir; then
-        $show "$rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*"
-        $run $rm $objdir/$output $objdir/$libname.* $objdir/${libname}${release}.*
+	if test -n "$release"; then
+	  $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2
+	fi
       else
-        $show "$mkdir $objdir"
-        $run $mkdir $objdir
-	status=$?
-	if test $status -eq 0 || test -d $objdir; then :
+
+	# Parse the version information argument.
+	save_ifs="$IFS"; IFS=':'
+	set dummy $vinfo 0 0 0
+	IFS="$save_ifs"
+
+	if test -n "$8"; then
+	  $echo "$modename: too many parameters to \`-version-info'" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	fi
+
+	current="$2"
+	revision="$3"
+	age="$4"
+
+	# Check that each of the things are valid numbers.
+	case $current in
+	0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+	*)
+	  $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
+	  $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+	  exit 1
+	  ;;
+	esac
+
+	case $revision in
+	0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+	*)
+	  $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
+	  $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+	  exit 1
+	  ;;
+	esac
+
+	case $age in
+	0 | [1-9] | [1-9][0-9] | [1-9][0-9][0-9]) ;;
+	*)
+	  $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
+	  $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+	  exit 1
+	  ;;
+	esac
+
+	if test $age -gt $current; then
+	  $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
+	  $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+	  exit 1
+	fi
+
+	# Calculate the version variables.
+	major=
+	versuffix=
+	verstring=
+	case $version_type in
+	none) ;;
+
+	darwin)
+	  # Like Linux, but with the current version available in
+	  # verstring for coding it into the library header
+	  major=.`expr $current - $age`
+	  versuffix="$major.$age.$revision"
+	  # Darwin ld doesn't like 0 for these options...
+	  minor_current=`expr $current + 1`
+	  verstring="-compatibility_version $minor_current -current_version $minor_current.$revision"
+	  ;;
+
+	freebsd-aout)
+	  major=".$current"
+	  versuffix=".$current.$revision";
+	  ;;
+
+	freebsd-elf)
+	  major=".$current"
+	  versuffix=".$current";
+	  ;;
+
+	irix | nonstopux)
+	  major=`expr $current - $age + 1`
+
+	  case $version_type in
+	    nonstopux) verstring_prefix=nonstopux ;;
+	    *)         verstring_prefix=sgi ;;
+	  esac
+	  verstring="$verstring_prefix$major.$revision"
+
+	  # Add in all the interfaces that we are compatible with.
+	  loop=$revision
+	  while test $loop != 0; do
+	    iface=`expr $revision - $loop`
+	    loop=`expr $loop - 1`
+	    verstring="$verstring_prefix$major.$iface:$verstring"
+	  done
+
+	  # Before this point, $major must not contain `.'.
+	  major=.$major
+	  versuffix="$major.$revision"
+	  ;;
+
+	linux)
+	  major=.`expr $current - $age`
+	  versuffix="$major.$age.$revision"
+	  ;;
+
+	osf)
+	  major=.`expr $current - $age`
+	  versuffix=".$current.$age.$revision"
+	  verstring="$current.$age.$revision"
+
+	  # Add in all the interfaces that we are compatible with.
+	  loop=$age
+	  while test $loop != 0; do
+	    iface=`expr $current - $loop`
+	    loop=`expr $loop - 1`
+	    verstring="$verstring:${iface}.0"
+	  done
+
+	  # Make executables depend on our current version.
+	  verstring="$verstring:${current}.0"
+	  ;;
+
+	sunos)
+	  major=".$current"
+	  versuffix=".$current.$revision"
+	  ;;
+
+	windows)
+	  # Use '-' rather than '.', since we only want one
+	  # extension on DOS 8.3 filesystems.
+	  major=`expr $current - $age`
+	  versuffix="-$major"
+	  ;;
+
+	*)
+	  $echo "$modename: unknown library version type \`$version_type'" 1>&2
+	  echo "Fatal configuration error.  See the $PACKAGE docs for more information." 1>&2
+	  exit 1
+	  ;;
+	esac
+
+	# Clear the version info if we defaulted, and they specified a release.
+	if test -z "$vinfo" && test -n "$release"; then
+	  major=
+	  verstring="0.0"
+	  case $version_type in
+	  darwin)
+	    # we can't check for "0.0" in archive_cmds due to quoting
+	    # problems, so we reset it completely
+	    verstring=""
+	    ;;
+	  *)
+	    verstring="0.0"
+	    ;;
+	  esac
+	  if test "$need_version" = no; then
+	    versuffix=
+	  else
+	    versuffix=".0.0"
+	  fi
+	fi
+
+	# Remove version info from name if versioning should be avoided
+	if test "$avoid_version" = yes && test "$need_version" = no; then
+	  major=
+	  versuffix=
+	  verstring=""
+	fi
+
+	# Check to see if the archive will have undefined symbols.
+	if test "$allow_undefined" = yes; then
+	  if test "$allow_undefined_flag" = unsupported; then
+	    $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
+	    build_libtool_libs=no
+	    build_old_libs=yes
+	  fi
 	else
-	  exit $status
+	  # Don't allow undefined symbols.
+	  allow_undefined_flag="$no_undefined_flag"
 	fi
       fi
 
-      # Check to see if the archive will have undefined symbols.
-      if test "$allow_undefined" = yes; then
-        if test "$allow_undefined_flag" = unsupported; then
-          $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
-          build_libtool_libs=no
-	  build_old_libs=yes
-        fi
-      else
-        # Don't allow undefined symbols.
-        allow_undefined_flag="$no_undefined_flag"
+      if test "$mode" != relink; then
+	# Remove our outputs.
+	$show "${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*"
+	$run ${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*
       fi
 
-      # Add libc to deplibs on all systems.
-      dependency_libs="$deplibs"
-      deplibs="$deplibs -lc"
+      # Now set the variables for building old libraries.
+      if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+	oldlibs="$oldlibs $output_objdir/$libname.$libext"
+
+	# Transform .lo files to .o files.
+	oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP`
+      fi
+
+      # Eliminate all temporary directories.
+      for path in $notinst_path; do
+	lib_search_path=`echo "$lib_search_path " | ${SED} -e 's% $path % %g'`
+	deplibs=`echo "$deplibs " | ${SED} -e 's% -L$path % %g'`
+	dependency_libs=`echo "$dependency_libs " | ${SED} -e 's% -L$path % %g'`
+      done
+
+      if test -n "$xrpath"; then
+	# If the user specified any rpath flags, then add them.
+	temp_xrpath=
+	for libdir in $xrpath; do
+	  temp_xrpath="$temp_xrpath -R$libdir"
+	  case "$finalize_rpath " in
+	  *" $libdir "*) ;;
+	  *) finalize_rpath="$finalize_rpath $libdir" ;;
+	  esac
+	done
+	if test $hardcode_into_libs != yes || test $build_old_libs = yes; then
+	  dependency_libs="$temp_xrpath $dependency_libs"
+	fi
+      fi
+
+      # Make sure dlfiles contains only unique files that won't be dlpreopened
+      old_dlfiles="$dlfiles"
+      dlfiles=
+      for lib in $old_dlfiles; do
+	case " $dlprefiles $dlfiles " in
+	*" $lib "*) ;;
+	*) dlfiles="$dlfiles $lib" ;;
+	esac
+      done
+
+      # Make sure dlprefiles contains only unique files
+      old_dlprefiles="$dlprefiles"
+      dlprefiles=
+      for lib in $old_dlprefiles; do
+	case "$dlprefiles " in
+	*" $lib "*) ;;
+	*) dlprefiles="$dlprefiles $lib" ;;
+	esac
+      done
 
       if test "$build_libtool_libs" = yes; then
-        # Get the real and link names of the library.
-        eval library_names=\"$library_names_spec\"
-        set dummy $library_names
-        realname="$2"
-        shift; shift
+	if test -n "$rpath"; then
+	  case $host in
+	  *-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2* | *-*-beos*)
+	    # these systems don't actually have a c library (as such)!
+	    ;;
+	  *-*-rhapsody* | *-*-darwin1.[012])
+	    # Rhapsody C library is in the System framework
+	    deplibs="$deplibs -framework System"
+	    ;;
+	  *-*-netbsd*)
+	    # Don't link with libc until the a.out ld.so is fixed.
+	    ;;
+	  *-*-openbsd* | *-*-freebsd*)
+	    # Do not include libc due to us having libc/libc_r.
+	    ;;
+	  *)
+	    # Add libc to deplibs on all other systems if necessary.
+	    if test $build_libtool_need_lc = "yes"; then
+	      deplibs="$deplibs -lc"
+	    fi
+	    ;;
+	  esac
+	fi
 
-        if test -n "$soname_spec"; then
-          eval soname=\"$soname_spec\"
-        else
-          soname="$realname"
-        fi
+	# Transform deplibs into only deplibs that can be linked in shared.
+	name_save=$name
+	libname_save=$libname
+	release_save=$release
+	versuffix_save=$versuffix
+	major_save=$major
+	# I'm not sure if I'm treating the release correctly.  I think
+	# release should show up in the -l (ie -lgmp5) so we don't want to
+	# add it in twice.  Is that correct?
+	release=""
+	versuffix=""
+	major=""
+	newdeplibs=
+	droppeddeps=no
+	case $deplibs_check_method in
+	pass_all)
+	  # Don't check for shared/static.  Everything works.
+	  # This might be a little naive.  We might want to check
+	  # whether the library exists or not.  But this is on
+	  # osf3 & osf4 and I'm not really sure... Just
+	  # implementing what was already the behaviour.
+	  newdeplibs=$deplibs
+	  ;;
+	test_compile)
+	  # This code stresses the "libraries are programs" paradigm to its
+	  # limits. Maybe even breaks it.  We compile a program, linking it
+	  # against the deplibs as a proxy for the library.  Then we can check
+	  # whether they linked in statically or dynamically with ldd.
+	  $rm conftest.c
+	  cat > conftest.c <<EOF
+	  int main() { return 0; }
+EOF
+	  $rm conftest
+	  $CC -o conftest conftest.c $deplibs
+	  if test $? -eq 0 ; then
+	    ldd_output=`ldd conftest`
+	    for i in $deplibs; do
+	      name="`expr $i : '-l\(.*\)'`"
+	      # If $name is empty we are operating on a -L argument.
+	      if test -n "$name" && test "$name" != "0"; then
+		libname=`eval \\$echo \"$libname_spec\"`
+		deplib_matches=`eval \\$echo \"$library_names_spec\"`
+		set dummy $deplib_matches
+		deplib_match=$2
+		if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		  newdeplibs="$newdeplibs $i"
+		else
+		  droppeddeps=yes
+		  echo
+		  echo "*** Warning: dynamic linker does not accept needed library $i."
+		  echo "*** I have the capability to make that library automatically link in when"
+		  echo "*** you link to this library.  But I can only do this if you have a"
+		  echo "*** shared version of the library, which I believe you do not have"
+		  echo "*** because a test_compile did reveal that the linker did not use it for"
+		  echo "*** its dynamic dependency list that programs get resolved with at runtime."
+		fi
+	      else
+		newdeplibs="$newdeplibs $i"
+	      fi
+	    done
+	  else
+	    # Error occurred in the first compile.  Let's try to salvage
+	    # the situation: Compile a separate program for each library.
+	    for i in $deplibs; do
+	      name="`expr $i : '-l\(.*\)'`"
+	     # If $name is empty we are operating on a -L argument.
+	      if test -n "$name" && test "$name" != "0"; then
+		$rm conftest
+		$CC -o conftest conftest.c $i
+		# Did it work?
+		if test $? -eq 0 ; then
+		  ldd_output=`ldd conftest`
+		  libname=`eval \\$echo \"$libname_spec\"`
+		  deplib_matches=`eval \\$echo \"$library_names_spec\"`
+		  set dummy $deplib_matches
+		  deplib_match=$2
+		  if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+		    newdeplibs="$newdeplibs $i"
+		  else
+		    droppeddeps=yes
+		    echo
+		    echo "*** Warning: dynamic linker does not accept needed library $i."
+		    echo "*** I have the capability to make that library automatically link in when"
+		    echo "*** you link to this library.  But I can only do this if you have a"
+		    echo "*** shared version of the library, which you do not appear to have"
+		    echo "*** because a test_compile did reveal that the linker did not use this one"
+		    echo "*** as a dynamic dependency that programs can get resolved with at runtime."
+		  fi
+		else
+		  droppeddeps=yes
+		  echo
+		  echo "*** Warning!  Library $i is needed by this library but I was not able to"
+		  echo "***  make it link in!  You will probably need to install it or some"
+		  echo "*** library that it depends on before this library will be fully"
+		  echo "*** functional.  Installing it before continuing would be even better."
+		fi
+	      else
+		newdeplibs="$newdeplibs $i"
+	      fi
+	    done
+	  fi
+	  ;;
+	file_magic*)
+	  set dummy $deplibs_check_method
+	  file_magic_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+	  for a_deplib in $deplibs; do
+	    name="`expr $a_deplib : '-l\(.*\)'`"
+	    # If $name is empty we are operating on a -L argument.
+	    if test -n "$name" && test "$name" != "0"; then
+	      libname=`eval \\$echo \"$libname_spec\"`
+	      for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+		    potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+		    for potent_lib in $potential_libs; do
+		      # Follow soft links.
+		      if ls -lLd "$potent_lib" 2>/dev/null \
+			 | grep " -> " >/dev/null; then
+			continue
+		      fi
+		      # The statement above tries to avoid entering an
+		      # endless loop below, in case of cyclic links.
+		      # We might still enter an endless loop, since a link
+		      # loop can be closed while we follow links,
+		      # but so what?
+		      potlib="$potent_lib"
+		      while test -h "$potlib" 2>/dev/null; do
+			potliblink=`ls -ld $potlib | ${SED} 's/.* -> //'`
+			case $potliblink in
+			[\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+			*) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";;
+			esac
+		      done
+		      if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \
+			 | ${SED} 10q \
+			 | egrep "$file_magic_regex" > /dev/null; then
+			newdeplibs="$newdeplibs $a_deplib"
+			a_deplib=""
+			break 2
+		      fi
+		    done
+	      done
+	      if test -n "$a_deplib" ; then
+		droppeddeps=yes
+		echo
+		echo "*** Warning: linker path does not have real file for library $a_deplib."
+		echo "*** I have the capability to make that library automatically link in when"
+		echo "*** you link to this library.  But I can only do this if you have a"
+		echo "*** shared version of the library, which you do not appear to have"
+		echo "*** because I did check the linker path looking for a file starting"
+		if test -z "$potlib" ; then
+		  echo "*** with $libname but no candidates were found. (...for file magic test)"
+		else
+		  echo "*** with $libname and none of the candidates passed a file format test"
+		  echo "*** using a file magic. Last file checked: $potlib"
+		fi
+	      fi
+	    else
+	      # Add a -L argument.
+	      newdeplibs="$newdeplibs $a_deplib"
+	    fi
+	  done # Gone through all deplibs.
+	  ;;
+	match_pattern*)
+	  set dummy $deplibs_check_method
+	  match_pattern_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+	  for a_deplib in $deplibs; do
+	    name="`expr $a_deplib : '-l\(.*\)'`"
+	    # If $name is empty we are operating on a -L argument.
+	    if test -n "$name" && test "$name" != "0"; then
+	      libname=`eval \\$echo \"$libname_spec\"`
+	      for i in $lib_search_path $sys_lib_search_path $shlib_search_path; do
+		potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+		for potent_lib in $potential_libs; do
+		  potlib="$potent_lib" # see symlink-check above in file_magic test
+		  if eval echo \"$potent_lib\" 2>/dev/null \
+		      | ${SED} 10q \
+		      | egrep "$match_pattern_regex" > /dev/null; then
+		    newdeplibs="$newdeplibs $a_deplib"
+		    a_deplib=""
+		    break 2
+		  fi
+		done
+	      done
+	      if test -n "$a_deplib" ; then
+		droppeddeps=yes
+		echo
+		echo "*** Warning: linker path does not have real file for library $a_deplib."
+		echo "*** I have the capability to make that library automatically link in when"
+		echo "*** you link to this library.  But I can only do this if you have a"
+		echo "*** shared version of the library, which you do not appear to have"
+		echo "*** because I did check the linker path looking for a file starting"
+		if test -z "$potlib" ; then
+		  echo "*** with $libname but no candidates were found. (...for regex pattern test)"
+		else
+		  echo "*** with $libname and none of the candidates passed a file format test"
+		  echo "*** using a regex pattern. Last file checked: $potlib"
+		fi
+	      fi
+	    else
+	      # Add a -L argument.
+	      newdeplibs="$newdeplibs $a_deplib"
+	    fi
+	  done # Gone through all deplibs.
+	  ;;
+	none | unknown | *)
+	  newdeplibs=""
+	  if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \
+	       -e 's/ -[LR][^ ]*//g' -e 's/[ 	]//g' |
+	     grep . >/dev/null; then
+	    echo
+	    if test "X$deplibs_check_method" = "Xnone"; then
+	      echo "*** Warning: inter-library dependencies are not supported in this platform."
+	    else
+	      echo "*** Warning: inter-library dependencies are not known to be supported."
+	    fi
+	    echo "*** All declared inter-library dependencies are being dropped."
+	    droppeddeps=yes
+	  fi
+	  ;;
+	esac
+	versuffix=$versuffix_save
+	major=$major_save
+	release=$release_save
+	libname=$libname_save
+	name=$name_save
 
-        lib="$objdir/$realname"
+	case $host in
+	*-*-rhapsody* | *-*-darwin1.[012])
+	  # On Rhapsody replace the C library with the System framework
+	  newdeplibs=`$echo "X $newdeplibs" | $Xsed -e 's/ -lc / -framework System /'`
+	  ;;
+	esac
+
+	if test "$droppeddeps" = yes; then
+	  if test "$module" = yes; then
+	    echo
+	    echo "*** Warning: libtool could not satisfy all declared inter-library"
+	    echo "*** dependencies of module $libname.  Therefore, libtool will create"
+	    echo "*** a static module, that should work as long as the dlopening"
+	    echo "*** application is linked with the -dlopen flag."
+	    if test -z "$global_symbol_pipe"; then
+	      echo
+	      echo "*** However, this would only work if libtool was able to extract symbol"
+	      echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+	      echo "*** not find such a program.  So, this module is probably useless."
+	      echo "*** \`nm' from GNU binutils and a full rebuild may help."
+	    fi
+	    if test "$build_old_libs" = no; then
+	      oldlibs="$output_objdir/$libname.$libext"
+	      build_libtool_libs=module
+	      build_old_libs=yes
+	    else
+	      build_libtool_libs=no
+	    fi
+	  else
+	    echo "*** The inter-library dependencies that have been dropped here will be"
+	    echo "*** automatically added whenever a program is linked with this library"
+	    echo "*** or is declared to -dlopen it."
+
+	    if test $allow_undefined = no; then
+	      echo
+	      echo "*** Since this library must not contain undefined symbols,"
+	      echo "*** because either the platform does not support them or"
+	      echo "*** it was explicitly requested with -no-undefined,"
+	      echo "*** libtool will only create a static version of it."
+	      if test "$build_old_libs" = no; then
+		oldlibs="$output_objdir/$libname.$libext"
+		build_libtool_libs=module
+		build_old_libs=yes
+	      else
+		build_libtool_libs=no
+	      fi
+	    fi
+	  fi
+	fi
+	# Done checking deplibs!
+	deplibs=$newdeplibs
+      fi
+
+      # All the library-specific variables (install_libdir is set above).
+      library_names=
+      old_library=
+      dlname=
+
+      # Test again, we may have decided not to build it any more
+      if test "$build_libtool_libs" = yes; then
+	if test $hardcode_into_libs = yes; then
+	  # Hardcode the library paths
+	  hardcode_libdirs=
+	  dep_rpath=
+	  rpath="$finalize_rpath"
+	  test "$mode" != relink && rpath="$compile_rpath$rpath"
+	  for libdir in $rpath; do
+	    if test -n "$hardcode_libdir_flag_spec"; then
+	      if test -n "$hardcode_libdir_separator"; then
+		if test -z "$hardcode_libdirs"; then
+		  hardcode_libdirs="$libdir"
+		else
+		  # Just accumulate the unique libdirs.
+		  case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+		  *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		    ;;
+		  *)
+		    hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+		    ;;
+		  esac
+		fi
+	      else
+		eval flag=\"$hardcode_libdir_flag_spec\"
+		dep_rpath="$dep_rpath $flag"
+	      fi
+	    elif test -n "$runpath_var"; then
+	      case "$perm_rpath " in
+	      *" $libdir "*) ;;
+	      *) perm_rpath="$perm_rpath $libdir" ;;
+	      esac
+	    fi
+	  done
+	  # Substitute the hardcoded libdirs into the rpath.
+	  if test -n "$hardcode_libdir_separator" &&
+	     test -n "$hardcode_libdirs"; then
+	    libdir="$hardcode_libdirs"
+	    eval dep_rpath=\"$hardcode_libdir_flag_spec\"
+	  fi
+	  if test -n "$runpath_var" && test -n "$perm_rpath"; then
+	    # We should set the runpath_var.
+	    rpath=
+	    for dir in $perm_rpath; do
+	      rpath="$rpath$dir:"
+	    done
+	    eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+	  fi
+	  test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+	fi
+
+	shlibpath="$finalize_shlibpath"
+	test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+	if test -n "$shlibpath"; then
+	  eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+	fi
+
+	# Get the real and link names of the library.
+	eval library_names=\"$library_names_spec\"
+	set dummy $library_names
+	realname="$2"
+	shift; shift
+
+	if test -n "$soname_spec"; then
+	  eval soname=\"$soname_spec\"
+	else
+	  soname="$realname"
+	fi
+	test -z "$dlname" && dlname=$soname
+
+	lib="$output_objdir/$realname"
 	for link
 	do
 	  linknames="$linknames $link"
 	done
 
-        # Use standard objects if they are PIC.
-        test -z "$pic_flag" && libobjs=`$echo "X$libobjs " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//g'`
+	# Ensure that we have .o objects for linkers which dislike .lo
+	# (e.g. aix) in case we are running --disable-static
+	for obj in $libobjs; do
+	  xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+	  if test "X$xdir" = "X$obj"; then
+	    xdir="."
+	  else
+	    xdir="$xdir"
+	  fi
+	  baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+	  oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+	  if test ! -f $xdir/$oldobj; then
+	    $show "(cd $xdir && ${LN_S} $baseobj $oldobj)"
+	    $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $?
+	  fi
+	done
 
-        # Do each of the archive commands.
-        eval cmds=\"$archive_cmds\"
-        IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-        for cmd in $cmds; do
-          IFS="$save_ifs"
-          $show "$cmd"
-          $run eval "$cmd" || exit $?
-        done
-        IFS="$save_ifs"
+	# Use standard objects if they are pic
+	test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
 
-        # Create links to the real library.
-        for linkname in $linknames; do
-          $show "(cd $objdir && $LN_S $realname $linkname)"
-          $run eval '(cd $objdir && $LN_S $realname $linkname)' || exit $?
-        done
+	# Prepare the list of exported symbols
+	if test -z "$export_symbols"; then
+	  if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+	    $show "generating symbol list for \`$libname.la'"
+	    export_symbols="$output_objdir/$libname.exp"
+	    $run $rm $export_symbols
+	    eval cmds=\"$export_symbols_cmds\"
+	    save_ifs="$IFS"; IFS='~'
+	    for cmd in $cmds; do
+	      IFS="$save_ifs"
+	      $show "$cmd"
+	      $run eval "$cmd" || exit $?
+	    done
+	    IFS="$save_ifs"
+	    if test -n "$export_symbols_regex"; then
+	      $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\""
+	      $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+	      $show "$mv \"${export_symbols}T\" \"$export_symbols\""
+	      $run eval '$mv "${export_symbols}T" "$export_symbols"'
+	    fi
+	  fi
+	fi
 
-        # If -export-dynamic was specified, set the dlname.
-        if test "$export_dynamic" = yes; then
-          # On all known operating systems, these are identical.
-          dlname="$soname"
-        fi
+	if test -n "$export_symbols" && test -n "$include_expsyms"; then
+	  $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"'
+	fi
+
+	if test -n "$convenience"; then
+	  if test -n "$whole_archive_flag_spec"; then
+	    eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+	  else
+	    gentop="$output_objdir/${outputname}x"
+	    $show "${rm}r $gentop"
+	    $run ${rm}r "$gentop"
+	    $show "mkdir $gentop"
+	    $run mkdir "$gentop"
+	    status=$?
+	    if test $status -ne 0 && test ! -d "$gentop"; then
+	      exit $status
+	    fi
+	    generated="$generated $gentop"
+
+	    for xlib in $convenience; do
+	      # Extract the objects.
+	      case $xlib in
+	      [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+	      *) xabs=`pwd`"/$xlib" ;;
+	      esac
+	      xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+	      xdir="$gentop/$xlib"
+
+	      $show "${rm}r $xdir"
+	      $run ${rm}r "$xdir"
+	      $show "mkdir $xdir"
+	      $run mkdir "$xdir"
+	      status=$?
+	      if test $status -ne 0 && test ! -d "$xdir"; then
+		exit $status
+	      fi
+	      $show "(cd $xdir && $AR x $xabs)"
+	      $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+	      libobjs="$libobjs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+	    done
+	  fi
+	fi
+
+	if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+	  eval flag=\"$thread_safe_flag_spec\"
+	  linker_flags="$linker_flags $flag"
+	fi
+
+	# Make a backup of the uninstalled library when relinking
+	if test "$mode" = relink; then
+	  $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $?
+	fi
+
+	# Do each of the archive commands.
+	if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+	  eval cmds=\"$archive_expsym_cmds\"
+	else
+	  save_deplibs="$deplibs"
+	  for conv in $convenience; do
+	    tmp_deplibs=
+	    for test_deplib in $deplibs; do
+	      if test "$test_deplib" != "$conv"; then
+		tmp_deplibs="$tmp_deplibs $test_deplib"
+	      fi
+	    done
+	    deplibs="$tmp_deplibs"
+	  done
+	  eval cmds=\"$archive_cmds\"
+	  deplibs="$save_deplibs"
+	fi
+	save_ifs="$IFS"; IFS='~'
+	for cmd in $cmds; do
+	  IFS="$save_ifs"
+	  $show "$cmd"
+	  $run eval "$cmd" || exit $?
+	done
+	IFS="$save_ifs"
+
+	# Restore the uninstalled library and exit
+	if test "$mode" = relink; then
+	  $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $?
+	  exit 0
+	fi
+
+	# Create links to the real library.
+	for linkname in $linknames; do
+	  if test "$realname" != "$linkname"; then
+	    $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)"
+	    $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $?
+	  fi
+	done
+
+	# If -module or -export-dynamic was specified, set the dlname.
+	if test "$module" = yes || test "$export_dynamic" = yes; then
+	  # On all known operating systems, these are identical.
+	  dlname="$soname"
+	fi
       fi
-
-      # Now set the variables for building old libraries.
-      oldlib="$objdir/$libname.a"
       ;;
 
-    *.lo | *.o)
-      if test -n "$link_against_libtool_libs"; then
-        $echo "$modename: error: cannot link libtool libraries into reloadable objects" 1>&2
-        exit 1
-      fi
-
+    obj)
       if test -n "$deplibs"; then
-        $echo "$modename: warning: \`-l' and \`-L' are ignored while creating objects" 1>&2
+	$echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2
       fi
 
-      if test -n "$dlfiles$dlprefiles"; then
-        $echo "$modename: warning: \`-dlopen' is ignored while creating objects" 1>&2
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+	$echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2
       fi
 
       if test -n "$rpath"; then
-        $echo "$modename: warning: \`-rpath' is ignored while creating objects" 1>&2
+	$echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2
+      fi
+
+      if test -n "$xrpath"; then
+	$echo "$modename: warning: \`-R' is ignored for objects" 1>&2
       fi
 
       if test -n "$vinfo"; then
-        $echo "$modename: warning: \`-version-info' is ignored while creating objects" 1>&2
+	$echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2
       fi
 
       if test -n "$release"; then
-        $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
+	$echo "$modename: warning: \`-release' is ignored for objects" 1>&2
       fi
 
-      case "$output" in
+      case $output in
       *.lo)
-        if test -n "$objs"; then
-          $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
-          exit 1
-        fi
-        libobj="$output"
-        obj=`$echo "X$output" | $Xsed -e 's/\.lo$/.o/'`
-        ;;
+	if test -n "$objs$old_deplibs"; then
+	  $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
+	  exit 1
+	fi
+	libobj="$output"
+	obj=`$echo "X$output" | $Xsed -e "$lo2o"`
+	;;
       *)
-        libobj=
-        obj="$output"
-        ;;
+	libobj=
+	obj="$output"
+	;;
       esac
 
       # Delete the old objects.
       $run $rm $obj $libobj
 
+      # Objects from convenience libraries.  This assumes
+      # single-version convenience libraries.  Whenever we create
+      # different ones for PIC/non-PIC, we'll have to duplicate
+      # the extraction.
+      reload_conv_objs=
+      gentop=
+      # reload_cmds runs $LD directly, so let us get rid of
+      # -Wl from whole_archive_flag_spec
+      wl=
+
+      if test -n "$convenience"; then
+	if test -n "$whole_archive_flag_spec"; then
+	  eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\"
+	else
+	  gentop="$output_objdir/${obj}x"
+	  $show "${rm}r $gentop"
+	  $run ${rm}r "$gentop"
+	  $show "mkdir $gentop"
+	  $run mkdir "$gentop"
+	  status=$?
+	  if test $status -ne 0 && test ! -d "$gentop"; then
+	    exit $status
+	  fi
+	  generated="$generated $gentop"
+
+	  for xlib in $convenience; do
+	    # Extract the objects.
+	    case $xlib in
+	    [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+	    *) xabs=`pwd`"/$xlib" ;;
+	    esac
+	    xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+	    xdir="$gentop/$xlib"
+
+	    $show "${rm}r $xdir"
+	    $run ${rm}r "$xdir"
+	    $show "mkdir $xdir"
+	    $run mkdir "$xdir"
+	    status=$?
+	    if test $status -ne 0 && test ! -d "$xdir"; then
+	      exit $status
+	    fi
+	    $show "(cd $xdir && $AR x $xabs)"
+	    $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+	    reload_conv_objs="$reload_conv_objs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+	  done
+	fi
+      fi
+
       # Create the old-style object.
-      reload_objs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^       ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
+      reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
 
       output="$obj"
       eval cmds=\"$reload_cmds\"
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
+      save_ifs="$IFS"; IFS='~'
       for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
+	IFS="$save_ifs"
+	$show "$cmd"
+	$run eval "$cmd" || exit $?
       done
       IFS="$save_ifs"
 
       # Exit if we aren't doing a library object file.
-      test -z "$libobj" && exit 0
+      if test -z "$libobj"; then
+	if test -n "$gentop"; then
+	  $show "${rm}r $gentop"
+	  $run ${rm}r $gentop
+	fi
 
-      if test "$build_libtool_libs" != yes; then
-        # Create an invalid libtool object if no PIC, so that we don't
-        # accidentally link it into a program.
-        $show "echo timestamp > $libobj"
-        $run eval "echo timestamp > $libobj" || exit $?
-        exit 0
+	exit 0
       fi
 
-      if test -n "$pic_flag"; then
-        # Only do commands if we really have different PIC objects.
-        reload_objs="$libobjs"
-        output="$libobj"
-        eval cmds=\"$reload_cmds\"
-        IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-        for cmd in $cmds; do
-          IFS="$save_ifs"
-          $show "$cmd"
-          $run eval "$cmd" || exit $?
-        done
-        IFS="$save_ifs"
+      if test "$build_libtool_libs" != yes; then
+	if test -n "$gentop"; then
+	  $show "${rm}r $gentop"
+	  $run ${rm}r $gentop
+	fi
+
+	# Create an invalid libtool object if no PIC, so that we don't
+	# accidentally link it into a program.
+	$show "echo timestamp > $libobj"
+	$run eval "echo timestamp > $libobj" || exit $?
+	exit 0
+      fi
+
+      if test -n "$pic_flag" || test "$pic_mode" != default; then
+	# Only do commands if we really have different PIC objects.
+	reload_objs="$libobjs $reload_conv_objs"
+	output="$libobj"
+	eval cmds=\"$reload_cmds\"
+	save_ifs="$IFS"; IFS='~'
+	for cmd in $cmds; do
+	  IFS="$save_ifs"
+	  $show "$cmd"
+	  $run eval "$cmd" || exit $?
+	done
+	IFS="$save_ifs"
       else
-        # Just create a symlink.
-        $show "$LN_S $obj $libobj"
-        $run $LN_S $obj $libobj || exit 1
+	# Just create a symlink.
+	$show $rm $libobj
+	$run $rm $libobj
+	xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+	if test "X$xdir" = "X$libobj"; then
+	  xdir="."
+	else
+	  xdir="$xdir"
+	fi
+	baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+	oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+	$show "(cd $xdir && $LN_S $oldobj $baseobj)"
+	$run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $?
+      fi
+
+      if test -n "$gentop"; then
+	$show "${rm}r $gentop"
+	$run ${rm}r $gentop
       fi
 
       exit 0
       ;;
 
-    *)
+    prog)
+      case $host in
+	*cygwin*) output=`echo $output | ${SED} -e 's,.exe$,,;s,$,.exe,'` ;;
+      esac
       if test -n "$vinfo"; then
-        $echo "$modename: warning: \`-version-info' is ignored while linking programs" 1>&2
+	$echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2
       fi
 
       if test -n "$release"; then
-        $echo "$modename: warning: \`-release' is ignored while creating objects" 1>&2
+	$echo "$modename: warning: \`-release' is ignored for programs" 1>&2
       fi
 
-      if test -n "$rpath"; then
+      if test "$preload" = yes; then
+	if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown &&
+	   test "$dlopen_self_static" = unknown; then
+	  $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support." 1>&2
+	fi
+      fi
+
+      case $host in
+      *-*-rhapsody* | *-*-darwin1.[012])
+	# On Rhapsody, replace the C library with the System framework
+	compile_deplibs=`$echo "X $compile_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+	finalize_deplibs=`$echo "X $finalize_deplibs" | $Xsed -e 's/ -lc / -framework System /'`
+	case $host in
+	*darwin*)
+	  # Don't allow lazy linking, it breaks C++ global constructors
+	  compile_command="$compile_command ${wl}-bind_at_load"
+	  finalize_command="$finalize_command ${wl}-bind_at_load"
+	  ;;
+	esac
+	;;
+      esac
+
+      compile_command="$compile_command $compile_deplibs"
+      finalize_command="$finalize_command $finalize_deplibs"
+
+      if test -n "$rpath$xrpath"; then
 	# If the user specified any rpath flags, then add them.
-	for libdir in $rpath; do
-          if test -n "$hardcode_libdir_flag_spec"; then
-            if test -n "$hardcode_libdir_separator"; then
-              if test -z "$hardcode_libdirs"; then
-                # Put the magic libdir with the hardcode flag.
-                hardcode_libdirs="$libdir"
-                libdir="@HARDCODE_LIBDIRS@"
-              else
-                # Just accumulate the unique libdirs.
-		case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
-		*"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
-		  ;;
-		*)
-		  hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
-		  ;;
-		esac
-                libdir=
-              fi
-            fi
-
-            if test -n "$libdir"; then
-              eval flag=\"$hardcode_libdir_flag_spec\"
-
-              compile_command="$compile_command $flag"
-              finalize_command="$finalize_command $flag"
-            fi
-          elif test -n "$runpath_var"; then
-            case "$perm_rpath " in
-            *" $libdir "*) ;;
-            *) perm_rpath="$perm_rpath $libdir" ;;
-            esac
-          fi
+	for libdir in $rpath $xrpath; do
+	  # This is the magic to use -rpath.
+	  case "$finalize_rpath " in
+	  *" $libdir "*) ;;
+	  *) finalize_rpath="$finalize_rpath $libdir" ;;
+	  esac
 	done
       fi
 
-      # Substitute the hardcoded libdirs into the compile commands.
-      if test -n "$hardcode_libdir_separator"; then
-	compile_command=`$echo "X$compile_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
-	finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@HARDCODE_LIBDIRS@%$hardcode_libdirs%g"`
+      # Now hardcode the library paths
+      rpath=
+      hardcode_libdirs=
+      for libdir in $compile_rpath $finalize_rpath; do
+	if test -n "$hardcode_libdir_flag_spec"; then
+	  if test -n "$hardcode_libdir_separator"; then
+	    if test -z "$hardcode_libdirs"; then
+	      hardcode_libdirs="$libdir"
+	    else
+	      # Just accumulate the unique libdirs.
+	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		;;
+	      *)
+		hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+		;;
+	      esac
+	    fi
+	  else
+	    eval flag=\"$hardcode_libdir_flag_spec\"
+	    rpath="$rpath $flag"
+	  fi
+	elif test -n "$runpath_var"; then
+	  case "$perm_rpath " in
+	  *" $libdir "*) ;;
+	  *) perm_rpath="$perm_rpath $libdir" ;;
+	  esac
+	fi
+	case $host in
+	*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-os2*)
+	  case :$dllsearchpath: in
+	  *":$libdir:"*) ;;
+	  *) dllsearchpath="$dllsearchpath:$libdir";;
+	  esac
+	  ;;
+	esac
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+	 test -n "$hardcode_libdirs"; then
+	libdir="$hardcode_libdirs"
+	eval rpath=\" $hardcode_libdir_flag_spec\"
       fi
+      compile_rpath="$rpath"
+
+      rpath=
+      hardcode_libdirs=
+      for libdir in $finalize_rpath; do
+	if test -n "$hardcode_libdir_flag_spec"; then
+	  if test -n "$hardcode_libdir_separator"; then
+	    if test -z "$hardcode_libdirs"; then
+	      hardcode_libdirs="$libdir"
+	    else
+	      # Just accumulate the unique libdirs.
+	      case $hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator in
+	      *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+		;;
+	      *)
+		hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+		;;
+	      esac
+	    fi
+	  else
+	    eval flag=\"$hardcode_libdir_flag_spec\"
+	    rpath="$rpath $flag"
+	  fi
+	elif test -n "$runpath_var"; then
+	  case "$finalize_perm_rpath " in
+	  *" $libdir "*) ;;
+	  *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;;
+	  esac
+	fi
+      done
+      # Substitute the hardcoded libdirs into the rpath.
+      if test -n "$hardcode_libdir_separator" &&
+	 test -n "$hardcode_libdirs"; then
+	libdir="$hardcode_libdirs"
+	eval rpath=\" $hardcode_libdir_flag_spec\"
+      fi
+      finalize_rpath="$rpath"
 
       if test -n "$libobjs" && test "$build_old_libs" = yes; then
-        # Transform all the library objects into standard objects.
-        compile_command=`$echo "X$compile_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
-        finalize_command=`$echo "X$finalize_command " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
+	# Transform all the library objects into standard objects.
+	compile_command=`$echo "X$compile_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+	finalize_command=`$echo "X$finalize_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
       fi
 
-      if test "$export_dynamic" = yes && test -n "$NM" && test -n "$global_symbol_pipe"; then
-        dlsyms="${output}S.c"
-      else
-        dlsyms=
+      dlsyms=
+      if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+	if test -n "$NM" && test -n "$global_symbol_pipe"; then
+	  dlsyms="${outputname}S.c"
+	else
+	  $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+	fi
       fi
 
       if test -n "$dlsyms"; then
-        # Add our own program objects to the preloaded list.
-        dlprefiles=`$echo "X$objs$dlprefiles " | $Xsed -e 's/\.lo /.o /g' -e 's/ $//'`
+	case $dlsyms in
+	"") ;;
+	*.c)
+	  # Discover the nlist of each of the dlfiles.
+	  nlist="$output_objdir/${outputname}.nm"
 
-	# Discover the nlist of each of the dlfiles.
-        nlist="$objdir/${output}.nm"
+	  $show "$rm $nlist ${nlist}S ${nlist}T"
+	  $run $rm "$nlist" "${nlist}S" "${nlist}T"
 
-	if test -d $objdir; then
-	  $show "$rm $nlist ${nlist}T"
-	  $run $rm "$nlist" "${nlist}T"
-	else
-	  $show "$mkdir $objdir"
-	  $run $mkdir $objdir
-	  status=$?
-	  if test $status -eq 0 || test -d $objdir; then :
-	  else
-	    exit $status
-	  fi
-	fi
+	  # Parse the name list into a source file.
+	  $show "creating $output_objdir/$dlsyms"
 
-        for arg in $dlprefiles; do
-	  $show "extracting global C symbols from \`$arg'"
-	  $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
-        done
-
-        # Parse the name list into a source file.
-        $show "creating $objdir/$dlsyms"
-        if test -z "$run"; then
-	  # Make sure we at least have an empty file.
-	  test -f "$nlist" || : > "$nlist"
-
-	  # Try sorting and uniquifying the output.
-	  if sort "$nlist" | uniq > "$nlist"T; then
-	    mv -f "$nlist"T "$nlist"
-	    wcout=`wc "$nlist" 2>/dev/null`
-	    count=`echo "X$wcout" | $Xsed -e 's/^[ 	]*\([0-9][0-9]*\).*$/\1/'`
-	    (test "$count" -ge 0) 2>/dev/null || count=-1
-	  else
-	    $rm "$nlist"T
-	    count=-1
-	  fi
-
-	  case "$dlsyms" in
-	  "") ;;
-	  *.c)
-	    $echo > "$objdir/$dlsyms" "\
-/* $dlsyms - symbol resolution table for \`$output' dlsym emulation. */
-/* Generated by $PROGRAM - GNU $PACKAGE $VERSION */
+	  test -z "$run" && $echo > "$output_objdir/$dlsyms" "\
+/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */
+/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */
 
 #ifdef __cplusplus
 extern \"C\" {
 #endif
 
 /* Prevent the only kind of declaration conflicts we can make. */
-#define dld_preloaded_symbol_count some_other_symbol
-#define dld_preloaded_symbols some_other_symbol
+#define lt_preloaded_symbols some_other_symbol
 
 /* External symbol declarations for the compiler. */\
 "
 
-	    if test -f "$nlist"; then
-	      sed -e 's/^.* \(.*\)$/extern char \1;/' < "$nlist" >> "$objdir/$dlsyms"
-	    else
-	      echo '/* NONE */' >> "$objdir/$dlsyms"
+	  if test "$dlself" = yes; then
+	    $show "generating symbol list for \`$output'"
+
+	    test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist"
+
+	    # Add our own program objects to the symbol list.
+	    progfiles=`$echo "X$objs$old_deplibs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+	    for arg in $progfiles; do
+	      $show "extracting global C symbols from \`$arg'"
+	      $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+	    done
+
+	    if test -n "$exclude_expsyms"; then
+	      $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+	      $run eval '$mv "$nlist"T "$nlist"'
 	    fi
 
-	    $echo >> "$objdir/$dlsyms" "\
+	    if test -n "$export_symbols_regex"; then
+	      $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+	      $run eval '$mv "$nlist"T "$nlist"'
+	    fi
 
-#undef dld_preloaded_symbol_count
-#undef dld_preloaded_symbols
+	    # Prepare the list of exported symbols
+	    if test -z "$export_symbols"; then
+	      export_symbols="$output_objdir/$output.exp"
+	      $run $rm $export_symbols
+	      $run eval "${SED} -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+	    else
+	      $run eval "${SED} -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"'
+	      $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T'
+	      $run eval 'mv "$nlist"T "$nlist"'
+	    fi
+	  fi
+
+	  for arg in $dlprefiles; do
+	    $show "extracting global C symbols from \`$arg'"
+	    name=`echo "$arg" | ${SED} -e 's%^.*/%%'`
+	    $run eval 'echo ": $name " >> "$nlist"'
+	    $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+	  done
+
+	  if test -z "$run"; then
+	    # Make sure we have at least an empty file.
+	    test -f "$nlist" || : > "$nlist"
+
+	    if test -n "$exclude_expsyms"; then
+	      egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+	      $mv "$nlist"T "$nlist"
+	    fi
+
+	    # Try sorting and uniquifying the output.
+	    if grep -v "^: " < "$nlist" |
+		if sort -k 3 </dev/null >/dev/null 2>&1; then
+		  sort -k 3
+		else
+		  sort +2
+		fi |
+		uniq > "$nlist"S; then
+	      :
+	    else
+	      grep -v "^: " < "$nlist" > "$nlist"S
+	    fi
+
+	    if test -f "$nlist"S; then
+	      eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"'
+	    else
+	      echo '/* NONE */' >> "$output_objdir/$dlsyms"
+	    fi
+
+	    $echo >> "$output_objdir/$dlsyms" "\
+
+#undef lt_preloaded_symbols
 
 #if defined (__STDC__) && __STDC__
-# define __ptr_t void *
+# define lt_ptr void *
 #else
-# define __ptr_t char *
+# define lt_ptr char *
+# define const
 #endif
 
-/* The number of symbols in dld_preloaded_symbols, -1 if unsorted. */
-int dld_preloaded_symbol_count = $count;
-
 /* The mapping between symbol names and symbols. */
-struct {
-  char *name;
-  __ptr_t address;
+const struct {
+  const char *name;
+  lt_ptr address;
 }
-dld_preloaded_symbols[] =
+lt_preloaded_symbols[] =
 {\
 "
 
-	    if test -f "$nlist"; then
-	      sed 's/^\(.*\) \(.*\)$/  {"\1", (__ptr_t) \&\2},/' < "$nlist" >> "$objdir/$dlsyms"
-	    fi
+	    eval "$global_symbol_to_c_name_address" < "$nlist" >> "$output_objdir/$dlsyms"
 
-	    $echo >> "$objdir/$dlsyms" "\
-  {0, (__ptr_t) 0}
+	    $echo >> "$output_objdir/$dlsyms" "\
+  {0, (lt_ptr) 0}
 };
 
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+  return lt_preloaded_symbols;
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif\
 "
-	    ;;
+	  fi
 
-	  *)
-	    $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
-	    exit 1
-	    ;;
+	  pic_flag_for_symtable=
+	  case $host in
+	  # compiling the symbol table file with pic_flag works around
+	  # a FreeBSD bug that causes programs to crash when -lm is
+	  # linked before any other PIC object.  But we must not use
+	  # pic_flag when linking with -static.  The problem exists in
+	  # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+	  *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+	    case "$compile_command " in
+	    *" -static "*) ;;
+	    *) pic_flag_for_symtable=" $pic_flag -DPIC -DFREEBSD_WORKAROUND";;
+	    esac;;
+	  *-*-hpux*)
+	    case "$compile_command " in
+	    *" -static "*) ;;
+	    *) pic_flag_for_symtable=" $pic_flag -DPIC";;
+	    esac
 	  esac
-        fi
 
-        # Now compile the dynamic symbol file.
-        $show "(cd $objdir && $CC -c$no_builtin_flag \"$dlsyms\")"
-        $run eval '(cd $objdir && $CC -c$no_builtin_flag "$dlsyms")' || exit $?
+	  # Now compile the dynamic symbol file.
+	  $show "(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")"
+	  $run eval '(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $?
 
-        # Transform the symbol file into the correct name.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$objdir/${output}S.o%"`
-      elif test "$export_dynamic" != yes; then
-        test -n "$dlfiles$dlprefiles" && $echo "$modename: warning: \`-dlopen' and \`-dlpreopen' are ignored without \`-export-dynamic'" 1>&2
+	  # Clean up the generated files.
+	  $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T"
+	  $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T"
+
+	  # Transform the symbol file into the correct name.
+	  compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+	  finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+	  ;;
+	*)
+	  $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
+	  exit 1
+	  ;;
+	esac
       else
-        # We keep going just in case the user didn't refer to
-        # dld_preloaded_symbols.  The linker will fail if global_symbol_pipe
-        # really was required.
-        $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+	# We keep going just in case the user didn't refer to
+	# lt_preloaded_symbols.  The linker will fail if global_symbol_pipe
+	# really was required.
 
-        # Nullify the symbol file.
-        compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+	# Nullify the symbol file.
+	compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
+	finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
       fi
 
-      if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then
-        # Replace the output file specification.
-        compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
-        finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+      if test $need_relink = no || test "$build_libtool_libs" != yes; then
+	# Replace the output file specification.
+	compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+	link_command="$compile_command$compile_rpath"
 
-        # We have no uninstalled library dependencies, so finalize right now.
-        $show "$compile_command"
-        $run eval "$compile_command"
-        exit $?
-      fi
-
-      # Replace the output file specification.
-      compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'%g'`
-      finalize_command=`$echo "X$finalize_command" | $Xsed -e 's%@OUTPUT@%'"$objdir/$output"'T%g'`
-
-      # Create the binary in the object directory, then wrap it.
-      if test -d $objdir; then :
-      else
-        $show "$mkdir $objdir"
-	$run $mkdir $objdir
+	# We have no uninstalled library dependencies, so finalize right now.
+	$show "$link_command"
+	$run eval "$link_command"
 	status=$?
-	if test $status -eq 0 || test -d $objdir; then :
-	else
-	  exit $status
+
+	# Delete the generated files.
+	if test -n "$dlsyms"; then
+	  $show "$rm $output_objdir/${outputname}S.${objext}"
+	  $run $rm "$output_objdir/${outputname}S.${objext}"
 	fi
+
+	exit $status
       fi
 
       if test -n "$shlibpath_var"; then
-        # We should set the shlibpath_var
-        rpath=
-        for dir in $temp_rpath; do
-          case "$dir" in
-          /* | [A-Za-z]:\\*)
-            # Absolute path.
-            rpath="$rpath$dir:"
-            ;;
-          *)
-            # Relative path: add a thisdir entry.
-            rpath="$rpath\$thisdir/$dir:"
-            ;;
-          esac
-        done
-        temp_rpath="$rpath"
+	# We should set the shlibpath_var
+	rpath=
+	for dir in $temp_rpath; do
+	  case $dir in
+	  [\\/]* | [A-Za-z]:[\\/]*)
+	    # Absolute path.
+	    rpath="$rpath$dir:"
+	    ;;
+	  *)
+	    # Relative path: add a thisdir entry.
+	    rpath="$rpath\$thisdir/$dir:"
+	    ;;
+	  esac
+	done
+	temp_rpath="$rpath"
       fi
 
-      # Delete the old output file.
-      $run $rm $output
-
-      if test -n "$compile_shlibpath"; then
-        compile_command="$shlibpath_var=\"$compile_shlibpath\$$shlibpath_var\" $compile_command"
+      if test -n "$compile_shlibpath$finalize_shlibpath"; then
+	compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
       fi
       if test -n "$finalize_shlibpath"; then
-        finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+	finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
       fi
 
-      if test -n "$runpath_var" && test -n "$perm_rpath"; then
-        # We should set the runpath_var.
-        rpath=
-        for dir in $perm_rpath; do
-          rpath="$rpath$dir:"
-        done
-        compile_command="$runpath_var=\"$rpath\$$runpath_var\" $compile_command"
-        finalize_command="$runpath_var=\"$rpath\$$runpath_var\" $finalize_command"
+      compile_var=
+      finalize_var=
+      if test -n "$runpath_var"; then
+	if test -n "$perm_rpath"; then
+	  # We should set the runpath_var.
+	  rpath=
+	  for dir in $perm_rpath; do
+	    rpath="$rpath$dir:"
+	  done
+	  compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+	fi
+	if test -n "$finalize_perm_rpath"; then
+	  # We should set the runpath_var.
+	  rpath=
+	  for dir in $finalize_perm_rpath; do
+	    rpath="$rpath$dir:"
+	  done
+	  finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+	fi
       fi
 
-      case "$hardcode_action" in
-      relink)
-        # AGH! Flame the AIX and HP-UX people for me, will ya?
-        $echo "$modename: warning: using a buggy system linker" 1>&2
-        $echo "$modename: relinking will be required before \`$output' can be installed" 1>&2
-        ;;
-      esac
+      if test "$no_install" = yes; then
+	# We don't need to create a wrapper script.
+	link_command="$compile_var$compile_command$compile_rpath"
+	# Replace the output file specification.
+	link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+	# Delete the old output file.
+	$run $rm $output
+	# Link the executable and exit
+	$show "$link_command"
+	$run eval "$link_command" || exit $?
+	exit 0
+      fi
 
-      $show "$compile_command"
-      $run eval "$compile_command" || exit $?
+      if test "$hardcode_action" = relink; then
+	# Fast installation is not supported
+	link_command="$compile_var$compile_command$compile_rpath"
+	relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+	$echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2
+	$echo "$modename: \`$output' will be relinked during installation" 1>&2
+      else
+	if test "$fast_install" != no; then
+	  link_command="$finalize_var$compile_command$finalize_rpath"
+	  if test "$fast_install" = yes; then
+	    relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'`
+	  else
+	    # fast_install is set to needless
+	    relink_command=
+	  fi
+	else
+	  link_command="$compile_var$compile_command$compile_rpath"
+	  relink_command="$finalize_var$finalize_command$finalize_rpath"
+	fi
+      fi
+
+      # Replace the output file specification.
+      link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+      # Delete the old output files.
+      $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+      $show "$link_command"
+      $run eval "$link_command" || exit $?
 
       # Now create the wrapper script.
       $show "creating $output"
 
-      # Quote the finalize command for shipping.
-      finalize_command=`$echo "X$finalize_command" | $Xsed -e "$sed_quote_subst"`
+      # Quote the relink command for shipping.
+      if test -n "$relink_command"; then
+	# Preserve any variables that may affect compiler behavior
+	for var in $variables_saved_for_relink; do
+	  if eval test -z \"\${$var+set}\"; then
+	    relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+	  elif eval var_value=\$$var; test -z "$var_value"; then
+	    relink_command="$var=; export $var; $relink_command"
+	  else
+	    var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+	    relink_command="$var=\"$var_value\"; export $var; $relink_command"
+	  fi
+	done
+	relink_command="(cd `pwd`; $relink_command)"
+	relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+      fi
 
       # Quote $echo for shipping.
-      qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+      if test "X$echo" = "X$SHELL $0 --fallback-echo"; then
+	case $0 in
+	[\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";;
+	*) qecho="$SHELL `pwd`/$0 --fallback-echo";;
+	esac
+	qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"`
+      else
+	qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+      fi
 
       # Only actually do things if our run command is non-null.
       if test -z "$run"; then
-        $rm $output
-        trap "$rm $output; exit 1" 1 2 15
+	# win32 will think the script is a binary if it has
+	# a .exe suffix, so we strip it off here.
+	case $output in
+	  *.exe) output=`echo $output|${SED} 's,.exe$,,'` ;;
+	esac
+	# test for cygwin because mv fails w/o .exe extensions
+	case $host in
+	  *cygwin*) exeext=.exe ;;
+	  *) exeext= ;;
+	esac
+	$rm $output
+	trap "$rm $output; exit 1" 1 2 15
 
-        $echo > $output "\
-#! /bin/sh
+	$echo > $output "\
+#! $SHELL
 
-# $output - temporary wrapper script for $objdir/$output
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
 #
 # The $output program cannot be directly executed until all the libtool
 # libraries that it depends on are installed.
 #
-# This wrapper script should never be moved out of \``pwd`'.
+# This wrapper script should never be moved out of the build directory.
 # If it is, it will not operate correctly.
 
 # Sed substitution that helps us do robust quoting.  It backslashifies
 # metacharacters that are still active within double-quoted strings.
-Xsed='sed -e s/^X//'
+Xsed="${SED}"' -e 1s/^X//'
 sed_quote_subst='$sed_quote_subst'
 
 # The HP-UX ksh and POSIX shell print the target directory to stdout
 # if CDPATH is set.
-if test \"\${CDPATH+set}\" = set; then CDPATH=; export CDPATH; fi
+if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi
+
+relink_command=\"$relink_command\"
 
 # This environment variable determines our operation mode.
 if test \"\$libtool_install_magic\" = \"$magic\"; then
-  # install mode needs the following variables:
-  link_against_libtool_libs='$link_against_libtool_libs'
-  finalize_command=\"$finalize_command\"
+  # install mode needs the following variable:
+  notinst_deplibs='$notinst_deplibs'
 else
   # When we are sourced in execute mode, \$file and \$echo are already set.
-  if test \"\$libtool_execute_magic\" = \"$magic\"; then :
-  else
+  if test \"\$libtool_execute_magic\" != \"$magic\"; then
     echo=\"$qecho\"
     file=\"\$0\"
+    # Make sure echo works.
+    if test \"X\$1\" = X--no-reexec; then
+      # Discard the --no-reexec flag, and continue.
+      shift
+    elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then
+      # Yippee, \$echo works!
+      :
+    else
+      # Restart under the correct shell, and then maybe \$echo will work.
+      exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"}
+    fi
   fi\
 "
-        $echo >> $output "\
+	$echo >> $output "\
 
   # Find the directory that this script lives in.
   thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
   test \"x\$thisdir\" = \"x\$file\" && thisdir=.
 
   # Follow symbolic links until we get to the real thisdir.
-  file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
+  file=\`ls -ld \"\$file\" | ${SED} -n 's/.*-> //p'\`
   while test -n \"\$file\"; do
     destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
 
     # If there was a directory component, then change thisdir.
     if test \"x\$destdir\" != \"x\$file\"; then
       case \"\$destdir\" in
-      /* | [A-Za-z]:\\*) thisdir=\"\$destdir\" ;;
+      [\\\\/]* | [A-Za-z]:[\\\\/]*) thisdir=\"\$destdir\" ;;
       *) thisdir=\"\$thisdir/\$destdir\" ;;
       esac
     fi
 
     file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
-    file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
+    file=\`ls -ld \"\$thisdir/\$file\" | ${SED} -n 's/.*-> //p'\`
   done
 
   # Try to get the absolute directory name.
   absdir=\`cd \"\$thisdir\" && pwd\`
   test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
 
+	if test "$fast_install" = yes; then
+	  echo >> $output "\
+  program=lt-'$outputname'$exeext
   progdir=\"\$thisdir/$objdir\"
-  program='$output'
+
+  if test ! -f \"\$progdir/\$program\" || \\
+     { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | ${SED} 1q\`; \\
+       test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+    file=\"\$\$-\$program\"
+
+    if test ! -d \"\$progdir\"; then
+      $mkdir \"\$progdir\"
+    else
+      $rm \"\$progdir/\$file\"
+    fi"
+
+	  echo >> $output "\
+
+    # relink executable if necessary
+    if test -n \"\$relink_command\"; then
+      if relink_command_output=\`eval \$relink_command 2>&1\`; then :
+      else
+	$echo \"\$relink_command_output\" >&2
+	$rm \"\$progdir/\$file\"
+	exit 1
+      fi
+    fi
+
+    $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+    { $rm \"\$progdir/\$program\";
+      $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+    $rm \"\$progdir/\$file\"
+  fi"
+	else
+	  echo >> $output "\
+  program='$outputname'
+  progdir=\"\$thisdir/$objdir\"
+"
+	fi
+
+	echo >> $output "\
 
   if test -f \"\$progdir/\$program\"; then"
 
-        # Export our shlibpath_var if we have one.
-        if test -n "$shlibpath_var" && test -n "$temp_rpath"; then
-          $echo >> $output "\
+	# Export our shlibpath_var if we have one.
+	if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+	  $echo >> $output "\
     # Add our own library path to $shlibpath_var
     $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
 
     # Some systems cannot cope with colon-terminated $shlibpath_var
-    $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/:*\$//'\`
+    # The second colon is a workaround for a bug in BeOS R4 ${SED}
+    $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\`
 
     export $shlibpath_var
 "
-        fi
+	fi
 
-        $echo >> $output "\
+	# fixup the dll searchpath if we need to.
+	if test -n "$dllsearchpath"; then
+	  $echo >> $output "\
+    # Add the dll search path components to the executable PATH
+    PATH=$dllsearchpath:\$PATH
+"
+	fi
+
+	$echo >> $output "\
     if test \"\$libtool_execute_magic\" != \"$magic\"; then
       # Run the actual program with our arguments.
+"
+	case $host in
+	# win32 systems need to use the prog path for dll
+	# lookup to work
+	*-*-cygwin* | *-*-pw32*)
+	  $echo >> $output "\
+      exec \$progdir/\$program \${1+\"\$@\"}
+"
+	  ;;
 
+	# Backslashes separate directories on plain windows
+	*-*-mingw | *-*-os2*)
+	  $echo >> $output "\
+      exec \$progdir\\\\\$program \${1+\"\$@\"}
+"
+	  ;;
+
+	*)
+	  $echo >> $output "\
       # Export the path to the program.
       PATH=\"\$progdir:\$PATH\"
       export PATH
 
       exec \$program \${1+\"\$@\"}
-
+"
+	  ;;
+	esac
+	$echo >> $output "\
       \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
       exit 1
     fi
@@ -1530,48 +3814,189 @@
   fi
 fi\
 "
-        chmod +x $output
+	chmod +x $output
       fi
       exit 0
       ;;
     esac
 
     # See if we need to build an old-fashioned archive.
-    if test "$build_old_libs" = "yes"; then
-      # Transform .lo files to .o files.
-      oldobjs="$objs"`$echo "X$libobjs " | $Xsed -e 's/[^   ]*\.a //g' -e 's/\.lo /.o /g' -e 's/ $//g'`
+    for oldlib in $oldlibs; do
+
+      if test "$build_libtool_libs" = convenience; then
+	oldobjs="$libobjs_save"
+	addlibs="$convenience"
+	build_libtool_libs=no
+      else
+	if test "$build_libtool_libs" = module; then
+	  oldobjs="$libobjs_save"
+	  build_libtool_libs=no
+	else
+	  oldobjs="$objs$old_deplibs "`$echo "X$libobjs_save" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`
+	fi
+	addlibs="$old_convenience"
+      fi
+
+      if test -n "$addlibs"; then
+	gentop="$output_objdir/${outputname}x"
+	$show "${rm}r $gentop"
+	$run ${rm}r "$gentop"
+	$show "mkdir $gentop"
+	$run mkdir "$gentop"
+	status=$?
+	if test $status -ne 0 && test ! -d "$gentop"; then
+	  exit $status
+	fi
+	generated="$generated $gentop"
+
+	# Add in members from convenience archives.
+	for xlib in $addlibs; do
+	  # Extract the objects.
+	  case $xlib in
+	  [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+	  *) xabs=`pwd`"/$xlib" ;;
+	  esac
+	  xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+	  xdir="$gentop/$xlib"
+
+	  $show "${rm}r $xdir"
+	  $run ${rm}r "$xdir"
+	  $show "mkdir $xdir"
+	  $run mkdir "$xdir"
+	  status=$?
+	  if test $status -ne 0 && test ! -d "$xdir"; then
+	    exit $status
+	  fi
+	  $show "(cd $xdir && $AR x $xabs)"
+	  $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+	  oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print -o -name \*.lo -print | $NL2SP`
+	done
+      fi
 
       # Do each command in the archive commands.
       if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
 	eval cmds=\"$old_archive_from_new_cmds\"
       else
+	# Ensure that we have .o objects in place in case we decided
+	# not to build a shared library, and have fallen back to building
+	# static libs even though --disable-static was passed!
+	for oldobj in $oldobjs; do
+	  if test ! -f $oldobj; then
+	    xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'`
+	    if test "X$xdir" = "X$oldobj"; then
+	      xdir="."
+	    else
+	      xdir="$xdir"
+	    fi
+	    baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'`
+	    obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+	    $show "(cd $xdir && ${LN_S} $obj $baseobj)"
+	    $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $?
+	  fi
+	done
+
 	eval cmds=\"$old_archive_cmds\"
       fi
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
+      save_ifs="$IFS"; IFS='~'
       for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
+	IFS="$save_ifs"
+	$show "$cmd"
+	$run eval "$cmd" || exit $?
       done
       IFS="$save_ifs"
+    done
+
+    if test -n "$generated"; then
+      $show "${rm}r$generated"
+      $run ${rm}r$generated
     fi
 
     # Now create the libtool archive.
-    case "$output" in
+    case $output in
     *.la)
       old_library=
-      test "$build_old_libs" = yes && old_library="$libname.a"
-
+      test "$build_old_libs" = yes && old_library="$libname.$libext"
       $show "creating $output"
 
+      # Preserve any variables that may affect compiler behavior
+      for var in $variables_saved_for_relink; do
+	if eval test -z \"\${$var+set}\"; then
+	  relink_command="{ test -z \"\${$var+set}\" || unset $var || { $var=; export $var; }; }; $relink_command"
+	elif eval var_value=\$$var; test -z "$var_value"; then
+	  relink_command="$var=; export $var; $relink_command"
+	else
+	  var_value=`$echo "X$var_value" | $Xsed -e "$sed_quote_subst"`
+	  relink_command="$var=\"$var_value\"; export $var; $relink_command"
+	fi
+      done
+      # Quote the link command for shipping.
+      relink_command="(cd `pwd`; $SHELL $0 --mode=relink $libtool_args)"
+      relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+
       # Only create the output if not a dry run.
       if test -z "$run"; then
-        $echo > $output "\
-# $output - a libtool library file
-# Generated by ltmain.sh - GNU $PACKAGE $VERSION
+	for installed in no yes; do
+	  if test "$installed" = yes; then
+	    if test -z "$install_libdir"; then
+	      break
+	    fi
+	    output="$output_objdir/$outputname"i
+	    # Replace all uninstalled libtool libraries with the installed ones
+	    newdependency_libs=
+	    for deplib in $dependency_libs; do
+	      case $deplib in
+	      *.la)
+		name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'`
+		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+		if test -z "$libdir"; then
+		  $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+		  exit 1
+		fi
+		newdependency_libs="$newdependency_libs $libdir/$name"
+		;;
+	      *) newdependency_libs="$newdependency_libs $deplib" ;;
+	      esac
+	    done
+	    dependency_libs="$newdependency_libs"
+	    newdlfiles=
+	    for lib in $dlfiles; do
+	      name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+	      eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+	      if test -z "$libdir"; then
+		$echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+		exit 1
+	      fi
+	      newdlfiles="$newdlfiles $libdir/$name"
+	    done
+	    dlfiles="$newdlfiles"
+	    newdlprefiles=
+	    for lib in $dlprefiles; do
+	      name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+	      eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+	      if test -z "$libdir"; then
+		$echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+		exit 1
+	      fi
+	      newdlprefiles="$newdlprefiles $libdir/$name"
+	    done
+	    dlprefiles="$newdlprefiles"
+	  fi
+	  $rm $output
+	  # place dlname in correct position for cygwin
+	  tdlname=$dlname
+	  case $host,$output,$installed,$module,$dlname in
+	    *cygwin*,*lai,yes,no,*.dll) tdlname=../bin/$dlname ;;
+	  esac
+	  $echo > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
 
 # The name that we can dlopen(3).
-dlname='$dlname'
+dlname='$tdlname'
 
 # Names of this library.
 library_names='$library_names'
@@ -1587,15 +4012,26 @@
 age=$age
 revision=$revision
 
+# Is this an already installed library?
+installed=$installed
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
 # Directory that this library needs to be installed in:
-libdir='$install_libdir'\
-"
+libdir='$install_libdir'"
+	  if test "$installed" = no && test $need_relink = yes; then
+	    $echo >> $output "\
+relink_command=\"$relink_command\""
+	  fi
+	done
       fi
 
       # Do a symbolic link so that the libtool archive can be found in
       # LD_LIBRARY_PATH before the program is installed.
-      $show "(cd $objdir && $LN_S ../$output $output)"
-      $run eval "(cd $objdir && $LN_S ../$output $output)" || exit 1
+      $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)"
+      $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $?
       ;;
     esac
     exit 0
@@ -1605,12 +4041,14 @@
   install)
     modename="$modename: install"
 
-    # There may be an optional /bin/sh argument at the beginning of
+    # There may be an optional sh(1) argument at the beginning of
     # install_prog (especially on Windows NT).
-    if test "$nonopt" = "$SHELL"; then
+    if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh ||
+       # Allow the use of GNU shtool's install command.
+       $echo "X$nonopt" | $Xsed | grep shtool > /dev/null; then
       # Aesthetically quote it.
       arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
-      case "$arg" in
+      case $arg in
       *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
 	arg="\"$arg\""
 	;;
@@ -1626,7 +4064,7 @@
     # The real first argument should be the name of the installation program.
     # Aesthetically quote it.
     arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-    case "$arg" in
+    case $arg in
     *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
       arg="\"$arg\""
       ;;
@@ -1639,42 +4077,42 @@
     opts=
     prev=
     install_type=
-    isdir=
+    isdir=no
     stripme=
     for arg
     do
       if test -n "$dest"; then
-        files="$files $dest"
-        dest="$arg"
-        continue
+	files="$files $dest"
+	dest="$arg"
+	continue
       fi
 
-      case "$arg" in
+      case $arg in
       -d) isdir=yes ;;
       -f) prev="-f" ;;
       -g) prev="-g" ;;
       -m) prev="-m" ;;
       -o) prev="-o" ;;
       -s)
-        stripme=" -s"
-        continue
-        ;;
+	stripme=" -s"
+	continue
+	;;
       -*) ;;
 
       *)
-        # If the previous option needed an argument, then skip it.
-        if test -n "$prev"; then
-          prev=
-        else
-          dest="$arg"
-          continue
-        fi
-        ;;
+	# If the previous option needed an argument, then skip it.
+	if test -n "$prev"; then
+	  prev=
+	else
+	  dest="$arg"
+	  continue
+	fi
+	;;
       esac
 
       # Aesthetically quote the argument.
       arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
-      case "$arg" in
+      case $arg in
       *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \	]*|*]*)
 	arg="\"$arg\""
 	;;
@@ -1696,9 +4134,9 @@
 
     if test -z "$files"; then
       if test -z "$dest"; then
-        $echo "$modename: no file or destination specified" 1>&2
+	$echo "$modename: no file or destination specified" 1>&2
       else
-        $echo "$modename: you must specify a destination" 1>&2
+	$echo "$modename: you must specify a destination" 1>&2
       fi
       $echo "$help" 1>&2
       exit 1
@@ -1709,7 +4147,7 @@
 
     # Check to see that the destination is a directory.
     test -d "$dest" && isdir=yes
-    if test -n "$isdir"; then
+    if test "$isdir" = yes; then
       destdir="$dest"
       destname=
     else
@@ -1720,23 +4158,23 @@
       # Not a directory, so check to see that there is only one file specified.
       set dummy $files
       if test $# -gt 2; then
-        $echo "$modename: \`$dest' is not a directory" 1>&2
-        $echo "$help" 1>&2
-        exit 1
+	$echo "$modename: \`$dest' is not a directory" 1>&2
+	$echo "$help" 1>&2
+	exit 1
       fi
     fi
-    case "$destdir" in
-    /* | [A-Za-z]:\\*) ;;
+    case $destdir in
+    [\\/]* | [A-Za-z]:[\\/]*) ;;
     *)
       for file in $files; do
-        case "$file" in
-        *.lo) ;;
-        *)
-          $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-          ;;
-        esac
+	case $file in
+	*.lo) ;;
+	*)
+	  $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	  ;;
+	esac
       done
       ;;
     esac
@@ -1751,210 +4189,266 @@
     for file in $files; do
 
       # Do each installation.
-      case "$file" in
-      *.a)
-        # Do the static libraries later.
-        staticlibs="$staticlibs $file"
-        ;;
+      case $file in
+      *.$libext)
+	# Do the static libraries later.
+	staticlibs="$staticlibs $file"
+	;;
 
       *.la)
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-        fi
+	# Check to see that this really is a libtool archive.
+	if (${SED} -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+	else
+	  $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	fi
 
-        library_names=
-        old_library=
-        # If there is no directory component, then add one.
-        case "$file" in
-        */* | *\\*) . $file ;;
-        *) . ./$file ;;
-        esac
+	library_names=
+	old_library=
+	relink_command=
+	# If there is no directory component, then add one.
+	case $file in
+	*/* | *\\*) . $file ;;
+	*) . ./$file ;;
+	esac
 
-        # Add the libdir to current_libdirs if it is the destination.
-        if test "X$destdir" = "X$libdir"; then
-          case "$current_libdirs " in
-          *" $libdir "*) ;;
-          *) current_libdirs="$current_libdirs $libdir" ;;
-          esac
-        else
-          # Note the libdir as a future libdir.
-          case "$future_libdirs " in
-          *" $libdir "*) ;;
-          *) future_libdirs="$future_libdirs $libdir" ;;
-          esac
-        fi
+	# Add the libdir to current_libdirs if it is the destination.
+	if test "X$destdir" = "X$libdir"; then
+	  case "$current_libdirs " in
+	  *" $libdir "*) ;;
+	  *) current_libdirs="$current_libdirs $libdir" ;;
+	  esac
+	else
+	  # Note the libdir as a future libdir.
+	  case "$future_libdirs " in
+	  *" $libdir "*) ;;
+	  *) future_libdirs="$future_libdirs $libdir" ;;
+	  esac
+	fi
 
-        dir="`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/"
-        test "X$dir" = "X$file/" && dir=
-        dir="$dir$objdir"
+	dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/
+	test "X$dir" = "X$file/" && dir=
+	dir="$dir$objdir"
 
-        # See the names of the shared library.
-        set dummy $library_names
-        if test -n "$2"; then
-          realname="$2"
-          shift
-          shift
+	if test -n "$relink_command"; then
+	  $echo "$modename: warning: relinking \`$file'" 1>&2
+	  $show "$relink_command"
+	  if $run eval "$relink_command"; then :
+	  else
+	    $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+	    continue
+	  fi
+	fi
 
-          # Install the shared library and build the symlinks.
-          $show "$install_prog $dir/$realname $destdir/$realname"
-          $run eval "$install_prog $dir/$realname $destdir/$realname" || exit $?
-          test "X$dlname" = "X$realname" && dlname=
+	# See the names of the shared library.
+	set dummy $library_names
+	if test -n "$2"; then
+	  realname="$2"
+	  shift
+	  shift
 
-          if test $# -gt 0; then
-            # Delete the old symlinks.
-            rmcmd="$rm"
-            for linkname
-            do
-              rmcmd="$rmcmd $destdir/$linkname"
-            done
-            $show "$rmcmd"
-            $run $rmcmd
+	  srcname="$realname"
+	  test -n "$relink_command" && srcname="$realname"T
 
-            # ... and create new ones.
-            for linkname
-            do
-              test "X$dlname" = "X$linkname" && dlname=
-              $show "(cd $destdir && $LN_S $realname $linkname)"
-              $run eval "(cd $destdir && $LN_S $realname $linkname)"
-            done
-          fi
+	  # Install the shared library and build the symlinks.
+	  $show "$install_prog $dir/$srcname $destdir/$realname"
+	  $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $?
+	  if test -n "$stripme" && test -n "$striplib"; then
+	    $show "$striplib $destdir/$realname"
+	    $run eval "$striplib $destdir/$realname" || exit $?
+	  fi
 
-          if test -n "$dlname"; then
-            # Install the dynamically-loadable library.
-            $show "$install_prog $dir/$dlname $destdir/$dlname"
-            $run eval "$install_prog $dir/$dlname $destdir/$dlname" || exit $?
-          fi
+	  if test $# -gt 0; then
+	    # Delete the old symlinks, and create new ones.
+	    for linkname
+	    do
+	      if test "$linkname" != "$realname"; then
+		$show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+		$run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+	      fi
+	    done
+	  fi
 
-          # Do each command in the postinstall commands.
-          lib="$destdir/$realname"
-          eval cmds=\"$postinstall_cmds\"
-          IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-          for cmd in $cmds; do
-            IFS="$save_ifs"
-            $show "$cmd"
-            $run eval "$cmd" || exit $?
-          done
-          IFS="$save_ifs"
-        fi
+	  # Do each command in the postinstall commands.
+	  lib="$destdir/$realname"
+	  eval cmds=\"$postinstall_cmds\"
+	  save_ifs="$IFS"; IFS='~'
+	  for cmd in $cmds; do
+	    IFS="$save_ifs"
+	    $show "$cmd"
+	    $run eval "$cmd" || exit $?
+	  done
+	  IFS="$save_ifs"
+	fi
 
-        # Install the pseudo-library for information purposes.
-        name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-        $show "$install_prog $file $destdir/$name"
-        $run eval "$install_prog $file $destdir/$name" || exit $?
+	# Install the pseudo-library for information purposes.
+	name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+	instname="$dir/$name"i
+	$show "$install_prog $instname $destdir/$name"
+	$run eval "$install_prog $instname $destdir/$name" || exit $?
 
-        # Maybe install the static library, too.
-        test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
-        ;;
+	# Maybe install the static library, too.
+	test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
+	;;
 
       *.lo)
-        # Install (i.e. copy) a libtool object.
+	# Install (i.e. copy) a libtool object.
 
-        # Figure out destination file name, if it wasn't already specified.
-        if test -n "$destname"; then
-          destfile="$destdir/$destname"
-        else
-          destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
-          destfile="$destdir/$destfile"
-        fi
+	# Figure out destination file name, if it wasn't already specified.
+	if test -n "$destname"; then
+	  destfile="$destdir/$destname"
+	else
+	  destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+	  destfile="$destdir/$destfile"
+	fi
 
-        # Deduce the name of the destination old-style object file.
-        case "$destfile" in
-        *.lo)
-          staticdest=`$echo "X$destfile" | $Xsed -e 's/\.lo$/\.o/'`
-          ;;
-        *.o)
-          staticdest="$destfile"
-          destfile=
-          ;;
-        *)
-          $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-          ;;
-        esac
+	# Deduce the name of the destination old-style object file.
+	case $destfile in
+	*.lo)
+	  staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"`
+	  ;;
+	*.$objext)
+	  staticdest="$destfile"
+	  destfile=
+	  ;;
+	*)
+	  $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	  ;;
+	esac
 
-        # Install the libtool object if requested.
-        if test -n "$destfile"; then
-          $show "$install_prog $file $destfile"
-          $run eval "$install_prog $file $destfile" || exit $?
-        fi
+	# Install the libtool object if requested.
+	if test -n "$destfile"; then
+	  $show "$install_prog $file $destfile"
+	  $run eval "$install_prog $file $destfile" || exit $?
+	fi
 
-        # Install the old object if enabled.
-        if test "$build_old_libs" = yes; then
-          # Deduce the name of the old-style object file.
-          staticobj=`$echo "X$file" | $Xsed -e 's/\.lo$/\.o/'`
+	# Install the old object if enabled.
+	if test "$build_old_libs" = yes; then
+	  # Deduce the name of the old-style object file.
+	  staticobj=`$echo "X$file" | $Xsed -e "$lo2o"`
 
-          $show "$install_prog $staticobj $staticdest"
-          $run eval "$install_prog \$staticobj \$staticdest" || exit $?
-        fi
-        exit 0
-        ;;
+	  $show "$install_prog $staticobj $staticdest"
+	  $run eval "$install_prog \$staticobj \$staticdest" || exit $?
+	fi
+	exit 0
+	;;
 
       *)
-        # Do a test to see if this is really a libtool program.
-        if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
-          link_against_libtool_libs=
-          finalize_command=
+	# Figure out destination file name, if it wasn't already specified.
+	if test -n "$destname"; then
+	  destfile="$destdir/$destname"
+	else
+	  destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+	  destfile="$destdir/$destfile"
+	fi
 
-          # If there is no directory component, then add one.
-          case "$file" in
-          */* | *\\*) . $file ;;
-          *) . ./$file ;;
-          esac
+	# Do a test to see if this is really a libtool program.
+	case $host in
+	*cygwin*|*mingw*)
+	    wrapper=`echo $file | ${SED} -e 's,\.exe$,,'` # strip only a literal .exe suffix
+	    ;;
+	*)
+	    wrapper=$file
+	    ;;
+	esac
+	if (${SED} -e '4q' $wrapper | egrep "^# Generated by .*$PACKAGE")>/dev/null 2>&1; then
+	  notinst_deplibs=
+	  relink_command=
 
-          # Check the variables that should have been set.
-          if test -z "$link_against_libtool_libs" || test -z "$finalize_command"; then
-            $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
-            exit 1
-          fi
+	  # If there is no directory component, then add one.
+	  case $file in
+	  */* | *\\*) . $wrapper ;;
+	  *) . ./$wrapper ;;
+	  esac
 
-          finalize=yes
-          for lib in $link_against_libtool_libs; do
-            # Check to see that each library is installed.
-            libdir=
-            if test -f "$lib"; then
-              # If there is no directory component, then add one.
-              case "$lib" in
-              */* | *\\*) . $lib ;;
-              *) . ./$lib ;;
-              esac
-            fi
-            libfile="$libdir/`$echo "X$lib" | $Xsed -e 's%^.*/%%g'`"
-            if test -z "$libdir"; then
-              $echo "$modename: warning: \`$lib' contains no -rpath information" 1>&2
-            elif test -f "$libfile"; then :
-            else
-              $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
-              finalize=no
-            fi
-          done
+	  # Check the variables that should have been set.
+	  if test -z "$notinst_deplibs"; then
+	    $echo "$modename: invalid libtool wrapper script \`$wrapper'" 1>&2
+	    exit 1
+	  fi
 
-          if test "$hardcode_action" = relink; then
-            if test "$finalize" = yes; then
-              $echo "$modename: warning: relinking \`$file' on behalf of your buggy system linker" 1>&2
-              $show "$finalize_command"
-              if $run eval "$finalize_command"; then :
-              else
-                $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
-                continue
-              fi
-              file="$objdir/$file"T
-            else
-              $echo "$modename: warning: cannot relink \`$file' on behalf of your buggy system linker" 1>&2
-            fi
-          else
-            # Install the binary that we compiled earlier.
+	  finalize=yes
+	  for lib in $notinst_deplibs; do
+	    # Check to see that each library is installed.
+	    libdir=
+	    if test -f "$lib"; then
+	      # If there is no directory component, then add one.
+	      case $lib in
+	      */* | *\\*) . $lib ;;
+	      *) . ./$lib ;;
+	      esac
+	    fi
+	    libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test
+	    if test -n "$libdir" && test ! -f "$libfile"; then
+	      $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
+	      finalize=no
+	    fi
+	  done
+
+	  relink_command=
+	  # If there is no directory component, then add one.
+	  case $file in
+	  */* | *\\*) . $wrapper ;;
+	  *) . ./$wrapper ;;
+	  esac
+
+	  outputname=
+	  if test "$fast_install" = no && test -n "$relink_command"; then
+	    if test "$finalize" = yes && test -z "$run"; then
+	      tmpdir="/tmp"
+	      test -n "$TMPDIR" && tmpdir="$TMPDIR"
+	      tmpdir="$tmpdir/libtool-$$"
+	      if $mkdir -p "$tmpdir" && chmod 700 "$tmpdir"; then :
+	      else
+		$echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2
+		continue
+	      fi
+	      file=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+	      outputname="$tmpdir/$file"
+	      # Replace the output file specification.
+	      relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'`
+
+	      $show "$relink_command"
+	      if $run eval "$relink_command"; then :
+	      else
+		$echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+		${rm}r "$tmpdir"
+		continue
+	      fi
+	      file="$outputname"
+	    else
+	      $echo "$modename: warning: cannot relink \`$file'" 1>&2
+	    fi
+	  else
+	    # Install the binary that we compiled earlier.
 	    file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
-          fi
-        fi
+	  fi
+	fi
 
-        $show "$install_prog$stripme $file $dest"
-        $run eval "$install_prog\$stripme \$file \$dest" || exit $?
-        ;;
+	# remove .exe since cygwin /usr/bin/install will append another
+	# one anyways
+	case $install_prog,$host in
+	/usr/bin/install*,*cygwin*)
+	  case $file:$destfile in
+	  *.exe:*.exe)
+	    # this is ok
+	    ;;
+	  *.exe:*)
+	    destfile=$destfile.exe
+	    ;;
+	  *:*.exe)
+	    destfile=`echo $destfile | ${SED} -e 's,.exe$,,'`
+	    ;;
+	  esac
+	  ;;
+	esac
+	$show "$install_prog$stripme $file $destfile"
+	$run eval "$install_prog\$stripme \$file \$destfile" || exit $?
+	test -n "$outputname" && ${rm}r "$tmpdir"
+	;;
       esac
     done
 
@@ -1967,13 +4461,18 @@
       $show "$install_prog $file $oldlib"
       $run eval "$install_prog \$file \$oldlib" || exit $?
 
+      if test -n "$stripme" && test -n "$old_striplib"; then # guard on the tool actually run below
+	$show "$old_striplib $oldlib"
+	$run eval "$old_striplib $oldlib" || exit $?
+      fi
+
       # Do each command in the postinstall commands.
       eval cmds=\"$old_postinstall_cmds\"
-      IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
+      save_ifs="$IFS"; IFS='~'
       for cmd in $cmds; do
-        IFS="$save_ifs"
-        $show "$cmd"
-        $run eval "$cmd" || exit $?
+	IFS="$save_ifs"
+	$show "$cmd"
+	$run eval "$cmd" || exit $?
       done
       IFS="$save_ifs"
     done
@@ -1985,54 +4484,59 @@
     if test -n "$current_libdirs"; then
       # Maybe just do a dry run.
       test -n "$run" && current_libdirs=" -n$current_libdirs"
-      exec $SHELL $0 --finish$current_libdirs
-      exit 1
+      exec_cmd='$SHELL $0 --finish$current_libdirs'
+    else
+      exit 0
     fi
-
-    exit 0
     ;;
 
   # libtool finish mode
   finish)
     modename="$modename: finish"
     libdirs="$nonopt"
+    admincmds=
 
     if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
       for dir
       do
-        libdirs="$libdirs $dir"
+	libdirs="$libdirs $dir"
       done
 
       for libdir in $libdirs; do
 	if test -n "$finish_cmds"; then
 	  # Do each command in the finish commands.
 	  eval cmds=\"$finish_cmds\"
-          IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-          for cmd in $cmds; do
-            IFS="$save_ifs"
-            $show "$cmd"
-            $run eval "$cmd"
-          done
-          IFS="$save_ifs"
+	  save_ifs="$IFS"; IFS='~'
+	  for cmd in $cmds; do
+	    IFS="$save_ifs"
+	    $show "$cmd"
+	    $run eval "$cmd" || admincmds="$admincmds
+       $cmd"
+	  done
+	  IFS="$save_ifs"
 	fi
 	if test -n "$finish_eval"; then
 	  # Do the single finish_eval.
 	  eval cmds=\"$finish_eval\"
-	  $run eval "$cmds"
+	  $run eval "$cmds" || admincmds="$admincmds
+       $cmds"
 	fi
       done
     fi
 
-    echo "------------------------------------------------------------------------------"
+    # Exit here if they wanted silent mode.
+    test "$show" = ":" && exit 0
+
+    echo "----------------------------------------------------------------------"
     echo "Libraries have been installed in:"
     for libdir in $libdirs; do
       echo "   $libdir"
     done
     echo
-    echo "To link against installed libraries in a given directory, LIBDIR,"
-    echo "you must use the \`-LLIBDIR' flag during linking."
-    echo
-    echo " You will also need to do one of the following:"
+    echo "If you ever happen to want to link against installed libraries"
+    echo "in a given directory, LIBDIR, you must either use libtool, and"
+    echo "specify the full pathname of the library, or use the \`-LLIBDIR'"
+    echo "flag during linking and do at least one of the following:"
     if test -n "$shlibpath_var"; then
       echo "   - add LIBDIR to the \`$shlibpath_var' environment variable"
       echo "     during execution"
@@ -2047,13 +4551,16 @@
 
       echo "   - use the \`$flag' linker flag"
     fi
+    if test -n "$admincmds"; then
+      echo "   - have your system administrator run these commands:$admincmds"
+    fi
     if test -f /etc/ld.so.conf; then
       echo "   - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
     fi
     echo
     echo "See any operating system documentation about shared libraries for"
     echo "more information, such as the ld(1) and ld.so(8) manual pages."
-    echo "------------------------------------------------------------------------------"
+    echo "----------------------------------------------------------------------"
     exit 0
     ;;
 
@@ -2071,32 +4578,31 @@
 
     # Handle -dlopen flags immediately.
     for file in $execute_dlfiles; do
-      if test -f "$file"; then :
-      else
+      if test ! -f "$file"; then
 	$echo "$modename: \`$file' is not a file" 1>&2
 	$echo "$help" 1>&2
 	exit 1
       fi
 
       dir=
-      case "$file" in
+      case $file in
       *.la)
-        # Check to see that this really is a libtool archive.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then :
-        else
-          $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
-          $echo "$help" 1>&2
-          exit 1
-        fi
+	# Check to see that this really is a libtool archive; name $file if it is not.
+	if (${SED} -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+	else
+	  $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+	  $echo "$help" 1>&2
+	  exit 1
+	fi
 
 	# Read the libtool library.
 	dlname=
 	library_names=
 
-        # If there is no directory component, then add one.
-	case "$file" in
+	# If there is no directory component, then add one.
+	case $file in
 	*/* | *\\*) . $file ;;
-        *) . ./$file ;;
+	*) . ./$file ;;
 	esac
 
 	# Skip this library if it cannot be dlopened.
@@ -2125,7 +4631,7 @@
 
       *)
 	$echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
-        continue
+	continue
 	;;
       esac
 
@@ -2149,13 +4655,13 @@
     args=
     for file
     do
-      case "$file" in
+      case $file in
       -*) ;;
       *)
-        # Do a test to see if this is really a libtool program.
-        if (sed -e '4q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
+	# Do a test to see if this is really a libtool program.
+	if (${SED} -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
 	  # If there is no directory component, then add one.
-	  case "$file" in
+	  case $file in
 	  */* | *\\*) . $file ;;
 	  *) . ./$file ;;
 	  esac
@@ -2163,7 +4669,7 @@
 	  # Transform arg to wrapped name.
 	  file="$progdir/$program"
 	fi
-        ;;
+	;;
       esac
       # Quote arguments (to preserve shell metacharacters).
       file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
@@ -2171,32 +4677,48 @@
     done
 
     if test -z "$run"; then
-      # Export the shlibpath_var.
-      eval "export $shlibpath_var"
+      if test -n "$shlibpath_var"; then
+	# Export the shlibpath_var.
+	eval "export $shlibpath_var"
+      fi
 
-      # Now actually exec the command.
-      eval "exec \$cmd$args"
+      # Restore saved environment variables
+      if test "${save_LC_ALL+set}" = set; then
+	LC_ALL="$save_LC_ALL"; export LC_ALL
+      fi
+      if test "${save_LANG+set}" = set; then
+	LANG="$save_LANG"; export LANG
+      fi
 
-      $echo "$modename: cannot exec \$cmd$args"
-      exit 1
+      # Now prepare to actually exec the command.
+      exec_cmd="\$cmd$args"
     else
       # Display what would be done.
-      eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
-      $echo "export $shlibpath_var"
+      if test -n "$shlibpath_var"; then
+	eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
+	$echo "export $shlibpath_var"
+      fi
       $echo "$cmd$args"
       exit 0
     fi
     ;;
 
-  # libtool uninstall mode
-  uninstall)
-    modename="$modename: uninstall"
+  # libtool clean and uninstall mode
+  clean | uninstall)
+    modename="$modename: $mode"
     rm="$nonopt"
     files=
+    rmforce=
+    exit_status=0
+
+    # This variable tells wrapper scripts just to set variables rather
+    # than running their programs.
+    libtool_install_magic="$magic"
 
     for arg
     do
-      case "$arg" in
+      case $arg in
+      -f) rm="$rm $arg"; rmforce=yes ;;
       -*) rm="$rm $arg" ;;
       *) files="$files $arg" ;;
       esac
@@ -2208,74 +4730,123 @@
       exit 1
     fi
 
+    rmdirs=
+
     for file in $files; do
       dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
-      test "X$dir" = "X$file" && dir=.
+      if test "X$dir" = "X$file"; then
+	dir=.
+	objdir="$objdir"
+      else
+	objdir="$dir/$objdir"
+      fi
       name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+      test $mode = uninstall && objdir="$dir"
+
+      # Remember objdir for removal later, being careful to avoid duplicates
+      if test $mode = clean; then
+	case " $rmdirs " in
+	  *" $objdir "*) ;;
+	  *) rmdirs="$rmdirs $objdir" ;;
+	esac
+      fi
+
+      # Don't error if the file doesn't exist and rm -f was used.
+      if (test -L "$file") >/dev/null 2>&1 \
+	|| (test -h "$file") >/dev/null 2>&1 \
+	|| test -f "$file"; then
+	:
+      elif test -d "$file"; then
+	exit_status=1
+	continue
+      elif test "$rmforce" = yes; then
+	continue
+      fi
 
       rmfiles="$file"
 
-      case "$name" in
+      case $name in
       *.la)
-        # Possibly a libtool archive, so verify it.
-        if (sed -e '2q' $file | egrep '^# Generated by ltmain\.sh') >/dev/null 2>&1; then
-          . $dir/$name
+	# Possibly a libtool archive, so verify it.
+	if (${SED} -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+	  . $dir/$name
 
-          # Delete the libtool libraries and symlinks.
-          for n in $library_names; do
-            rmfiles="$rmfiles $dir/$n"
-            test "X$n" = "X$dlname" && dlname=
-          done
-          test -n "$dlname" && rmfiles="$rmfiles $dir/$dlname"
-          test -n "$old_library" && rmfiles="$rmfiles $dir/$old_library"
+	  # Delete the libtool libraries and symlinks.
+	  for n in $library_names; do
+	    rmfiles="$rmfiles $objdir/$n"
+	  done
+	  test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library"
+	  test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i"
 
-	  $show "$rm $rmfiles"
-	  $run $rm $rmfiles
-
-	  if test -n "$library_names"; then
-	    # Do each command in the postuninstall commands.
-	    eval cmds=\"$postuninstall_cmds\"
-	    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-	    for cmd in $cmds; do
+	  if test $mode = uninstall; then
+	    if test -n "$library_names"; then
+	      # Do each command in the postuninstall commands.
+	      eval cmds=\"$postuninstall_cmds\"
+	      save_ifs="$IFS"; IFS='~'
+	      for cmd in $cmds; do
+		IFS="$save_ifs"
+		$show "$cmd"
+		$run eval "$cmd"
+		if test $? != 0 && test "$rmforce" != yes; then
+		  exit_status=1
+		fi
+	      done
 	      IFS="$save_ifs"
-	      $show "$cmd"
-	      $run eval "$cmd"
-	    done
-	    IFS="$save_ifs"
-	  fi
+	    fi
 
-          if test -n "$old_library"; then
-	    # Do each command in the old_postuninstall commands.
-	    eval cmds=\"$old_postuninstall_cmds\"
-	    IFS="${IFS= 	}"; save_ifs="$IFS"; IFS=';'
-	    for cmd in $cmds; do
+	    if test -n "$old_library"; then
+	      # Do each command in the old_postuninstall commands.
+	      eval cmds=\"$old_postuninstall_cmds\"
+	      save_ifs="$IFS"; IFS='~'
+	      for cmd in $cmds; do
+		IFS="$save_ifs"
+		$show "$cmd"
+		$run eval "$cmd"
+		if test $? != 0 && test "$rmforce" != yes; then
+		  exit_status=1
+		fi
+	      done
 	      IFS="$save_ifs"
-	      $show "$cmd"
-	      $run eval "$cmd"
-	    done
-	    IFS="$save_ifs"
+	    fi
+	    # FIXME: should reinstall the best remaining shared library.
 	  fi
-
-          # FIXME: should reinstall the best remaining shared library.
-        fi
-        ;;
+	fi
+	;;
 
       *.lo)
-        if test "$build_old_libs" = yes; then
-          oldobj=`$echo "X$name" | $Xsed -e 's/\.lo$/\.o/'`
-          rmfiles="$rmfiles $dir/$oldobj"
-        fi
-	$show "$rm $rmfiles"
-	$run $rm $rmfiles
-        ;;
+	if test "$build_old_libs" = yes; then
+	  oldobj=`$echo "X$name" | $Xsed -e "$lo2o"`
+	  rmfiles="$rmfiles $dir/$oldobj"
+	fi
+	;;
 
       *)
-      	$show "$rm $rmfiles"
-	$run $rm $rmfiles
+	# In clean mode, detect a libtool wrapper script and queue its real binaries for removal.
+	if test $mode = clean &&
+	   (${SED} -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+	  relink_command=
+	  . $dir/$name	# not $dir/$file: $file already carries its directory part
+
+	  rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}"
+	  if test "$fast_install" = yes && test -n "$relink_command"; then
+	    rmfiles="$rmfiles $objdir/lt-$name"
+	  fi
+	fi
 	;;
       esac
+      $show "$rm $rmfiles"
+      $run $rm $rmfiles || exit_status=1
     done
-    exit 0
+
+    # Try to remove the ${objdir}s in the directories where we deleted files
+    for dir in $rmdirs; do
+      if test -d "$dir"; then
+	$show "rmdir $dir"
+	$run rmdir $dir >/dev/null 2>&1
+      fi
+    done
+
+    exit $exit_status
     ;;
 
   "")
@@ -2285,20 +4856,29 @@
     ;;
   esac
 
-  $echo "$modename: invalid operation mode \`$mode'" 1>&2
-  $echo "$generic_help" 1>&2
-  exit 1
+  if test -z "$exec_cmd"; then
+    $echo "$modename: invalid operation mode \`$mode'" 1>&2
+    $echo "$generic_help" 1>&2
+    exit 1
+  fi
 fi # test -z "$show_help"
 
+if test -n "$exec_cmd"; then
+  eval exec $exec_cmd
+  exit 1
+fi
+
 # We need to display help for each of the modes.
-case "$mode" in
+case $mode in
 "") $echo \
 "Usage: $modename [OPTION]... [MODE-ARG]...
 
 Provide generalized library-building support services.
 
+    --config          show all configuration variables
+    --debug           enable verbose shell tracing
 -n, --dry-run         display commands without modifying any files
-    --features        display configuration information and exit
+    --features        display basic configuration information and exit
     --finish          same as \`--mode=finish'
     --help            display this help message and exit
     --mode=MODE       use operation mode MODE [default=inferred from MODE-ARGS]
@@ -2308,6 +4888,7 @@
 
 MODE must be one of the following:
 
+      clean           remove files from the build directory
       compile         compile a source file into a libtool object
       execute         automatically set library path, then run a program
       finish          complete the installation of libtool libraries
@@ -2320,12 +4901,33 @@
   exit 0
   ;;
 
+clean)
+  $echo \
+"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm').  RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+  ;;
+
 compile)
   $echo \
 "Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
 
 Compile a source file into a libtool library object.
 
+This mode accepts the following additional options:
+
+  -o OUTPUT-FILE    set the output file name to OUTPUT-FILE
+  -prefer-pic       try to build PIC objects only
+  -prefer-non-pic   try to build non-PIC objects only
+  -static           always build a \`.o' file suitable for static linking
+
 COMPILE-COMMAND is a command to be used in creating a \`standard' object file
 from the given SOURCEFILE.
 
@@ -2392,18 +4994,27 @@
 The following components of LINK-COMMAND are treated specially:
 
   -all-static       do not do any dynamic linking at all
+  -avoid-version    do not add a version suffix if possible
   -dlopen FILE      \`-dlpreopen' FILE if it cannot be dlopened at runtime
-  -dlpreopen FILE   link in FILE and add its symbols to dld_preloaded_symbols
+  -dlpreopen FILE   link in FILE and add its symbols to lt_preloaded_symbols
   -export-dynamic   allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+  -export-symbols SYMFILE
+		    try to export only the symbols listed in SYMFILE
+  -export-symbols-regex REGEX
+		    try to export only the symbols matching REGEX
   -LLIBDIR          search LIBDIR for required installed libraries
   -lNAME            OUTPUT-FILE requires the installed library libNAME
+  -module           build a library that can be dlopened
+  -no-fast-install  disable the fast-install mode
+  -no-install       link a not-installable executable
   -no-undefined     declare that a library does not refer to external symbols
   -o OUTPUT-FILE    create OUTPUT-FILE from the specified objects
   -release RELEASE  specify package release information
   -rpath LIBDIR     the created library will eventually be installed in LIBDIR
+  -R[ ]LIBDIR       add LIBDIR to the runtime path of programs and libraries
   -static           do not do any dynamic linking of libtool libraries
   -version-info CURRENT[:REVISION[:AGE]]
-                    specify library version info [each variable defaults to 0]
+		    specify library version info [each variable defaults to 0]
 
 All other options (arguments beginning with \`-') are ignored.
 
@@ -2411,18 +5022,19 @@
 treated as uninstalled libtool libraries, other files are standard or library
 object files.
 
-If the OUTPUT-FILE ends in \`.la', then a libtool library is created, only
-library objects (\`.lo' files) may be specified, and \`-rpath' is required.
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
 
-If OUTPUT-FILE ends in \`.a', then a standard library is created using \`ar'
-and \`ranlib'.
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
 
-If OUTPUT-FILE ends in \`.lo' or \`.o', then a reloadable object file is
-created, otherwise an executable program is created."
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
   ;;
 
 uninstall)
-  $echo
+  $echo \
 "Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
 
 Remove libraries from an installation directory.
diff --git a/makecfg.c b/makecfg.c
new file mode 100644
index 0000000..787ea37
--- /dev/null
+++ b/makecfg.c
@@ -0,0 +1,300 @@
+/*
+ * makecfg.c
+ *
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ * Last Modified : March 23, 2005
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+#ifndef offsetof		/* normally from <stddef.h>; fallback only if no header defined it */
+#define offsetof(type, mem) ((size_t) \
+		((char *)&((type *)0)->mem - (char *)(type *)0))
+#endif
+/* Print %define macros that add the byte offset of selected JPEG struct members to a base (b). */
+void
+print_structure_offset (void)
+{
+  printf("\n");
+  printf("; ---- macros for structure access -----------------------------------------\n");
+  printf("\n");
+
+  printf("; struct jpeg_compress_struct\n\n");
+  printf("%%define jcstruct_image_width(b)         ((b) + %3u) ; cinfo->image_width\n",
+	(unsigned)offsetof(struct jpeg_compress_struct, image_width));
+  printf("%%define jcstruct_max_v_samp_factor(b)   ((b) + %3u) ; cinfo->max_v_samp_factor\n",
+	(unsigned)offsetof(struct jpeg_compress_struct, max_v_samp_factor));
+  printf("\n");
+
+  printf("; struct jpeg_decompress_struct\n\n");
+  printf("%%define jdstruct_output_width(b)        ((b) + %3u) ; cinfo->output_width\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, output_width));
+  printf("%%define jdstruct_max_v_samp_factor(b)   ((b) + %3u) ; cinfo->max_v_samp_factor\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, max_v_samp_factor));
+  printf("%%define jdstruct_sample_range_limit(b)  ((b) + %3u) ; cinfo->sample_range_limit\n",
+	(unsigned)offsetof(struct jpeg_decompress_struct, sample_range_limit));
+  printf("\n");
+
+  printf("; jpeg_component_info\n\n");
+  printf("%%define jcompinfo_v_samp_factor(b)      ((b) + %2u) ; compptr->v_samp_factor\n",
+	(unsigned)offsetof(jpeg_component_info, v_samp_factor));
+  printf("%%define jcompinfo_width_in_blocks(b)    ((b) + %2u) ; compptr->width_in_blocks\n",
+	(unsigned)offsetof(jpeg_component_info, width_in_blocks));
+  printf("%%define jcompinfo_downsampled_width(b)  ((b) + %2u) ; compptr->downsampled_width\n",
+	(unsigned)offsetof(jpeg_component_info, downsampled_width));
+  printf("%%define jcompinfo_dct_table(b)          ((b) + %2u) ; compptr->dct_table\n",
+	(unsigned)offsetof(jpeg_component_info, dct_table));
+  printf("\n");
+}
+
+/* Print a %define or %undef line for NEED_SHORT_EXTERNAL_NAMES, mirroring this build's setting. */
+void
+print_jconfig_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jconfig.h -----------------------------------------------\n");
+  printf("\n");
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+  printf("%%define NEED_SHORT_EXTERNAL_NAMES\t; Use short forms of external names\n");
+#else
+  printf("%%undef NEED_SHORT_EXTERNAL_NAMES\t; Use short forms of external names\n");
+#endif
+  printf("\n");
+}
+
+/* Print the capability, RGB-layout and SIMD option macros as %define/%undef lines for the assembler. */
+void
+print_jmorecfg_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jmorecfg.h ----------------------------------------------\n");
+  printf("\n");
+
+  printf("; Capability options common to encoder and decoder:\n");
+  printf("\n");
+#ifdef DCT_ISLOW_SUPPORTED
+  printf("%%define DCT_ISLOW_SUPPORTED\t; slow but accurate integer algorithm\n");
+#else
+  printf("%%undef DCT_ISLOW_SUPPORTED\t; slow but accurate integer algorithm\n");
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+  printf("%%define DCT_IFAST_SUPPORTED\t; faster, less accurate integer method\n");
+#else
+  printf("%%undef DCT_IFAST_SUPPORTED\t; faster, less accurate integer method\n");
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+  printf("%%define DCT_FLOAT_SUPPORTED\t; floating-point: accurate, fast on fast HW\n");
+#else
+  printf("%%undef DCT_FLOAT_SUPPORTED\t; floating-point: accurate, fast on fast HW\n");
+#endif
+  printf("\n");
+
+  printf("; Decoder capability options:\n");
+  printf("\n");
+#ifdef IDCT_SCALING_SUPPORTED
+  printf("%%define IDCT_SCALING_SUPPORTED\t\t; Output rescaling via IDCT?\n");
+#else
+  printf("%%undef IDCT_SCALING_SUPPORTED\t\t; Output rescaling via IDCT?\n");
+#endif
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+  printf("%%define UPSAMPLE_MERGING_SUPPORTED\t; Fast path for sloppy upsampling?\n");
+#else
+  printf("%%undef UPSAMPLE_MERGING_SUPPORTED\t; Fast path for sloppy upsampling?\n");
+#endif
+#ifdef UPSAMPLE_H1V2_SUPPORTED
+  printf("%%define UPSAMPLE_H1V2_SUPPORTED\t\t; Fast/fancy processing for 1h2v?\n");
+#else
+  printf("%%undef UPSAMPLE_H1V2_SUPPORTED\t\t; Fast/fancy processing for 1h2v?\n");
+#endif
+  printf("\n");
+
+#if (RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4) && \
+    (RGB_RED < 0 || RGB_RED >= RGB_PIXELSIZE || RGB_GREEN < 0 || \
+     RGB_GREEN >= RGB_PIXELSIZE || RGB_BLUE < 0 || RGB_BLUE >= RGB_PIXELSIZE || \
+     RGB_RED == RGB_GREEN || RGB_GREEN == RGB_BLUE || RGB_RED == RGB_BLUE)
+#error "Incorrect RGB pixel offset."
+#endif
+  printf("; Ordering of RGB data in scanlines passed to or from the application.\n");
+  printf("\n");	/* the casts below make the arguments match the %u conversions */
+  printf("%%define RGB_RED\t\t%u\t; Offset of Red in an RGB scanline element\n", (unsigned)(RGB_RED));
+  printf("%%define RGB_GREEN\t%u\t; Offset of Green\n", (unsigned)(RGB_GREEN));
+  printf("%%define RGB_BLUE\t%u\t; Offset of Blue\n", (unsigned)(RGB_BLUE));
+  printf("%%define RGB_PIXELSIZE\t%u\t; JSAMPLEs per RGB scanline element\n", (unsigned)(RGB_PIXELSIZE));
+  printf("\n");
+#ifdef RGBX_FILLER_0XFF
+  printf("%%define RGBX_FILLER_0XFF\t; fill dummy bytes with 0xFF in RGBX format\n");
+#else
+  printf("%%undef RGBX_FILLER_0XFF\t\t; fill dummy bytes with 0xFF in RGBX format\n");
+#endif
+  printf("\n");
+
+  printf("; SIMD support options (encoder):\n");
+  printf("\n");
+#ifdef JCCOLOR_RGBYCC_MMX_SUPPORTED
+  printf("%%define JCCOLOR_RGBYCC_MMX_SUPPORTED\t; RGB->YCC conversion with MMX\n");
+#else
+  printf("%%undef JCCOLOR_RGBYCC_MMX_SUPPORTED\t; RGB->YCC conversion with MMX\n");
+#endif
+#ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED
+  printf("%%define JCCOLOR_RGBYCC_SSE2_SUPPORTED\t; RGB->YCC conversion with SSE2\n");
+#else
+  printf("%%undef JCCOLOR_RGBYCC_SSE2_SUPPORTED\t; RGB->YCC conversion with SSE2\n");
+#endif
+#ifdef JCSAMPLE_MMX_SUPPORTED
+  printf("%%define JCSAMPLE_MMX_SUPPORTED\t\t; downsampling with MMX\n");
+#else
+  printf("%%undef JCSAMPLE_MMX_SUPPORTED\t\t; downsampling with MMX\n");
+#endif
+#ifdef JCSAMPLE_SSE2_SUPPORTED
+  printf("%%define JCSAMPLE_SSE2_SUPPORTED\t\t; downsampling with SSE2\n");
+#else
+  printf("%%undef JCSAMPLE_SSE2_SUPPORTED\t\t; downsampling with SSE2\n");
+#endif
+#ifdef JFDCT_INT_MMX_SUPPORTED
+  printf("%%define JFDCT_INT_MMX_SUPPORTED\t\t; forward DCT with MMX\n");
+#else
+  printf("%%undef JFDCT_INT_MMX_SUPPORTED\t\t; forward DCT with MMX\n");
+#endif
+#ifdef JFDCT_INT_SSE2_SUPPORTED
+  printf("%%define JFDCT_INT_SSE2_SUPPORTED\t; forward DCT with SSE2\n");
+#else
+  printf("%%undef JFDCT_INT_SSE2_SUPPORTED\t\t; forward DCT with SSE2\n");
+#endif
+#ifdef JFDCT_FLT_3DNOW_MMX_SUPPORTED
+  printf("%%define JFDCT_FLT_3DNOW_MMX_SUPPORTED\t; forward DCT with 3DNow!/MMX\n");
+#else
+  printf("%%undef JFDCT_FLT_3DNOW_MMX_SUPPORTED\t; forward DCT with 3DNow!/MMX\n");
+#endif
+#ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
+  printf("%%define JFDCT_FLT_SSE_MMX_SUPPORTED\t; forward DCT with SSE/MMX\n");
+#else
+  printf("%%undef JFDCT_FLT_SSE_MMX_SUPPORTED\t; forward DCT with SSE/MMX\n");
+#endif
+#ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
+  printf("%%define JFDCT_FLT_SSE_SSE2_SUPPORTED\t; forward DCT with SSE/SSE2\n");
+#else
+  printf("%%undef JFDCT_FLT_SSE_SSE2_SUPPORTED\t; forward DCT with SSE/SSE2\n");
+#endif
+#ifdef JFDCT_INT_QUANTIZE_WITH_DIVISION
+  printf("%%define JFDCT_INT_QUANTIZE_WITH_DIVISION ; Use general quantization method\n");
+#else
+  printf("%%undef JFDCT_INT_QUANTIZE_WITH_DIVISION ; Use general quantization method\n");
+#endif
+  printf("\n");
+
+  printf("; SIMD support options (decoder):\n");
+  printf("\n");
+#ifdef JDCOLOR_YCCRGB_MMX_SUPPORTED
+  printf("%%define JDCOLOR_YCCRGB_MMX_SUPPORTED\t; YCC->RGB conversion with MMX\n");
+#else
+  printf("%%undef JDCOLOR_YCCRGB_MMX_SUPPORTED\t; YCC->RGB conversion with MMX\n");
+#endif
+#ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED
+  printf("%%define JDCOLOR_YCCRGB_SSE2_SUPPORTED\t; YCC->RGB conversion with SSE2\n");
+#else
+  printf("%%undef JDCOLOR_YCCRGB_SSE2_SUPPORTED\t; YCC->RGB conversion with SSE2\n");
+#endif
+#ifdef JDMERGE_MMX_SUPPORTED
+  printf("%%define JDMERGE_MMX_SUPPORTED\t\t; merged upsampling with MMX\n");
+#else
+  printf("%%undef JDMERGE_MMX_SUPPORTED\t\t; merged upsampling with MMX\n");
+#endif
+#ifdef JDMERGE_SSE2_SUPPORTED
+  printf("%%define JDMERGE_SSE2_SUPPORTED\t\t; merged upsampling with SSE2\n");
+#else
+  printf("%%undef JDMERGE_SSE2_SUPPORTED\t\t; merged upsampling with SSE2\n");
+#endif
+#ifdef JDSAMPLE_FANCY_MMX_SUPPORTED
+  printf("%%define JDSAMPLE_FANCY_MMX_SUPPORTED\t; fancy upsampling with MMX\n");
+#else
+  printf("%%undef JDSAMPLE_FANCY_MMX_SUPPORTED\t; fancy upsampling with MMX\n");
+#endif
+#ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED
+  printf("%%define JDSAMPLE_FANCY_SSE2_SUPPORTED\t; fancy upsampling with SSE2\n");
+#else
+  printf("%%undef JDSAMPLE_FANCY_SSE2_SUPPORTED\t; fancy upsampling with SSE2\n");
+#endif
+#ifdef JDSAMPLE_SIMPLE_MMX_SUPPORTED
+  printf("%%define JDSAMPLE_SIMPLE_MMX_SUPPORTED\t; sloppy upsampling with MMX\n");
+#else
+  printf("%%undef JDSAMPLE_SIMPLE_MMX_SUPPORTED\t; sloppy upsampling with MMX\n");
+#endif
+#ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED
+  printf("%%define JDSAMPLE_SIMPLE_SSE2_SUPPORTED\t; sloppy upsampling with SSE2\n");
+#else
+  printf("%%undef JDSAMPLE_SIMPLE_SSE2_SUPPORTED\t; sloppy upsampling with SSE2\n");
+#endif
+#ifdef JIDCT_INT_MMX_SUPPORTED
+  printf("%%define JIDCT_INT_MMX_SUPPORTED\t\t; inverse DCT with MMX\n");
+#else
+  printf("%%undef JIDCT_INT_MMX_SUPPORTED\t\t; inverse DCT with MMX\n");
+#endif
+#ifdef JIDCT_INT_SSE2_SUPPORTED
+  printf("%%define JIDCT_INT_SSE2_SUPPORTED\t; inverse DCT with SSE2\n");
+#else
+  printf("%%undef JIDCT_INT_SSE2_SUPPORTED\t\t; inverse DCT with SSE2\n");
+#endif
+#ifdef JIDCT_FLT_3DNOW_MMX_SUPPORTED
+  printf("%%define JIDCT_FLT_3DNOW_MMX_SUPPORTED\t; inverse DCT with 3DNow!/MMX\n");
+#else
+  printf("%%undef JIDCT_FLT_3DNOW_MMX_SUPPORTED\t; inverse DCT with 3DNow!/MMX\n");
+#endif
+#ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
+  printf("%%define JIDCT_FLT_SSE_MMX_SUPPORTED\t; inverse DCT with SSE/MMX\n");
+#else
+  printf("%%undef JIDCT_FLT_SSE_MMX_SUPPORTED\t; inverse DCT with SSE/MMX\n");
+#endif
+#ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED
+  printf("%%define JIDCT_FLT_SSE_SSE2_SUPPORTED\t; inverse DCT with SSE/SSE2\n");
+#else
+  printf("%%undef JIDCT_FLT_SSE_SSE2_SUPPORTED\t; inverse DCT with SSE/SSE2\n");
+#endif
+  printf("\n");
+}
+
+/* Print %define lines for JPEG_LIB_VERSION, JPEG_SIMDEXT_VERSION and JPEG_SIMDEXT_VER_STR. */
+void
+print_jpeglib_h_macro (void)
+{
+  printf("\n");
+  printf("; ---- macros from jpeglib.h ----------------------------------------------\n");
+  printf("\n");
+
+  printf("; Version ID for the JPEG library.\n");
+  printf("; Might be useful for tests like \"#if JPEG_LIB_VERSION >= 60\".\n");
+  printf("\n");
+  printf("%%define JPEG_LIB_VERSION  %d\n", JPEG_LIB_VERSION);
+  printf("\n");
+  printf("; SIMD Ext: Version ID for the SIMD extension.\n");
+  printf("\n");
+  printf("%%define JPEG_SIMDEXT_VERSION  %d\n", JPEG_SIMDEXT_VERSION);
+  printf("%%define JPEG_SIMDEXT_VER_STR  \"%s\"\n", JPEG_SIMDEXT_VER_STR);
+  printf("\n");
+}
+
+/* Write the jsimdcfg.inc text to stdout; exit status 1 if the output could not be fully written. */
+int
+main (void)
+{
+  printf(";\n; jsimdcfg.inc --- generated by makecfg.c");
+#if defined(__DATE__) && defined(__TIME__)
+  printf(" (%s, %s)", __DATE__, __TIME__);
+#endif
+  printf("\n;\n\n");
+  printf("%%define JSIMDCFG_INCLUDED\t; so that jsimdcfg.inc doesn't do it again\n\n");
+
+  print_structure_offset();
+  print_jconfig_h_macro();
+  print_jmorecfg_h_macro();
+  print_jpeglib_h_macro();
+
+  if (fflush(stdout) != 0 || ferror(stdout))
+    exit(1);			/* output is incomplete; let the build fail */
+  exit(0);
+  return 0;			/* suppress no-return-value warnings */
+}
diff --git a/makefile.ansi b/makefile.ansi
index 8291913..fb830fc 100644
--- a/makefile.ansi
+++ b/makefile.ansi
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
 
 # This makefile is suitable for Unix-like systems with ANSI-capable compilers.
 # If you have a non-ANSI compiler, makefile.unix is a better starting point.
@@ -13,6 +14,13 @@
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.
 
+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -felf -DELF
+
 # Link-time cc options:
 LDFLAGS= 
 
@@ -24,6 +32,10 @@
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= jmemnobs.o
 
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -75,17 +87,23 @@
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
         $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
         jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
         jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -125,7 +143,7 @@
 
 clean:
 	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) core testout*
+	$(RM) jsimdcfg.inc core testout*
 
 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -143,10 +161,63 @@
 	cmp testorig.jpg testoutt.jpg
 
 
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg ./makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) ./makecfg
+
+.asm.o:
+	$(NASM) $(NAFLAGS) -o $@ $*.asm
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -157,33 +228,33 @@
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
diff --git a/makefile.bc5 b/makefile.bc5
new file mode 100644
index 0000000..f3f7df6
--- /dev/null
+++ b/makefile.bc5
@@ -0,0 +1,320 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is suitable for Borland C++ Compiler 5.5 (win32)
+
+# Read installation instructions before saying "make" !!
+
+!ifndef srcdir
+srcdir = .
+!endif
+.path.c   = $(srcdir)
+.path.h   = $(srcdir)
+.path.asm = $(srcdir)
+.path.inc = $(srcdir);.
+.path.doc = $(srcdir)
+
+# The name of your C compiler:
+CC= bcc32
+
+# You may need to adjust these cc options:
+CFLAGS= -O2 -OS -Oc -d -ff -w-par -w-aus -w-ccc -w-rch -q -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# Object file format specifier for NASM;
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fobj -DOBJ32
+
+# Link-time cc options:
+LDFLAGS= -tWC -q
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= noeh32.lib
+
+# Put here the object file name for the correct system-dependent memory
+# manager file. For Win32, we recommend jmemnobs.c (flat memory!)
+# SYSDEPMEMLIB must list the same files with "+" signs for the librarian.
+SYSDEPMEM= jmemnobs.obj
+SYSDEPMEMLIB= +jmemnobs.obj
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+SYSDEPSIMDCHKLIB= +jsimdw32.obj
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
+        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
+        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
+# decompression library object files
+DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
+        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
+# These object files are included in libjpeg.lib
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
+        rdswitch.obj cdjpeg.obj
+DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
+        rdcolmap.obj cdjpeg.obj
+TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
+
+
+all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+libjpeg.lib: $(LIBOBJECTS)
+	- del libjpeg.lib
+	tlib libjpeg.lib /E /C @&&|
++jcapimin.obj +jcapistd.obj +jctrans.obj +jcparam.obj +jdatadst.obj &
++jcinit.obj +jcmaster.obj +jcmarker.obj +jcmainct.obj +jcprepct.obj &
++jccoefct.obj +jccolor.obj +jcsample.obj +jchuff.obj +jcphuff.obj &
++jcdctmgr.obj +jccolmmx.obj +jccolss2.obj +jcsammmx.obj +jcsamss2.obj &
++jcqntint.obj +jcqntflt.obj +jcqntmmx.obj +jcqnt3dn.obj +jcqnts2i.obj &
++jcqntsse.obj +jcqnts2f.obj +jfdctint.obj +jfdctfst.obj +jfdctflt.obj &
++jfmmxint.obj +jfmmxfst.obj +jf3dnflt.obj +jfss2int.obj +jfss2fst.obj &
++jfsseflt.obj +jdapimin.obj +jdapistd.obj +jdtrans.obj +jdatasrc.obj &
++jdmaster.obj +jdinput.obj +jdmarker.obj +jdhuff.obj +jdphuff.obj &
++jdmainct.obj +jdcoefct.obj +jdpostct.obj +jddctmgr.obj +jdsample.obj &
++jdcolor.obj +jquant1.obj +jquant2.obj +jdmerge.obj +jidctint.obj &
++jidctfst.obj +jidctred.obj +jidctflt.obj +jimmxint.obj +jimmxfst.obj &
++jimmxred.obj +ji3dnflt.obj +jiss2int.obj +jiss2fst.obj +jiss2red.obj &
++jisseflt.obj +jiss2flt.obj +jdsammmx.obj +jdsamss2.obj +jdcolmmx.obj &
++jdcolss2.obj +jdmermmx.obj +jdmerss2.obj +jcomapi.obj +jutils.obj &
++jerror.obj +jmemmgr.obj $(SYSDEPMEMLIB) +jsimdcpu.obj $(SYSDEPSIMDCHKLIB)
+|
+
+cjpeg.exe: $(COBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -ecjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -edjpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) libjpeg.lib
+	$(CC) $(LDFLAGS) -ejpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.obj
+	$(CC) $(LDFLAGS) -erdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.obj
+	$(CC) $(LDFLAGS) -ewrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+
+# This "{}" syntax allows Borland Make to "batch" source files.
+# In this way, each run of the compiler can build many modules.
+.c.obj:
+	$(CC) $(CFLAGS) -c{ $<}
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	- del *.obj
+	- del *.tds
+	- del cjpeg.exe
+	- del djpeg.exe
+	- del jpegtran.exe
+	- del rdjpgcom.exe
+	- del wrjpgcom.exe
+	- del jsimdcfg.inc
+	- del libjpeg.lib
+	- del testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	- del testout*.*
+	djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+# Build and run the makecfg helper to generate jsimdcfg.inc (-c: compile only; the next line links).
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) -c $(srcdir)\makecfg.c
+	$(CC) $(LDFLAGS) -emakecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	- del makecfg.tds
+	- del makecfg.obj
+	- del makecfg.exe
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.cfg b/makefile.cfg
index f25e42e..c18b7e6 100644
--- a/makefile.cfg
+++ b/makefile.cfg
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
 
 # makefile.cfg is edited by configure to produce a custom Makefile.
 
@@ -16,8 +17,9 @@
 includedir = $(prefix)/include
 binprefix =
 manprefix =
-manext = 1
-mandir = $(prefix)/man/man$(manext)
+manext = .1
+mandir = $(prefix)/man
+man1dir = $(mandir)/man1
 
 # The name of your C compiler:
 CC= @CC@
@@ -29,6 +31,10 @@
 # However, any special defines for ansi2knr.c may be included here:
 ANSI2KNRFLAGS= @ANSI2KNRFLAGS@
 
+# The executable name of NASM and its options:
+NASM= @NASM@
+NAFLAGS= @NAFLAGS@ @INCLUDEFLAGS@
+
 # Link-time cc options:
 LDFLAGS= @LDFLAGS@
 
@@ -37,6 +43,7 @@
 
 # If using GNU libtool, LIBTOOL references it; if not, LIBTOOL is empty.
 LIBTOOL = @LIBTOOL@
+top_builddir = .
 # $(O) expands to "lo" if using libtool, plain "o" if not.
 # Similarly, $(A) expands to "la" or "a".
 O = @O@
@@ -51,8 +58,12 @@
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= @MEMORYMGR@
 
+# OS-dependent SIMD instruction support checker
+# jsimdw32.$(O) (Win32) / jsimddjg.$(O) (DJGPP V.2) / jsimdgcc.$(O) (Unix/gcc)
+SYSDEPSIMDCHK= @SIMDCHECKER@
+
 # miscellaneous OS-dependent stuff
-SHELL= /bin/sh
+SHELL= @SHELL@
 # linker
 LN= @LN@
 # file deletion command
@@ -68,6 +79,11 @@
 INSTALL_PROGRAM= @INSTALL_PROGRAM@
 INSTALL_LIB= @INSTALL_LIB@
 INSTALL_DATA= @INSTALL_DATA@
+# uninstallation program
+UNINSTALL= @UNINSTALL@
+# Executable suffix. Under Cygwin,
+# 'rm' doesn't know that executables have the .exe suffix.
+EXE = @EXEEXT@
 
 # End of configurable options.
 
@@ -110,19 +126,26 @@
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
         $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM)
+COMOBJECTS= jcomapi.$(O) jutils.$(O) jerror.$(O) jmemmgr.$(O) $(SYSDEPMEM) \
+        jsimdcpu.$(O) $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.$(O) jcapistd.$(O) jctrans.$(O) jcparam.$(O) \
         jdatadst.$(O) jcinit.$(O) jcmaster.$(O) jcmarker.$(O) jcmainct.$(O) \
         jcprepct.$(O) jccoefct.$(O) jccolor.$(O) jcsample.$(O) jchuff.$(O) \
-        jcphuff.$(O) jcdctmgr.$(O) jfdctfst.$(O) jfdctflt.$(O) \
-        jfdctint.$(O)
+        jcphuff.$(O) jcdctmgr.$(O) jccolmmx.$(O) jccolss2.$(O) jcsammmx.$(O) \
+        jcsamss2.$(O) jcqntint.$(O) jcqntflt.$(O) jcqntmmx.$(O) jcqnt3dn.$(O) \
+        jcqnts2i.$(O) jcqntsse.$(O) jcqnts2f.$(O) jfdctint.$(O) jfdctfst.$(O) \
+        jfdctflt.$(O) jfmmxint.$(O) jfmmxfst.$(O) jf3dnflt.$(O) jfss2int.$(O) \
+        jfss2fst.$(O) jfsseflt.$(O)
 # decompression library object files
 DLIBOBJECTS= jdapimin.$(O) jdapistd.$(O) jdtrans.$(O) jdatasrc.$(O) \
         jdmaster.$(O) jdinput.$(O) jdmarker.$(O) jdhuff.$(O) jdphuff.$(O) \
-        jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) \
-        jidctfst.$(O) jidctflt.$(O) jidctint.$(O) jidctred.$(O) \
-        jdsample.$(O) jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O)
+        jdmainct.$(O) jdcoefct.$(O) jdpostct.$(O) jddctmgr.$(O) jdsample.$(O) \
+        jdcolor.$(O) jquant1.$(O) jquant2.$(O) jdmerge.$(O) jidctint.$(O) \
+        jidctfst.$(O) jidctred.$(O) jidctflt.$(O) jimmxint.$(O) jimmxfst.$(O) \
+        jimmxred.$(O) ji3dnflt.$(O) jiss2int.$(O) jiss2fst.$(O) jiss2red.$(O) \
+        jisseflt.$(O) jiss2flt.$(O) jdsammmx.$(O) jdsamss2.$(O) jdcolmmx.$(O) \
+        jdcolss2.$(O) jdmermmx.$(O) jdmerss2.$(O)
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -136,12 +159,19 @@
 all: @A2K_DEPS@ libjpeg.$(A) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
 
 # Special compilation rules to support ansi2knr and libtool.
-.SUFFIXES: .lo .la
+.SUFFIXES: .lo .la .asm
+
+.asm.o:
+	$(SHELL) $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) $(srcdir)/$*.asm
 
 # How to compile with libtool.
 @COM_LT@.c.lo:
 @COM_LT@	$(LIBTOOL) --mode=compile $(CC) $(CFLAGS) -c $(srcdir)/$*.c
 
+@COM_LT@.asm.lo:
+@COM_LT@	$(LIBTOOL) --mode=compile @TAGCC@ $(SHELL) $(srcdir)/nasm_lt.sh \
+@COM_LT@		$(NASM) $(NAFLAGS) $(srcdir)/$*.asm
+
 # How to use ansi2knr, when not using libtool.
 @COM_A2K@.c.o:
 @COM_A2K@	./ansi2knr $(srcdir)/$*.c knr/$*.c
@@ -169,7 +199,7 @@
 # with libtool:
 libjpeg.la: @A2K_DEPS@ $(LIBOBJECTS)
 	$(LIBTOOL) --mode=link $(CC) -o libjpeg.la $(LIBOBJECTS) \
-		-rpath $(libdir) -version-info $(JPEG_LIB_VERSION)
+		-no-undefined -rpath $(libdir) -version-info $(JPEG_LIB_VERSION)
 
 # sample programs:
 
@@ -191,34 +221,62 @@
 # Installation rules:
 
 install: cjpeg djpeg jpegtran rdjpgcom wrjpgcom @FORCE_INSTALL_LIB@
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	-@if [ ! -d $(man1dir) ]; then mkdir -p $(man1dir); fi
 	$(INSTALL_PROGRAM) cjpeg $(bindir)/$(binprefix)cjpeg
 	$(INSTALL_PROGRAM) djpeg $(bindir)/$(binprefix)djpeg
 	$(INSTALL_PROGRAM) jpegtran $(bindir)/$(binprefix)jpegtran
 	$(INSTALL_PROGRAM) rdjpgcom $(bindir)/$(binprefix)rdjpgcom
 	$(INSTALL_PROGRAM) wrjpgcom $(bindir)/$(binprefix)wrjpgcom
-	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(mandir)/$(manprefix)cjpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(mandir)/$(manprefix)djpeg.$(manext)
-	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
-	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
-	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
+	$(INSTALL_DATA) $(srcdir)/cjpeg.1 $(man1dir)/$(manprefix)cjpeg$(manext)
+	$(INSTALL_DATA) $(srcdir)/djpeg.1 $(man1dir)/$(manprefix)djpeg$(manext)
+	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(man1dir)/$(manprefix)jpegtran$(manext)
+	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(man1dir)/$(manprefix)rdjpgcom$(manext)
+	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(man1dir)/$(manprefix)wrjpgcom$(manext)
 
 install-lib: libjpeg.$(A) install-headers
+	-@if [ ! -d $(libdir) ]; then mkdir -p $(libdir); fi
 	$(INSTALL_LIB) libjpeg.$(A) $(libdir)/$(binprefix)libjpeg.$(A)
 
 install-headers: jconfig.h
+	-@if [ ! -d $(includedir) ]; then mkdir -p $(includedir); fi
 	$(INSTALL_DATA) jconfig.h $(includedir)/jconfig.h
 	$(INSTALL_DATA) $(srcdir)/jpeglib.h $(includedir)/jpeglib.h
 	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
 	$(INSTALL_DATA) $(srcdir)/jerror.h $(includedir)/jerror.h
 
+# Uninstallation rules:
+
+uninstall: @UNINSTALL_LIB@
+	$(UNINSTALL) $(bindir)/$(binprefix)cjpeg$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)djpeg$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)jpegtran$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)rdjpgcom$(EXE)
+	$(UNINSTALL) $(bindir)/$(binprefix)wrjpgcom$(EXE)
+	$(UNINSTALL) $(man1dir)/$(manprefix)cjpeg$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)djpeg$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)jpegtran$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)rdjpgcom$(manext)
+	$(UNINSTALL) $(man1dir)/$(manprefix)wrjpgcom$(manext)
+
+uninstall-lib: uninstall-headers
+	$(UNINSTALL) $(libdir)/$(binprefix)libjpeg.$(A)
+
+uninstall-headers:
+	$(UNINSTALL) $(includedir)/jconfig.h
+	$(UNINSTALL) $(includedir)/jpeglib.h
+	$(UNINSTALL) $(includedir)/jmorecfg.h
+	$(UNINSTALL) $(includedir)/jerror.h
+
 clean:
-	$(RM) *.o *.lo libjpeg.a libjpeg.la
-	$(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout* config.log config.status
+	$(RM) jsimdcfg.inc *.o *.lo libjpeg.a libjpeg.la
+#	 Under Cygwin, libtool will create wrapper scripts without a suffix.
+	$(RM) cjpeg djpeg jpegtran cjpeg$(EXE) djpeg$(EXE) jpegtran$(EXE)
+	$(RM) rdjpgcom$(EXE) wrjpgcom$(EXE) ansi2knr$(EXE) core testout*
 	$(RM) -r knr .libs _libs
 
 distclean: clean
-	$(RM) Makefile jconfig.h libtool config.cache
+	$(RM) Makefile jconfig.h libtool config.cache config.status config.log
 
 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -248,10 +306,60 @@
 .PHONY: all install install-lib install-headers clean distclean test check
 
 
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg $(srcdir)/makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc
+	$(RM) makecfg$(EXE)
+
+jsimdcpu.$(O): jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.$(O): jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.$(O): jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.$(O): jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.$(O): jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.$(O): jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.$(O): jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.$(O): jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.$(O): jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.$(O): jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.$(O): jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.$(O): jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.$(O): jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.$(O): jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.$(O): jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.$(O): jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.$(O): jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.$(O): jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.$(O): jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.$(O): jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.$(O): jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.$(O): jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.$(O): jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.$(O): jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.$(O): jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.$(O): jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.$(O): jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.$(O): jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.$(O): jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.$(O): jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.$(O): jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.$(O): jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.$(O): jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.$(O): jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.$(O): jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.$(O): jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.$(O): ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.$(O): jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.$(O): jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.$(O): jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.$(O): jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.$(O): jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.$(O): jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.$(O): jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.$(O): jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.$(O): jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.$(O): jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.$(O): jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.$(O): jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.$(O): jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -262,33 +370,33 @@
 jcparam.$(O): jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.$(O): jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.$(O): jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.$(O): jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.$(O): jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.$(O): jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.$(O): jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.$(O): jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.$(O): jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.$(O): jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.$(O): jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.$(O): jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.$(O): jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.$(O): jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.$(O): jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.$(O): jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.$(O): jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.$(O): jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.$(O): jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.$(O): jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.$(O): jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.$(O): jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.$(O): jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.$(O): jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.$(O): jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.$(O): jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.$(O): jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.$(O): jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.$(O): jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.$(O): jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.$(O): jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.$(O): jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.$(O): jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
diff --git a/makefile.dj b/makefile.dj
index f766d25..2186468 100644
--- a/makefile.dj
+++ b/makefile.dj
@@ -1,18 +1,34 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
 
 # This makefile is for DJGPP (Delorie's GNU C port on MS-DOS), v2.0 or later.
 # Thanks to Frank J. Donahoe for this version.
 
 # Read installation instructions before saying "make" !!
 
+srcdir = .
+VPATH  = $(srcdir)
+
 # The name of your C compiler:
 CC= gcc
 
 # You may need to adjust these cc options:
-CFLAGS= -O2 -Wall -I.
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.
 
+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fcoff -DDJGPP
+
 # Link-time cc options:
 LDFLAGS= -s
 
@@ -24,6 +40,10 @@
 # use jmemname.o if you want to use named temp files instead of swap space.
 SYSDEPMEM= jmemnobs.o
 
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimddjg.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -75,17 +95,23 @@
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
         $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
         jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
         jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -130,29 +156,83 @@
 	$(RM) jpegtran.exe
 	$(RM) rdjpgcom.exe
 	$(RM) wrjpgcom.exe
+	$(RM) jsimdcfg.inc
 	$(RM) libjpeg.a
 	$(RM) testout*.*
 
 test: cjpeg.exe djpeg.exe jpegtran.exe
 	$(RM) testout*.*
-	./djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
-	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
-	./cjpeg -dct int -outfile testout.jpg  testimg.ppm
-	./djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
-	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
-	./jpegtran -outfile testoutt.jpg testprog.jpg
-	fc /b testimg.ppm testout.ppm
-	fc /b testimg.bmp testout.bmp
-	fc /b testimg.jpg testout.jpg
-	fc /b testimg.ppm testoutp.ppm
-	fc /b testimgp.jpg testoutp.jpg
-	fc /b testorig.jpg testoutt.jpg
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
 
 
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.exe
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -163,33 +243,33 @@
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
diff --git a/makefile.linux b/makefile.linux
new file mode 100644
index 0000000..54e1d65
--- /dev/null
+++ b/makefile.linux
@@ -0,0 +1,449 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for Linux ELF with gcc
+
+# Read installation instructions before saying "make" !!
+
+# For compiling with source and object files in different directories.
+srcdir = .
+VPATH  = $(srcdir)
+
+# Where to install the programs and man pages.
+prefix = /usr/local
+exec_prefix = ${prefix}
+bindir = $(exec_prefix)/bin
+libdir = $(exec_prefix)/lib
+includedir = $(prefix)/include
+binprefix =
+manprefix =
+manext = 1
+mandir = $(prefix)/man/man$(manext)
+
+LNNAME	= libjpeg.so
+SONAME	= libjpeg.so.62
+LIBNAME	= libjpeg.so.62.1.0
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options (gcc 3.4 and later spell -mcpu as -mtune):
+CFLAGS= -O2 -mcpu=i686 -march=i386 -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -felf -DELF
+
+# Link-time cc options:
+LDFLAGS= 
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.  For Unix this is usually jmemnobs.o, but you may want
+# to use jmemansi.o or jmemname.o if you have limited swap space.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= rm -f
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+# installation program
+INSTALL= install -c
+INSTALL_PROGRAM= ${INSTALL} -s
+INSTALL_SHARED = ${INSTALL}
+INSTALL_LIB=  ${INSTALL} -m 644
+INSTALL_DATA= ${INSTALL} -m 644
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These object files are included in libjpeg.a
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# These object files are included in libjpeg.so
+DLLOBJECTS= $(LIBOBJECTS:.o=.pic.o)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+.PHONY: all app app-static shared static clean test test-static install install-app-static install-app install-man install-lib install-headers uninstall uninstall-app-static uninstall-app uninstall-man uninstall-lib uninstall-headers
+all: static shared app  # default goal (.PHONY above is never chosen as the default)
+app: cjpeg djpeg jpegtran rdjpgcom wrjpgcom  # wrapper scripts plus the two comment tools
+app-static: cjpeg-static djpeg-static jpegtran-static  # programs linked against libjpeg.a
+shared: $(LIBNAME)  # the versioned shared library
+static: libjpeg.a  # the static archive
+
+libjpeg.a: $(LIBOBJECTS)
+	$(RM) libjpeg.a
+	$(AR) libjpeg.a  $(LIBOBJECTS)
+	$(AR2) libjpeg.a
+
+$(LIBNAME): $(DLLOBJECTS)  # shared library from the .pic.o objects; honours LDFLAGS/LDLIBS like the other link rules
+	$(CC) -shared $(LDFLAGS) -Wl,-soname,$(SONAME) -o $(LIBNAME) $(DLLOBJECTS) $(LDLIBS)
+
+$(SONAME): $(LIBNAME)
+	ln -sf $(LIBNAME) $(SONAME)
+
+$(LNNAME): $(SONAME)
+	ln -sf $(LIBNAME) $(LNNAME)
+
+cjpeg-static: $(COBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o cjpeg-static $(COBJECTS) libjpeg.a $(LDLIBS)
+
+djpeg-static: $(DOBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o djpeg-static $(DOBJECTS) libjpeg.a $(LDLIBS)
+
+jpegtran-static: $(TROBJECTS) libjpeg.a
+	$(LN) $(LDFLAGS) -o jpegtran-static $(TROBJECTS) libjpeg.a $(LDLIBS)
+
+cjpeg-shared: $(COBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o cjpeg-shared $(COBJECTS) $(LNNAME) $(LDLIBS)
+
+djpeg-shared: $(DOBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o djpeg-shared $(DOBJECTS) $(LNNAME) $(LDLIBS)
+
+jpegtran-shared: $(TROBJECTS) $(LNNAME)
+	$(LN) $(LDFLAGS) -o jpegtran-shared $(TROBJECTS) $(LNNAME) $(LDLIBS)
+
+rdjpgcom: rdjpgcom.o
+	$(LN) $(LDFLAGS) -o rdjpgcom rdjpgcom.o $(LDLIBS)
+
+wrjpgcom: wrjpgcom.o
+	$(LN) $(LDFLAGS) -o wrjpgcom wrjpgcom.o $(LDLIBS)
+
+cjpeg: cjpeg-shared  # sh wrapper: runs cjpeg-shared with this build dir prepended to LD_LIBRARY_PATH (paths quoted)
+	echo '#!/bin/sh'                                       > cjpeg
+	echo "export LD_LIBRARY_PATH=\"`pwd`\$${LD_LIBRARY_PATH:+:\$$LD_LIBRARY_PATH}\"" >> cjpeg
+	echo "exec \"`pwd`/cjpeg-shared\" \"\$$@\""            >> cjpeg
+	chmod +x cjpeg
+
+djpeg: djpeg-shared  # sh wrapper: runs djpeg-shared with this build dir prepended to LD_LIBRARY_PATH (paths quoted)
+	echo '#!/bin/sh'                                       > djpeg
+	echo "export LD_LIBRARY_PATH=\"`pwd`\$${LD_LIBRARY_PATH:+:\$$LD_LIBRARY_PATH}\"" >> djpeg
+	echo "exec \"`pwd`/djpeg-shared\" \"\$$@\""            >> djpeg
+	chmod +x djpeg
+
+jpegtran: jpegtran-shared  # sh wrapper: runs jpegtran-shared with this build dir prepended to LD_LIBRARY_PATH (paths quoted)
+	echo '#!/bin/sh'                                       > jpegtran
+	echo "export LD_LIBRARY_PATH=\"`pwd`\$${LD_LIBRARY_PATH:+:\$$LD_LIBRARY_PATH}\"" >> jpegtran
+	echo "exec \"`pwd`/jpegtran-shared\" \"\$$@\""         >> jpegtran
+	chmod +x jpegtran
+
+jconfig.h: jconfig.doc
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:
+	$(RM) *.o libjpeg.a $(LIBNAME) $(SONAME) $(LNNAME)
+	$(RM) cjpeg djpeg jpegtran rdjpgcom wrjpgcom
+	$(RM) cjpeg-shared djpeg-shared jpegtran-shared
+	$(RM) cjpeg-static djpeg-static jpegtran-static
+	$(RM) core testout*
+	$(RM) jsimdcfg.inc
+
+test: cjpeg djpeg jpegtran
+	$(RM) testout*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)/testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)/testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)/testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)/testprog.jpg
+	cmp $(srcdir)/testimg.ppm testout.ppm
+	cmp $(srcdir)/testimg.bmp testout.bmp
+	cmp $(srcdir)/testimg.jpg testout.jpg
+	cmp $(srcdir)/testimg.ppm testoutp.ppm
+	cmp $(srcdir)/testimgp.jpg testoutp.jpg
+	cmp $(srcdir)/testorig.jpg testoutt.jpg
+
+test-static: cjpeg-static djpeg-static jpegtran-static
+	$(RM) testout*
+	./djpeg-static -dct int -ppm -outfile testout.ppm $(srcdir)/testorig.jpg
+	./djpeg-static -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)/testorig.jpg
+	./cjpeg-static -dct int -outfile testout.jpg $(srcdir)/testimg.ppm
+	./djpeg-static -dct int -ppm -outfile testoutp.ppm $(srcdir)/testprog.jpg
+	./cjpeg-static -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)/testimg.ppm
+	./jpegtran-static -outfile testoutt.jpg $(srcdir)/testprog.jpg
+	cmp $(srcdir)/testimg.ppm testout.ppm
+	cmp $(srcdir)/testimg.bmp testout.bmp
+	cmp $(srcdir)/testimg.jpg testout.jpg
+	cmp $(srcdir)/testimg.ppm testoutp.ppm
+	cmp $(srcdir)/testimgp.jpg testoutp.jpg
+	cmp $(srcdir)/testorig.jpg testoutt.jpg
+
+
+install: install-lib install-app install-man
+
+install-app-static: cjpeg-static djpeg-static jpegtran-static
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	$(INSTALL_PROGRAM) cjpeg-static    $(bindir)/$(binprefix)cjpeg-static
+	$(INSTALL_PROGRAM) djpeg-static    $(bindir)/$(binprefix)djpeg-static
+	$(INSTALL_PROGRAM) jpegtran-static $(bindir)/$(binprefix)jpegtran-static
+
+install-app: install-lib cjpeg-shared djpeg-shared jpegtran-shared rdjpgcom wrjpgcom
+	-@if [ ! -d $(bindir) ]; then mkdir -p $(bindir); fi
+	$(INSTALL_PROGRAM) cjpeg-shared    $(bindir)/$(binprefix)cjpeg
+	$(INSTALL_PROGRAM) djpeg-shared    $(bindir)/$(binprefix)djpeg
+	$(INSTALL_PROGRAM) jpegtran-shared $(bindir)/$(binprefix)jpegtran
+	$(INSTALL_PROGRAM) rdjpgcom        $(bindir)/$(binprefix)rdjpgcom
+	$(INSTALL_PROGRAM) wrjpgcom        $(bindir)/$(binprefix)wrjpgcom
+
+install-man: cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 wrjpgcom.1
+	-@if [ ! -d $(mandir) ]; then mkdir -p $(mandir); fi
+	$(INSTALL_DATA) $(srcdir)/cjpeg.1    $(mandir)/$(manprefix)cjpeg.$(manext)
+	$(INSTALL_DATA) $(srcdir)/djpeg.1    $(mandir)/$(manprefix)djpeg.$(manext)
+	$(INSTALL_DATA) $(srcdir)/jpegtran.1 $(mandir)/$(manprefix)jpegtran.$(manext)
+	$(INSTALL_DATA) $(srcdir)/rdjpgcom.1 $(mandir)/$(manprefix)rdjpgcom.$(manext)
+	$(INSTALL_DATA) $(srcdir)/wrjpgcom.1 $(mandir)/$(manprefix)wrjpgcom.$(manext)
+
+install-lib: install-headers libjpeg.a $(LIBNAME)  # installs both libraries; symlinks are made only if cd into libdir succeeds
+	-@if [ ! -d $(libdir) ]; then mkdir -p $(libdir); fi
+	$(INSTALL_LIB)    libjpeg.a  $(libdir)/libjpeg.a
+	$(INSTALL_SHARED) $(LIBNAME) $(libdir)/$(LIBNAME)
+	(cd $(libdir) && ln -sf $(LIBNAME) $(SONAME) && ln -sf $(LIBNAME) $(LNNAME))
+
+install-headers: jconfig.h jpeglib.h jmorecfg.h jerror.h
+	-@if [ ! -d $(includedir) ]; then mkdir -p $(includedir); fi
+	$(INSTALL_DATA) $(srcdir)/jconfig.h  $(includedir)/jconfig.h
+	$(INSTALL_DATA) $(srcdir)/jpeglib.h  $(includedir)/jpeglib.h
+	$(INSTALL_DATA) $(srcdir)/jmorecfg.h $(includedir)/jmorecfg.h
+	$(INSTALL_DATA) $(srcdir)/jerror.h   $(includedir)/jerror.h
+
+uninstall: uninstall-lib uninstall-app uninstall-man
+
+uninstall-app-static:
+	$(RM) $(bindir)/$(binprefix)cjpeg-static
+	$(RM) $(bindir)/$(binprefix)djpeg-static
+	$(RM) $(bindir)/$(binprefix)jpegtran-static
+
+uninstall-app: uninstall-lib
+	$(RM) $(bindir)/$(binprefix)cjpeg
+	$(RM) $(bindir)/$(binprefix)djpeg
+	$(RM) $(bindir)/$(binprefix)jpegtran
+	$(RM) $(bindir)/$(binprefix)rdjpgcom
+	$(RM) $(bindir)/$(binprefix)wrjpgcom
+
+uninstall-man:
+	$(RM) $(mandir)/$(manprefix)cjpeg.$(manext)
+	$(RM) $(mandir)/$(manprefix)djpeg.$(manext)
+	$(RM) $(mandir)/$(manprefix)jpegtran.$(manext)
+	$(RM) $(mandir)/$(manprefix)rdjpgcom.$(manext)
+	$(RM) $(mandir)/$(manprefix)wrjpgcom.$(manext)
+
+uninstall-lib: uninstall-headers
+	$(RM) $(libdir)/libjpeg.a
+	$(RM) $(libdir)/$(LIBNAME)
+	$(RM) $(libdir)/$(SONAME)
+	$(RM) $(libdir)/$(LNNAME)
+
+uninstall-headers:
+	$(RM) $(includedir)/jconfig.h
+	$(RM) $(includedir)/jpeglib.h
+	$(RM) $(includedir)/jmorecfg.h
+	$(RM) $(includedir)/jerror.h
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h  # output of makecfg; a partial file is deleted if makecfg fails
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg $(srcdir)/makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc || { $(RM) jsimdcfg.inc ./makecfg; exit 1; }
+	$(RM) ./makecfg
+
+.SUFFIXES: .c .asm .o .pic.o
+
+%.pic.o : %.c
+	$(CC) $(CFLAGS) -fPIC -c -o $@ $<
+
+%.pic.o : %.asm
+	$(NASM) $(NAFLAGS) -DPIC -o $@ $<
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o jsimdcpu.pic.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o jsimdw32.pic.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o jsimddjg.pic.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o jccolmmx.pic.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o jccolss2.pic.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o jcsammmx.pic.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o jcsamss2.pic.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o jdcolmmx.pic.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o jdcolss2.pic.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o jdmermmx.pic.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o jdmerss2.pic.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o jdsammmx.pic.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o jdsamss2.pic.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o jcqntint.pic.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o jcqntflt.pic.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o jcqntmmx.pic.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o jcqnt3dn.pic.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o jcqnts2i.pic.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o jcqntsse.pic.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o jcqnts2f.pic.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o jfdctint.pic.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o jfdctfst.pic.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o jfdctflt.pic.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o jfmmxint.pic.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o jfmmxfst.pic.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o jf3dnflt.pic.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o jfss2int.pic.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o jfss2fst.pic.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o jfsseflt.pic.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o jidctint.pic.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o jidctfst.pic.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o jidctred.pic.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o jidctflt.pic.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o jimmxint.pic.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o jimmxfst.pic.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o jimmxred.pic.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o ji3dnflt.pic.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o jiss2int.pic.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o jiss2fst.pic.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o jiss2red.pic.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o jisseflt.pic.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o jiss2flt.pic.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o jsimdgcc.pic.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o jcapimin.pic.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o jcapistd.pic.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o jccoefct.pic.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o jccolor.pic.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o jcdctmgr.pic.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o jchuff.pic.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o jcinit.pic.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o jcmainct.pic.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o jcmarker.pic.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o jcmaster.pic.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o jcomapi.pic.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o jcparam.pic.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o jcphuff.pic.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o jcprepct.pic.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o jcsample.pic.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o jctrans.pic.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o jdapimin.pic.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o jdapistd.pic.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o jdatadst.pic.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o jdatasrc.pic.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o jdcoefct.pic.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o jdcolor.pic.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o jddctmgr.pic.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o jdhuff.pic.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o jdinput.pic.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o jdmainct.pic.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o jdmarker.pic.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o jdmaster.pic.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o jdmerge.pic.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o jdphuff.pic.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o jdpostct.pic.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o jdsample.pic.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o jdtrans.pic.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o jerror.pic.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o jfdctflt.pic.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o jfdctfst.pic.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o jfdctint.pic.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o jidctflt.pic.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o jidctfst.pic.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o jidctint.pic.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o jidctred.pic.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o jquant1.pic.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o jquant2.pic.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o jutils.pic.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o jmemmgr.pic.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o jmemansi.pic.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o jmemname.pic.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o jmemnobs.pic.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o jmemdos.pic.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o jmemmac.pic.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.mgw b/makefile.mgw
new file mode 100644
index 0000000..06f09e0
--- /dev/null
+++ b/makefile.mgw
@@ -0,0 +1,298 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for MinGW.
+
+# Read installation instructions before saying "make" !!
+
+srcdir = .
+VPATH  = $(srcdir)
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options:
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time cc options:
+LDFLAGS= -s
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= del
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These object files are included in libjpeg.a
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+.PHONY: all clean test  # these name actions, not files; run them even if a file of that name exists
+all: libjpeg.a cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+libjpeg.a: $(LIBOBJECTS)  # static library: discard any old archive, re-create it, then index it
+	-$(RM) libjpeg.a
+	$(AR)  libjpeg.a  $(LIBOBJECTS)
+	$(AR2) libjpeg.a
+
+cjpeg.exe: $(COBJECTS) libjpeg.a  # sample application: its own objects plus the static library
+	$(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) libjpeg.a $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) libjpeg.a  # sample application: its own objects plus the static library
+	$(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) libjpeg.a $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) libjpeg.a  # sample application: its own objects plus the static library
+	$(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) libjpeg.a $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.o  # stand-alone tool: linked without libjpeg.a
+	$(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.o  # stand-alone tool: linked without libjpeg.a
+	$(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
+
+jconfig.h: jconfig.doc  # never generated here: the recipe only prints instructions and fails the build
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:  # delete build outputs and test results; the leading '-' makes make ignore a failing delete
+	-$(RM) *.o
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) libjpeg.a
+	-$(RM) testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe  # run the tools on the sample images, then binary-compare (fc /b) with the files in $(srcdir)
+	-$(RM) testout*.*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+.DELETE_ON_ERROR:  # remove a target whose recipe failed, so a truncated jsimdcfg.inc is never reused
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	-$(RM) makecfg.exe
+
+%.o : %.asm  # assemble with $(NASM): $< is the .asm prerequisite, $@ the object to write
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.mgwdll b/makefile.mgwdll
new file mode 100644
index 0000000..08a3e69
--- /dev/null
+++ b/makefile.mgwdll
@@ -0,0 +1,310 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for MinGW.
+# It builds the IJG library as a dynamically linkable library (.DLL),
+# and builds the sample applications which are linked against the DLL.
+
+# Read installation instructions before saying "make" !!
+
+srcdir = .
+VPATH  = $(srcdir)
+
+# The name of your C compiler:
+CC= gcc
+
+# You may need to adjust these cc options:
+# For gcc 3.4.x
+CFLAGS= -O2 -mtune=pentium2 -march=i386 -fomit-frame-pointer -fweb \
+        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# For gcc 3.3.x
+#CFLAGS= -O2 -mcpu=pentium2 -march=i386 -fomit-frame-pointer \
+#        -mpreferred-stack-boundary=2 -mno-align-stringops -I$(srcdir)
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I$(srcdir)/
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time cc options:
+LDFLAGS= -s
+LDFLAGS_DLL= $(LDFLAGS) -shared
+
+# To link any special libraries, add the necessary -l commands here.
+LDLIBS= 
+
+# DLL to build
+DLLNAME = jpeg62.dll
+# import library
+LIBNAME = libjpeg.dll.a
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.
+SYSDEPMEM= jmemnobs.o
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.o
+
+# miscellaneous OS-dependent stuff
+# linker
+LN= $(CC)
+# file deletion command
+RM= del
+# library (.a) file creation command
+AR= ar rc
+# second step in .a creation (use "touch" if not needed)
+AR2= ranlib
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
+        jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
+# decompression library object files
+DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
+        jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
+# These object files are linked into $(DLLNAME)
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.o rdppm.o rdgif.o rdtarga.o rdrle.o rdbmp.o rdswitch.o \
+        cdjpeg.o
+DOBJECTS= djpeg.o wrppm.o wrgif.o wrtarga.o wrrle.o wrbmp.o rdcolmap.o \
+        cdjpeg.o
+TROBJECTS= jpegtran.o rdswitch.o cdjpeg.o transupp.o
+
+.PHONY: all clean test  # these name actions, not files; run them even if a file of that name exists
+all: $(DLLNAME) cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+$(LIBNAME): $(DLLNAME)  # no recipe: the import library is written by the DLL link below (--out-implib)
+$(DLLNAME): $(LIBOBJECTS) jpegdll.o jpegdll.def
+	$(LN) $(LDFLAGS_DLL) -o $(DLLNAME) -Wl,--out-implib,$(LIBNAME) \
+		$(LIBOBJECTS) jpegdll.o jpegdll.def
+
+jpegdll.o: jpegdll.rc  # windres turns the .rc into a COFF object; $< follows VPATH, unlike $*.rc
+	windres -O coff -o $@ $<
+
+cjpeg.exe: $(COBJECTS) $(LIBNAME)  # sample application: its own objects plus the DLL import library
+	$(LN) $(LDFLAGS) -o cjpeg.exe $(COBJECTS) $(LIBNAME) $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) $(LIBNAME)  # sample application: its own objects plus the DLL import library
+	$(LN) $(LDFLAGS) -o djpeg.exe $(DOBJECTS) $(LIBNAME) $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) $(LIBNAME)  # sample application: its own objects plus the DLL import library
+	$(LN) $(LDFLAGS) -o jpegtran.exe $(TROBJECTS) $(LIBNAME) $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.o  # stand-alone tool: linked without the JPEG library
+	$(LN) $(LDFLAGS) -o rdjpgcom.exe rdjpgcom.o $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.o  # stand-alone tool: linked without the JPEG library
+	$(LN) $(LDFLAGS) -o wrjpgcom.exe wrjpgcom.o $(LDLIBS)
+
+jconfig.h: jconfig.doc  # never generated here: the recipe only prints instructions and fails the build
+	echo You must prepare a system-dependent jconfig.h file.
+	echo Please read the installation directions in install.doc.
+	exit 1
+
+clean:  # delete build outputs and test results; the leading '-' makes make ignore a failing delete
+	-$(RM) *.o
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) $(DLLNAME)
+	-$(RM) $(LIBNAME)
+	-$(RM) testout*.*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe  # run the tools on the sample images, then binary-compare (fc /b) with the files in $(srcdir)
+	-$(RM) testout*.*
+	./djpeg -dct int -ppm -outfile testout.ppm $(srcdir)\testorig.jpg
+	./djpeg -dct int -bmp -colors 256 -outfile testout.bmp $(srcdir)\testorig.jpg
+	./cjpeg -dct int -outfile testout.jpg $(srcdir)\testimg.ppm
+	./djpeg -dct int -ppm -outfile testoutp.ppm $(srcdir)\testprog.jpg
+	./cjpeg -dct int -progressive -opt -outfile testoutp.jpg $(srcdir)\testimg.ppm
+	./jpegtran -outfile testoutt.jpg $(srcdir)\testprog.jpg
+	fc /b $(srcdir)\testimg.ppm testout.ppm
+	fc /b $(srcdir)\testimg.bmp testout.bmp
+	fc /b $(srcdir)\testimg.jpg testout.jpg
+	fc /b $(srcdir)\testimg.ppm testoutp.ppm
+	fc /b $(srcdir)\testimgp.jpg testoutp.jpg
+	fc /b $(srcdir)\testorig.jpg testoutt.jpg
+
+.DELETE_ON_ERROR:  # remove a target whose recipe failed, so a truncated jsimdcfg.inc is never reused
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg.exe $(srcdir)/makecfg.c $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	-$(RM) makecfg.exe
+
+%.o : %.asm
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.o: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.o: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.o: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.o: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.o: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.o: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.o: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.o: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.o: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.o: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.o: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.o: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.o: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.o: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.o: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.o: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.o: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.o: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.o: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.o: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.o: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.o: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.o: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.o: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.o: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.o: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.o: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.o: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.o: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/makefile.unix b/makefile.unix
index 00455ab..e05ecc0 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -1,4 +1,5 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
 
 # This makefile is suitable for Unix-like systems with non-ANSI compilers.
 # If you have an ANSI compiler, makefile.ansi is a better starting point.
@@ -15,6 +16,13 @@
 # However, any special defines for ansi2knr.c may be included here:
 ANSI2KNRFLAGS= 
 
+# The executable name of NASM and its options:
+NASM= nasm
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM (adjust it to match the object
+# format used by your system); see jsimdext.inc for more details.
+NASM_OBJFMT= -faout -DAOUT
+
 # Link-time cc options:
 LDFLAGS= 
 
@@ -26,6 +34,10 @@
 # to use jmemansi.o or jmemname.o if you have limited swap space.
 SYSDEPMEM= jmemnobs.o
 
+# OS-dependent SIMD instruction support checker
+# jsimdw32.o (Win32) / jsimddjg.o (DJGPP V.2) / jsimdgcc.o (Unix/gcc)
+SYSDEPSIMDCHK= jsimdgcc.o
+
 # miscellaneous OS-dependent stuff
 # linker
 LN= $(CC)
@@ -79,17 +91,23 @@
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
         $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM)
+COMOBJECTS= jcomapi.o jutils.o jerror.o jmemmgr.o $(SYSDEPMEM) jsimdcpu.o \
+        $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.o jcapistd.o jctrans.o jcparam.o jdatadst.o jcinit.o \
         jcmaster.o jcmarker.o jcmainct.o jcprepct.o jccoefct.o jccolor.o \
-        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jfdctfst.o jfdctflt.o \
-        jfdctint.o
+        jcsample.o jchuff.o jcphuff.o jcdctmgr.o jccolmmx.o jccolss2.o \
+        jcsammmx.o jcsamss2.o jcqntint.o jcqntflt.o jcqntmmx.o jcqnt3dn.o \
+        jcqnts2i.o jcqntsse.o jcqnts2f.o jfdctint.o jfdctfst.o jfdctflt.o \
+        jfmmxint.o jfmmxfst.o jf3dnflt.o jfss2int.o jfss2fst.o jfsseflt.o
 # decompression library object files
 DLIBOBJECTS= jdapimin.o jdapistd.o jdtrans.o jdatasrc.o jdmaster.o \
         jdinput.o jdmarker.o jdhuff.o jdphuff.o jdmainct.o jdcoefct.o \
-        jdpostct.o jddctmgr.o jidctfst.o jidctflt.o jidctint.o jidctred.o \
-        jdsample.o jdcolor.o jquant1.o jquant2.o jdmerge.o
+        jdpostct.o jddctmgr.o jdsample.o jdcolor.o jquant1.o jquant2.o \
+        jdmerge.o jidctint.o jidctfst.o jidctred.o jidctflt.o jimmxint.o \
+        jimmxfst.o jimmxred.o ji3dnflt.o jiss2int.o jiss2fst.o jiss2red.o \
+        jisseflt.o jiss2flt.o jdsammmx.o jdsamss2.o jdcolmmx.o jdcolss2.o \
+        jdmermmx.o jdmerss2.o
 # These objectfiles are included in libjpeg.a
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -139,7 +157,7 @@
 
 clean:
 	$(RM) *.o cjpeg djpeg jpegtran libjpeg.a rdjpgcom wrjpgcom
-	$(RM) ansi2knr core testout*
+	$(RM) jsimdcfg.inc ansi2knr core testout*
 
 test: cjpeg djpeg jpegtran
 	$(RM) testout*
@@ -157,10 +175,63 @@
 	cmp testorig.jpg testoutt.jpg
 
 
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) $(LDFLAGS) -o makecfg ./makecfg.c $(LDLIBS)
+	./makecfg > jsimdcfg.inc || { $(RM) jsimdcfg.inc ./makecfg; exit 1; }
+	$(RM) ./makecfg
+.SUFFIXES: .asm	# register .asm so that make accepts the suffix rule below
+.asm.o:
+	$(NASM) $(NAFLAGS) -o $@ $*.asm
+
+jsimdcpu.o: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.o: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.o: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.o: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.o: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.o: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.o: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.o: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.o: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.o: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.o: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.o: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.o: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.o: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.o: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.o: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.o: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.o: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.o: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.o: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.o: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.o: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.o: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.o: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.o: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.o: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.o: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.o: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.o: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.o: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.o: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.o: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.o: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.o: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.o: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.o: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.o: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.o: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.o: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.o: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.o: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.o: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.o: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.o: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.o: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.o: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.o: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.o: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.o: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.o: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -171,33 +242,33 @@
 jcparam.o: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.o: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.o: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.o: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.o: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.o: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.o: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.o: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.o: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.o: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.o: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.o: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.o: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.o: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.o: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.o: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.o: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.o: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.o: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.o: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.o: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.o: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.o: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.o: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.o: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.o: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.o: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.o: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.o: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.o: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.o: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.o: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.o: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
diff --git a/makefile.vc b/makefile.vc
index 2acf069..7cfeda3 100644
--- a/makefile.vc
+++ b/makefile.vc
@@ -1,32 +1,50 @@
 # Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
 
 # This makefile is for Microsoft Visual C++ on Windows NT (and 95?).
 # It builds the IJG library as a statically linkable library (.LIB),
 # and builds the sample applications as console-mode apps.
-# Thanks to Xingong Chang, Raymond Everly and others.
 
 # Read installation instructions before saying "nmake" !!
-# To build an optimized library without debug info, say "nmake nodebug=1".
 
-# Pull in standard variable definitions
-!include <win32.mak>
+# The name of your C compiler:
+CC= cl
+LD= link
 
 # You may want to adjust these compiler options:
-CFLAGS= $(cflags) $(cdebug) $(cvars) -I.
+!ifdef crtdll
+# (DLL version of CRT; used when the macro crtdll is defined, e.g. "nmake crtdll=1")
+CFLAGS= -nologo -c -MD -W3 -O2 -GF -Gy -DNDEBUG -I.
+!else
+# (Single threaded static CRT; used when crtdll is not defined)
+CFLAGS= -nologo -c -ML -W3 -O2 -GF -Gy -DNDEBUG -I.
+!endif
+
 # Generally, we recommend defining any configuration symbols in jconfig.h,
 # NOT via -D switches here.
 
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
 # Link-time options:
-LDFLAGS= $(ldebug) $(conlflags)
+LDFLAGS= -nologo -release -subsystem:console,4.0 -opt:nowin98
 
 # To link any special libraries, add the necessary commands here.
-LDLIBS= $(conlibs)
+LDLIBS= 
 
 # Put here the object file name for the correct system-dependent memory
 # manager file.  For NT we suggest jmemnobs.obj, which expects the OS to
 # provide adequate virtual memory.
 SYSDEPMEM= jmemnobs.obj
 
+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+
 # miscellaneous OS-dependent stuff
 # file deletion command
 RM= del
@@ -72,18 +90,26 @@
 DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
         $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
 # library object files common to compression and decompression
-COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM)
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
 # compression library object files
 CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
         jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
         jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
-        jcdctmgr.obj jfdctfst.obj jfdctflt.obj jfdctint.obj
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
 # decompression library object files
 DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
         jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
-        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jidctfst.obj \
-        jidctflt.obj jidctint.obj jidctred.obj jdsample.obj jdcolor.obj \
-        jquant1.obj jquant2.obj jdmerge.obj
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
 # These objectfiles are included in libjpeg.lib
 LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
 # object files for sample applications (excluding library files)
@@ -94,38 +120,46 @@
 TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
 
 # Template command for compiling .c to .obj
-.c.obj:
-	$(cc) $(CFLAGS) $*.c
+.c.obj::
+	$(CC) $(CFLAGS) $<
 
 
 all: libjpeg.lib cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
 
 libjpeg.lib: $(LIBOBJECTS)
-	$(RM) libjpeg.lib
+	-$(RM) libjpeg.lib
 	lib -out:libjpeg.lib  $(LIBOBJECTS)
 
 cjpeg.exe: $(COBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) libjpeg.lib $(LDLIBS)
 
 djpeg.exe: $(DOBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) libjpeg.lib $(LDLIBS)
 
 jpegtran.exe: $(TROBJECTS) libjpeg.lib
-	$(link) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) libjpeg.lib $(LDLIBS)
 
 rdjpgcom.exe: rdjpgcom.obj
-	$(link) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
 
 wrjpgcom.exe: wrjpgcom.obj
-	$(link) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+	$(LD) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
 
 
 clean:
-	$(RM) *.obj *.exe libjpeg.lib
-	$(RM) testout*
+	-$(RM) *.obj
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) libjpeg.lib
+	-if exist *.manifest $(RM) *.manifest
+	-if exist testout*   $(RM) testout*
 
 test: cjpeg.exe djpeg.exe jpegtran.exe
-	$(RM) testout*
+	-if exist testout* $(RM) testout*
 	.\djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
 	.\djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
 	.\cjpeg -dct int -outfile testout.jpg  testimg.ppm
@@ -140,10 +174,66 @@
 	fc /b testorig.jpg testoutt.jpg
 
 
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) makecfg.c
+	$(LD) $(LDFLAGS) -out:makecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.obj
+	$(RM) makecfg.exe
+	if exist makecfg.exe.manifest $(RM) makecfg.exe.manifest
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
 jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
@@ -154,33 +244,33 @@
 jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
 jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
 jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
 jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
-jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
 jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
-jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
-jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
 jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
 jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
diff --git a/makefile.vcdll b/makefile.vcdll
new file mode 100644
index 0000000..cd715eb
--- /dev/null
+++ b/makefile.vcdll
@@ -0,0 +1,311 @@
+# Makefile for Independent JPEG Group's software
+# Modified for x86 SIMD extension
+
+# This makefile is for Microsoft Visual C++ 6.0.
+# It builds the IJG library as a dynamically linkable library (.DLL),
+# and builds the sample applications which are linked against the DLL.
+
+# Read installation instructions before saying "nmake" !!
+
+# The name of your C compiler:
+CC= cl
+LD= link
+RC= rc
+
+# You may want to adjust these compiler options:
+#  You have to use a DLL version of C Run-Time library for both
+#  the JPEG DLL and any applications linked to the JPEG DLL.
+CFLAGS= -nologo -c -MD -W3 -O2 -GF -Gy -DNDEBUG -I.
+
+# Generally, we recommend defining any configuration symbols in jconfig.h,
+# NOT via -D switches here.
+
+# The executable name of NASM and its options:
+NASM= nasmw
+NAFLAGS= $(NASM_OBJFMT) -I./
+# object file format specifier for NASM
+# see jsimdext.inc for more details.
+NASM_OBJFMT= -fwin32 -DWIN32
+
+# Link-time options:
+LDFLAGS= -nologo -release -subsystem:console,4.0 -opt:nowin98
+LDFLAGS_DLL= -nologo -release -dll -opt:nowin98
+
+# To link any special libraries, add the necessary commands here.
+LDLIBS= 
+
+# DLL to build
+DLLNAME = jpeg62.dll
+# import library
+LIBNAME = jpeg62.lib
+
+# Put here the object file name for the correct system-dependent memory
+# manager file.  For NT we suggest jmemnobs.obj, which expects the OS to
+# provide adequate virtual memory.
+SYSDEPMEM= jmemnobs.obj
+
+# OS-dependent SIMD instruction support checker
+# jsimdw32.obj (Win32) / jsimddjg.obj (DJGPP V.2) / jsimdgcc.obj (Unix/gcc)
+SYSDEPSIMDCHK= jsimdw32.obj
+
+# miscellaneous OS-dependent stuff
+# file deletion command
+RM= del
+
+# End of configurable options.
+
+
+# source files: JPEG library proper
+LIBSOURCES= jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c \
+        jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c \
+        jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c \
+        jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
+        jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
+        jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
+        jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+        jquant2.c jutils.c jmemmgr.c
+# memmgr back ends: compile only one of these into a working library
+SYSDEPSOURCES= jmemansi.c jmemname.c jmemnobs.c jmemdos.c jmemmac.c
+# source files: cjpeg/djpeg/jpegtran applications, also rdjpgcom/wrjpgcom
+APPSOURCES= cjpeg.c djpeg.c jpegtran.c rdjpgcom.c wrjpgcom.c cdjpeg.c \
+        rdcolmap.c rdswitch.c transupp.c rdppm.c wrppm.c rdgif.c wrgif.c \
+        rdtarga.c wrtarga.c rdbmp.c wrbmp.c rdrle.c wrrle.c
+SOURCES= $(LIBSOURCES) $(SYSDEPSOURCES) $(APPSOURCES)
+# files included by source files
+INCLUDES= jchuff.h jdhuff.h jdct.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
+        jpegint.h jpeglib.h jversion.h cdjpeg.h cderror.h transupp.h
+# documentation, test, and support files
+DOCS= README install.doc usage.doc cjpeg.1 djpeg.1 jpegtran.1 rdjpgcom.1 \
+        wrjpgcom.1 wizard.doc example.c libjpeg.doc structure.doc \
+        coderules.doc filelist.doc change.log
+MKFILES= configure makefile.cfg makefile.ansi makefile.unix makefile.bcc \
+        makefile.mc6 makefile.dj makefile.wat makefile.vc makelib.ds \
+        makeapps.ds makeproj.mac makcjpeg.st makdjpeg.st makljpeg.st \
+        maktjpeg.st makefile.manx makefile.sas makefile.mms makefile.vms \
+        makvms.opt
+CONFIGFILES= jconfig.cfg jconfig.bcc jconfig.mc6 jconfig.dj jconfig.wat \
+        jconfig.vc jconfig.mac jconfig.st jconfig.manx jconfig.sas \
+        jconfig.vms
+CONFIGUREFILES= config.guess config.sub install-sh ltconfig ltmain.sh
+OTHERFILES= jconfig.doc ckconfig.c ansi2knr.c ansi2knr.1 jmemdosa.asm
+TESTFILES= testorig.jpg testimg.ppm testimg.bmp testimg.jpg testprog.jpg \
+        testimgp.jpg
+DISTFILES= $(DOCS) $(MKFILES) $(CONFIGFILES) $(SOURCES) $(INCLUDES) \
+        $(CONFIGUREFILES) $(OTHERFILES) $(TESTFILES)
+# library object files common to compression and decompression
+COMOBJECTS= jcomapi.obj jutils.obj jerror.obj jmemmgr.obj $(SYSDEPMEM) \
+        jsimdcpu.obj $(SYSDEPSIMDCHK)
+# compression library object files
+CLIBOBJECTS= jcapimin.obj jcapistd.obj jctrans.obj jcparam.obj jdatadst.obj \
+        jcinit.obj jcmaster.obj jcmarker.obj jcmainct.obj jcprepct.obj \
+        jccoefct.obj jccolor.obj jcsample.obj jchuff.obj jcphuff.obj \
+        jcdctmgr.obj jccolmmx.obj jccolss2.obj jcsammmx.obj jcsamss2.obj \
+        jcqntint.obj jcqntflt.obj jcqntmmx.obj jcqnt3dn.obj jcqnts2i.obj \
+        jcqntsse.obj jcqnts2f.obj jfdctint.obj jfdctfst.obj jfdctflt.obj \
+        jfmmxint.obj jfmmxfst.obj jf3dnflt.obj jfss2int.obj jfss2fst.obj \
+        jfsseflt.obj
+# decompression library object files
+DLIBOBJECTS= jdapimin.obj jdapistd.obj jdtrans.obj jdatasrc.obj \
+        jdmaster.obj jdinput.obj jdmarker.obj jdhuff.obj jdphuff.obj \
+        jdmainct.obj jdcoefct.obj jdpostct.obj jddctmgr.obj jdsample.obj \
+        jdcolor.obj jquant1.obj jquant2.obj jdmerge.obj jidctint.obj \
+        jidctfst.obj jidctred.obj jidctflt.obj jimmxint.obj jimmxfst.obj \
+        jimmxred.obj ji3dnflt.obj jiss2int.obj jiss2fst.obj jiss2red.obj \
+        jisseflt.obj jiss2flt.obj jdsammmx.obj jdsamss2.obj jdcolmmx.obj \
+        jdcolss2.obj jdmermmx.obj jdmerss2.obj
+# These object files are linked into the DLL ($(DLLNAME))
+LIBOBJECTS= $(CLIBOBJECTS) $(DLIBOBJECTS) $(COMOBJECTS)
+# object files for sample applications (excluding library files)
+COBJECTS= cjpeg.obj rdppm.obj rdgif.obj rdtarga.obj rdrle.obj rdbmp.obj \
+        rdswitch.obj cdjpeg.obj
+DOBJECTS= djpeg.obj wrppm.obj wrgif.obj wrtarga.obj wrrle.obj wrbmp.obj \
+        rdcolmap.obj cdjpeg.obj
+TROBJECTS= jpegtran.obj rdswitch.obj cdjpeg.obj transupp.obj
+
+# Template command for compiling .c to .obj
+.c.obj::
+	$(CC) $(CFLAGS) $<
+
+
+all: $(DLLNAME) cjpeg.exe djpeg.exe jpegtran.exe rdjpgcom.exe wrjpgcom.exe
+
+$(LIBNAME): $(DLLNAME)
+$(DLLNAME): $(LIBOBJECTS) jpegdll.res jpegdll.def
+	$(LD) $(LDFLAGS_DLL) -out:$(DLLNAME) -implib:$(LIBNAME) \
+		$(LIBOBJECTS) jpegdll.res -def:jpegdll.def
+
+jpegdll.res: jpegdll.rc
+	$(RC) -fo $@ $*.rc
+
+cjpeg.exe: $(COBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:cjpeg.exe $(COBJECTS) $(LIBNAME) $(LDLIBS)
+
+djpeg.exe: $(DOBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:djpeg.exe $(DOBJECTS) $(LIBNAME) $(LDLIBS)
+
+jpegtran.exe: $(TROBJECTS) $(LIBNAME)
+	$(LD) $(LDFLAGS) -out:jpegtran.exe $(TROBJECTS) $(LIBNAME) $(LDLIBS)
+
+rdjpgcom.exe: rdjpgcom.obj
+	$(LD) $(LDFLAGS) -out:rdjpgcom.exe rdjpgcom.obj $(LDLIBS)
+
+wrjpgcom.exe: wrjpgcom.obj
+	$(LD) $(LDFLAGS) -out:wrjpgcom.exe wrjpgcom.obj $(LDLIBS)
+
+
+clean:
+	-$(RM) *.obj
+	-$(RM) cjpeg.exe
+	-$(RM) djpeg.exe
+	-$(RM) jpegtran.exe
+	-$(RM) rdjpgcom.exe
+	-$(RM) wrjpgcom.exe
+	-$(RM) jsimdcfg.inc
+	-$(RM) jpegdll.res
+	-$(RM) $(DLLNAME)
+	-$(RM) $(DLLNAME:.dll=.exp)
+	-$(RM) $(LIBNAME)
+	-if exist *.manifest $(RM) *.manifest
+	-if exist testout*   $(RM) testout*
+
+test: cjpeg.exe djpeg.exe jpegtran.exe
+	-if exist testout* $(RM) testout*
+	.\djpeg -dct int -ppm -outfile testout.ppm  testorig.jpg
+	.\djpeg -dct int -bmp -colors 256 -outfile testout.bmp  testorig.jpg
+	.\cjpeg -dct int -outfile testout.jpg  testimg.ppm
+	.\djpeg -dct int -ppm -outfile testoutp.ppm testprog.jpg
+	.\cjpeg -dct int -progressive -opt -outfile testoutp.jpg testimg.ppm
+	.\jpegtran -outfile testoutt.jpg testprog.jpg
+	fc /b testimg.ppm testout.ppm
+	fc /b testimg.bmp testout.bmp
+	fc /b testimg.jpg testout.jpg
+	fc /b testimg.ppm testoutp.ppm
+	fc /b testimgp.jpg testoutp.jpg
+	fc /b testorig.jpg testoutt.jpg
+
+
+jsimdcfg.inc: makecfg.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+	$(CC) $(CFLAGS) makecfg.c
+	$(LD) $(LDFLAGS) -out:makecfg.exe makecfg.obj $(LDLIBS)
+	.\makecfg.exe > jsimdcfg.inc
+	$(RM) makecfg.obj
+	$(RM) makecfg.exe
+	if exist makecfg.exe.manifest $(RM) makecfg.exe.manifest
+
+.asm.obj:
+	$(NASM) $(NAFLAGS) -o $@ $<
+
+jsimdcpu.obj: jsimdcpu.asm jsimdcfg.inc jsimdext.inc
+jsimdw32.obj: jsimdw32.asm jsimdcfg.inc jsimdext.inc
+jsimddjg.obj: jsimddjg.asm jsimdcfg.inc jsimdext.inc
+jccolmmx.obj: jccolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jccolss2.obj: jccolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsammmx.obj: jcsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcsamss2.obj: jcsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolmmx.obj: jdcolmmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdcolss2.obj: jdcolss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmermmx.obj: jdmermmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdmerss2.obj: jdmerss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsammmx.obj: jdsammmx.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jdsamss2.obj: jdsamss2.asm jsimdcfg.inc jsimdext.inc jcolsamp.inc
+jcqntint.obj: jcqntint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntflt.obj: jcqntflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntmmx.obj: jcqntmmx.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnt3dn.obj: jcqnt3dn.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2i.obj: jcqnts2i.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqntsse.obj: jcqntsse.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jcqnts2f.obj: jcqnts2f.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctint.obj: jfdctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctfst.obj: jfdctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfdctflt.obj: jfdctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxint.obj: jfmmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfmmxfst.obj: jfmmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jf3dnflt.obj: jf3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2int.obj: jfss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfss2fst.obj: jfss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jfsseflt.obj: jfsseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctint.obj: jidctint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctfst.obj: jidctfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctred.obj: jidctred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jidctflt.obj: jidctflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxint.obj: jimmxint.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxfst.obj: jimmxfst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jimmxred.obj: jimmxred.asm jsimdcfg.inc jsimdext.inc jdct.inc
+ji3dnflt.obj: ji3dnflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2int.obj: jiss2int.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2fst.obj: jiss2fst.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2red.obj: jiss2red.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jisseflt.obj: jisseflt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+jiss2flt.obj: jiss2flt.asm jsimdcfg.inc jsimdext.inc jdct.inc
+
+jsimdgcc.obj: jsimdgcc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+
+jcapimin.obj: jcapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcapistd.obj: jcapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccoefct.obj: jccoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jccolor.obj: jccolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jcdctmgr.obj: jcdctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jchuff.obj: jchuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcinit.obj: jcinit.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmainct.obj: jcmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmarker.obj: jcmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcmaster.obj: jcmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcomapi.obj: jcomapi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcparam.obj: jcparam.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcphuff.obj: jcphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jchuff.h
+jcprepct.obj: jcprepct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jcsample.obj: jcsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jctrans.obj: jctrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapimin.obj: jdapimin.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdapistd.obj: jdapistd.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdatadst.obj: jdatadst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdatasrc.obj: jdatasrc.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h
+jdcoefct.obj: jdcoefct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdcolor.obj: jdcolor.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jddctmgr.obj: jddctmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jdhuff.obj: jdhuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdinput.obj: jdinput.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmainct.obj: jdmainct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmarker.obj: jdmarker.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmaster.obj: jdmaster.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdmerge.obj: jdmerge.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdphuff.obj: jdphuff.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdhuff.h
+jdpostct.obj: jdpostct.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jdsample.obj: jdsample.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jcolsamp.h
+jdtrans.obj: jdtrans.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jerror.obj: jerror.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jversion.h jerror.h
+# jfdctflt.obj: jfdctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctfst.obj: jfdctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jfdctint.obj: jfdctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctflt.obj: jidctflt.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctfst.obj: jidctfst.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctint.obj: jidctint.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+# jidctred.obj: jidctred.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jdct.h
+jquant1.obj: jquant1.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jquant2.obj: jquant2.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jutils.obj: jutils.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h
+jmemmgr.obj: jmemmgr.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemansi.obj: jmemansi.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemname.obj: jmemname.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemnobs.obj: jmemnobs.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemdos.obj: jmemdos.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+jmemmac.obj: jmemmac.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h jmemsys.h
+cjpeg.obj: cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+djpeg.obj: djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h
+jpegtran.obj: jpegtran.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h transupp.h jversion.h
+rdjpgcom.obj: rdjpgcom.c jinclude.h jconfig.h
+wrjpgcom.obj: wrjpgcom.c jinclude.h jconfig.h
+cdjpeg.obj: cdjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdcolmap.obj: rdcolmap.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdswitch.obj: rdswitch.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+transupp.obj: transupp.c jinclude.h jconfig.h jpeglib.h jmorecfg.h jpegint.h jerror.h transupp.h
+rdppm.obj: rdppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrppm.obj: wrppm.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdgif.obj: rdgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrgif.obj: wrgif.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdtarga.obj: rdtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrtarga.obj: wrtarga.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdbmp.obj: rdbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrbmp.obj: wrbmp.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+rdrle.obj: rdrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
+wrrle.obj: wrrle.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h
diff --git a/nasm_lt.sh b/nasm_lt.sh
new file mode 100644
index 0000000..ef5a591
--- /dev/null
+++ b/nasm_lt.sh
@@ -0,0 +1,57 @@
+#! /bin/sh
+command=""	# command line handed to NASM, built up one argument at a time
+infile=""	# last *.asm argument seen; used to derive a default -o name
+o_opt=no	# becomes "yes" once the caller supplies its own -o option
+pic=no		# becomes "yes" once -DPIC has been emitted (emit it only once)
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -DPIC|-fPIC|-fpic)
+            if [ "$pic" != "yes" ] ; then
+                command="$command -DPIC"
+                pic=yes
+            fi
+            ;;
+        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf32|-felf64|-fas86|-fobj| \
+        -fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho32|-fmacho64)
+            # it's a file format specifier for nasm.
+            command="$command $1"
+            ;;
+        -f*)
+            # maybe a code-generation flag for gcc; NASM must not see it.
+            ;;
+        -[Ii]*)
+            incdir=`echo "$1" | sed 's/^-[Ii]//'`
+            if [ "x$incdir" = x -a "x$2" != x ] ; then
+                case "$2" in
+                    -*) ;;
+                    *) incdir="$2"; shift;;
+                esac
+            fi
+            if [ "x$incdir" != x ] ; then
+                # In the case of NASM, the trailing slash is necessary.
+                incdir=`echo "$incdir" | sed 's%/*$%/%'`
+                command="$command -I$incdir"
+            fi
+            ;;
+        -o*)
+            o_opt=yes
+            command="$command $1"
+            ;;
+        *.asm)
+            infile="$1"
+            command="$command $1"
+            ;;
+        *)
+            command="$command $1"
+            ;;
+    esac
+    shift
+done
+if [ "$o_opt" != yes ] ; then
+    # NASM defaults to writing beside the input; use ./<basename>.o instead.
+    outbase=`echo "$infile" | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`
+    outfile="-o $outbase.o"
+    command="$command $outfile"
+fi
+echo $command
+exec $command
diff --git a/rdbmp.c b/rdbmp.c
index b05fe2a..2245847 100644
--- a/rdbmp.c
+++ b/rdbmp.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 19, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains routines to read input images in Microsoft "BMP"
  * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
  * Currently, only 8-bit and 24-bit images are supported, not 1-bit or
@@ -187,11 +194,14 @@
 preload_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 {
   bmp_source_ptr source = (bmp_source_ptr) sinfo;
+#if (BITS_IN_JSAMPLE != 8) || defined(NEED_FAR_POINTERS)
   register FILE *infile = source->pub.input_file;
   register int c;
   register JSAMPROW out_ptr;
+  JDIMENSION col;
+#endif
+  JDIMENSION row;
   JSAMPARRAY image_ptr;
-  JDIMENSION row, col;
   cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
 
   /* Read the data into a virtual array in input-file row order. */
@@ -204,6 +214,10 @@
     image_ptr = (*cinfo->mem->access_virt_sarray)
       ((j_common_ptr) cinfo, source->whole_image,
        row, (JDIMENSION) 1, TRUE);
+#if (BITS_IN_JSAMPLE == 8) && !defined(NEED_FAR_POINTERS)
+    if (! ReadOK(source->pub.input_file, image_ptr[0], source->row_width))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+#else
     out_ptr = image_ptr[0];
     for (col = source->row_width; col > 0; col--) {
       /* inline copy of read_byte() for speed */
@@ -211,6 +225,7 @@
 	ERREXIT(cinfo, JERR_INPUT_EOF);
       *out_ptr++ = (JSAMPLE) c;
     }
+#endif
   }
   if (progress != NULL)
     progress->completed_extra_passes++;
diff --git a/rdgif.c b/rdgif.c
index b27c167..0da2515 100644
--- a/rdgif.c
+++ b/rdgif.c
@@ -1,19 +1,39 @@
 /*
  * rdgif.c
  *
- * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 1991-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ **************************************************************************
+ * WARNING: You will need an LZW patent license from Unisys in order to   *
+ * use this file legally in any commercial or shareware application.      *
+ **************************************************************************
+ *
  * This file contains routines to read input images in GIF format.
  *
- *****************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,      *
- * the ability to read GIF files has been removed from the IJG distribution. *
- * Sorry about that.                                                         *
- *****************************************************************************
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; input_init may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed GIF format).
+ */
+
+/*
+ * This code is loosely based on giftoppm from the PBMPLUS distribution
+ * of Feb. 1991.  That file contains the following copyright notice:
+ * +-------------------------------------------------------------------+
+ * | Copyright 1990, David Koblas.                                     |
+ * |   Permission to use, copy, modify, and distribute this software   |
+ * |   and its documentation for any purpose and without fee is hereby |
+ * |   granted, provided that the above copyright notice appear in all |
+ * |   copies and that both that copyright notice and this permission  |
+ * |   notice appear in supporting documentation.  This software is    |
+ * |   provided "as is" without express or implied warranty.           |
+ * +-------------------------------------------------------------------+
  *
- * We are required to state that
+ * We are also required to state that
  *    "The Graphics Interchange Format(c) is the Copyright property of
  *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
  *    CompuServe Incorporated."
@@ -23,6 +43,622 @@
 
 #ifdef GIF_SUPPORTED
 
+
+#define	MAXCOLORMAPSIZE	256	/* max # of colors in a GIF colormap */
+#define NUMCOLORS	3	/* # of color components (see CM_* below) */
+#define CM_RED		0	/* color component numbers */
+#define CM_GREEN	1
+#define CM_BLUE		2
+
+#define	MAX_LZW_BITS	12	/* maximum LZW code size */
+#define LZW_TABLE_SIZE	(1<<MAX_LZW_BITS) /* # of possible LZW symbols */
+
+/* Macros for extracting header data --- note we assume chars may be signed */
+/* LM_to_uint: a supplies the low-order byte, b the high-order byte */
+#define LM_to_uint(a,b)		((((b)&0xFF) << 8) | ((a)&0xFF))
+
+#define BitSet(byte, bit)	((byte) & (bit))
+#define INTERLACE	0x40	/* mask for bit signifying interlaced image */
+#define COLORMAPFLAG	0x80	/* mask for bit signifying colormap presence */
+
+#define	ReadOK(file,buffer,len)	(JFREAD(file,buffer,len) == ((size_t) (len)))
+
+/* LZW decompression tables look like this:
+ *   symbol_head[K] = prefix symbol of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ *   symbol_tail[K] = suffix byte   of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ * Note that entries 0..end_code of the above tables are not used,
+ * since those symbols represent raw bytes or special codes.
+ *
+ * The stack represents the not-yet-used expansion of the last LZW symbol.
+ * In the worst case, a symbol could expand to as many bytes as there are
+ * LZW symbols, so we allocate LZW_TABLE_SIZE bytes for the stack.
+ * (This is conservative since that number includes the raw-byte symbols.)
+ *
+ * The tables are allocated from FAR heap space since they would use up
+ * rather a lot of the near data space in a PC.
+ */
+
+
+/* Private version of data source object */
+
+typedef struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;		/* back link saves passing separate parm */
+
+  JSAMPARRAY colormap;		/* GIF colormap (converted to my format) */
+
+  /* State for GetCode and LZWReadByte */
+  char code_buf[256+4];		/* 2 carried-over bytes + current data block */
+  int last_byte;		/* # of bytes in code_buf (incl. the 2 carried) */
+  int last_bit;			/* # of bits in code_buf (last_byte * 8) */
+  int cur_bit;			/* next bit index to read */
+  boolean out_of_blocks;	/* TRUE if hit terminator data block */
+
+  int input_code_size;		/* codesize given in GIF file */
+  int clear_code,end_code;	/* values for Clear and End codes */
+
+  int code_size;		/* current actual code size */
+  int limit_code;		/* 2^code_size */
+  int max_code;			/* first unused code value */
+  boolean first_time;		/* flags first call to LZWReadByte */
+
+  /* Private state for LZWReadByte */
+  int oldcode;			/* previous LZW symbol */
+  int firstcode;		/* first byte of oldcode's expansion */
+
+  /* LZW symbol table and expansion stack */
+  UINT16 FAR *symbol_head;	/* => table of prefix symbols */
+  UINT8  FAR *symbol_tail;	/* => table of suffix bytes */
+  UINT8  FAR *symbol_stack;	/* => stack for symbol expansions */
+  UINT8  FAR *sp;		/* next free stack slot; == symbol_stack if empty */
+
+  /* State for interlaced image processing */
+  boolean is_interlaced;	/* TRUE if have interlaced image */
+  jvirt_sarray_ptr interlaced_image; /* full image in interlaced order */
+  JDIMENSION cur_row_number;	/* need to know actual row number */
+  JDIMENSION pass2_offset;	/* # of pixel rows in pass 1 */
+  JDIMENSION pass3_offset;	/* # of pixel rows in passes 1&2 */
+  JDIMENSION pass4_offset;	/* # of pixel rows in passes 1,2,3 */
+} gif_source_struct;
+
+typedef gif_source_struct * gif_source_ptr;
+
+
+/* Forward declarations */
+METHODDEF(JDIMENSION) get_pixel_rows
+	JPP((j_compress_ptr cinfo, cjpeg_source_ptr sinfo));
+METHODDEF(JDIMENSION) load_interlaced_image
+	JPP((j_compress_ptr cinfo, cjpeg_source_ptr sinfo));
+METHODDEF(JDIMENSION) get_interlaced_row
+	JPP((j_compress_ptr cinfo, cjpeg_source_ptr sinfo));
+
+
+LOCAL(int)
+ReadByte (gif_source_ptr sinfo)
+/* Return the next byte of the GIF file; hitting EOF is reported via ERREXIT */
+{
+  int ch;
+
+  ch = getc(sinfo->pub.input_file);
+  if (ch == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return ch;
+}
+
+
+LOCAL(int)
+GetDataBlock (gif_source_ptr sinfo, char *buf)
+/* Load one GIF data block into buf and return the value of its count byte. */
+/* A return of zero means the block-sequence terminator was read. */
+{
+  int nbytes = ReadByte(sinfo);	/* leading count byte of the block */
+
+  /* An empty block carries no payload, so there is nothing more to read. */
+  if (nbytes <= 0)
+    return nbytes;
+  if (! ReadOK(sinfo->pub.input_file, buf, nbytes))
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return nbytes;
+}
+
+
+LOCAL(void)
+SkipDataBlocks (gif_source_ptr sinfo)
+/* Read and discard data blocks up to and including the terminator block */
+{
+  char scratch[256];		/* discarded block contents */
+  int len;
+
+  do { len = GetDataBlock(sinfo, scratch); } while (len > 0);
+}
+
+
+LOCAL(void)
+ReInitLZW (gif_source_ptr sinfo)
+/* Put the LZW decoder back in its post-Clear state (also used at startup) */
+{
+  sinfo->sp = sinfo->symbol_stack;		/* expansion stack is now empty */
+  sinfo->max_code = 2 + sinfo->clear_code;	/* first unused code value */
+  sinfo->limit_code = sinfo->clear_code << 1;	/* equals 2^code_size */
+  sinfo->code_size = 1 + sinfo->input_code_size;
+}
+
+
+LOCAL(void)
+InitLZWCode (gif_source_ptr sinfo)
+/* Prepare decoder state before the first LZWReadByte/GetCode call */
+{
+  int clear = 1 << sinfo->input_code_size;
+
+  /* LZWReadByte side: the special code values do not change later */
+  sinfo->first_time = TRUE;
+  sinfo->clear_code = clear;
+  sinfo->end_code = clear + 1;
+  ReInitLZW(sinfo);		/* needs clear_code to be set already */
+
+  /* GetCode side: empty buffer, so the first call must load a block */
+  sinfo->out_of_blocks = FALSE;
+  sinfo->cur_bit = sinfo->last_bit = 0;
+  sinfo->last_byte = 2;		/* make safe to "recopy last two bytes" */
+}
+
+
+LOCAL(int)
+GetCode (gif_source_ptr sinfo)
+/* Fetch the next code_size bits from the GIF data; if the data blocks run */
+/* out, warn and return end_code.  We assume code_size is less than 16. */
+{
+  register INT32 accum;
+  int offs, ret, count;
+
+  while ( (sinfo->cur_bit + sinfo->code_size) > sinfo->last_bit) {
+    /* Time to reload the buffer */
+    if (sinfo->out_of_blocks) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;	/* fake something useful */
+    }
+    /* preserve last two bytes of what we have -- assume code_size <= 16 */
+    sinfo->code_buf[0] = sinfo->code_buf[sinfo->last_byte-2];
+    sinfo->code_buf[1] = sinfo->code_buf[sinfo->last_byte-1];
+    /* Load more bytes; set flag if we reach the terminator block */
+    if ((count = GetDataBlock(sinfo, &sinfo->code_buf[2])) == 0) {
+      sinfo->out_of_blocks = TRUE;
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;	/* fake something useful */
+    }
+    /* Reset counters: the two preserved bytes now occupy bits 0..15 */
+    sinfo->cur_bit = (sinfo->cur_bit - sinfo->last_bit) + 16;
+    sinfo->last_byte = 2 + count;
+    sinfo->last_bit = sinfo->last_byte * 8;
+  }
+
+  /* Form up next 24 bits in accum; code_buf[offs] is the low-order byte */
+  offs = sinfo->cur_bit >> 3;	/* byte containing cur_bit */
+#ifdef CHAR_IS_UNSIGNED
+  accum = sinfo->code_buf[offs+2];
+  accum <<= 8;
+  accum |= sinfo->code_buf[offs+1];
+  accum <<= 8;
+  accum |= sinfo->code_buf[offs];
+#else /* chars may be signed: mask each byte to 8 bits */
+  accum = sinfo->code_buf[offs+2] & 0xFF;
+  accum <<= 8;
+  accum |= sinfo->code_buf[offs+1] & 0xFF;
+  accum <<= 8;
+  accum |= sinfo->code_buf[offs] & 0xFF;
+#endif
+
+  /* Right-align cur_bit in accum, then mask off desired number of bits */
+  accum >>= (sinfo->cur_bit & 7);
+  ret = ((int) accum) & ((1 << sinfo->code_size) - 1);
+  
+  sinfo->cur_bit += sinfo->code_size;
+  return ret;
+}
+
+
+LOCAL(int)
+LZWReadByte (gif_source_ptr sinfo)
+/* Read an LZW-compressed byte: returns the next decoded data byte */
+{
+  register int code;		/* current working code */
+  int incode;			/* saves actual input code */
+
+  /* First time, just eat the expected Clear code(s) and return next code, */
+  /* which is expected to be a raw byte. */
+  if (sinfo->first_time) {
+    sinfo->first_time = FALSE;
+    code = sinfo->clear_code;	/* enables sharing code with Clear case */
+  } else {
+
+    /* If any codes are stacked from a previously read symbol, return them */
+    if (sinfo->sp > sinfo->symbol_stack)
+      return (int) *(-- sinfo->sp);
+
+    /* Time to read a new symbol */
+    code = GetCode(sinfo);
+
+  }
+
+  if (code == sinfo->clear_code) {
+    /* Reinit state, swallow any extra Clear codes, and */
+    /* return next code, which is expected to be a raw byte. */
+    ReInitLZW(sinfo);
+    do {
+      code = GetCode(sinfo);
+    } while (code == sinfo->clear_code);
+    if (code > sinfo->clear_code) { /* make sure it is a raw byte */
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      code = 0;			/* use something valid */
+    }
+    /* make firstcode, oldcode valid! */
+    sinfo->firstcode = sinfo->oldcode = code;
+    return code;
+  }
+
+  if (code == sinfo->end_code) {
+    /* Skip the rest of the image, unless GetCode already read terminator */
+    if (! sinfo->out_of_blocks) {
+      SkipDataBlocks(sinfo);
+      sinfo->out_of_blocks = TRUE;
+    }
+    /* Complain that there's not enough data */
+    WARNMS(sinfo->cinfo, JWRN_GIF_ENDCODE);
+    /* Pad data with 0's */
+    return 0;			/* fake something usable */
+  }
+
+  /* Got normal raw byte or LZW symbol */
+  incode = code;		/* save: stored into oldcode before returning */
+  
+  if (code >= sinfo->max_code) { /* special case for not-yet-defined symbol */
+    /* code == max_code is OK; anything bigger is bad data */
+    if (code > sinfo->max_code) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      incode = 0;		/* prevent creation of loops in symbol table */
+    }
+    /* this symbol will be defined as oldcode/firstcode */
+    *(sinfo->sp++) = (UINT8) sinfo->firstcode;
+    code = sinfo->oldcode;	/* expand the previous symbol below */
+  }
+
+  /* If it's a symbol, expand it into the stack */
+  while (code >= sinfo->clear_code) {
+    *(sinfo->sp++) = sinfo->symbol_tail[code]; /* tail is a byte value */
+    code = sinfo->symbol_head[code]; /* head is another LZW symbol */
+  }
+  /* At this point code just represents a raw byte */
+  sinfo->firstcode = code;	/* save for possible future use */
+
+  /* If there's room in table, */
+  if ((code = sinfo->max_code) < LZW_TABLE_SIZE) {
+    /* Define a new symbol = prev sym + head of this sym's expansion */
+    sinfo->symbol_head[code] = sinfo->oldcode;
+    sinfo->symbol_tail[code] = (UINT8) sinfo->firstcode;
+    sinfo->max_code++;
+    /* Is it time to increase code_size? */
+    if ((sinfo->max_code >= sinfo->limit_code) &&
+	(sinfo->code_size < MAX_LZW_BITS)) {
+      sinfo->code_size++;
+      sinfo->limit_code <<= 1;	/* keep equal to 2^code_size */
+    }
+  }
+  
+  sinfo->oldcode = incode;	/* save last input symbol for future use */
+  return sinfo->firstcode;	/* return first byte of symbol's expansion */
+}
+
+
+LOCAL(void)
+ReadColorMap (gif_source_ptr sinfo, int cmaplen, JSAMPARRAY cmap)
+/* Fill cmap with cmaplen R,G,B triples fetched one byte at a time */
+{
+#if BITS_IN_JSAMPLE == 8
+#define UPSCALE(x)  (x)
+#else
+#define UPSCALE(x)  ((x) << (BITS_IN_JSAMPLE-8))
+#endif
+  int entry;			/* colormap slot currently being filled */
+
+  for (entry = 0; entry < cmaplen; entry++) {
+    cmap[CM_RED][entry]   = (JSAMPLE) UPSCALE(ReadByte(sinfo));
+    cmap[CM_GREEN][entry] = (JSAMPLE) UPSCALE(ReadByte(sinfo));
+    cmap[CM_BLUE][entry]  = (JSAMPLE) UPSCALE(ReadByte(sinfo));
+  }
+}
+
+
+LOCAL(void)
+DoExtension (gif_source_ptr sinfo)
+/* Consume one extension block */
+/* No extension is interpreted here: each one is traced, then discarded */
+{
+  /* The byte after the introducer is the extension label */
+  int label = ReadByte(sinfo);
+
+  /* Mention it at trace level 1 */
+  TRACEMS1(sinfo->cinfo, 1, JTRC_GIF_EXTENSION, label);
+  /* Skip the data block(s) associated with the extension */
+  SkipDataBlocks(sinfo);
+}
+
+
+/*
+ * Read the GIF headers up to the first image; report its size to cinfo.
+ */
+
+METHODDEF(void)
+start_input_gif (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr) sinfo;
+  char hdrbuf[10];		/* workspace for reading control blocks */
+  unsigned int width, height;	/* image dimensions */
+  int colormaplen, aspectRatio;
+  int c;			/* byte that introduces the next block */
+
+  /* Allocate space to store the colormap */
+  source->colormap = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE,
+     (JDIMENSION) MAXCOLORMAPSIZE, (JDIMENSION) NUMCOLORS);
+
+  /* Read and verify GIF Header */
+  if (! ReadOK(source->pub.input_file, hdrbuf, 6))
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  if (hdrbuf[0] != 'G' || hdrbuf[1] != 'I' || hdrbuf[2] != 'F')
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  /* Check for expected version numbers.
+   * If unknown version, give warning and try to process anyway;
+   * this is per recommendation in GIF89a standard.
+   */
+  if ((hdrbuf[3] != '8' || hdrbuf[4] != '7' || hdrbuf[5] != 'a') &&
+      (hdrbuf[3] != '8' || hdrbuf[4] != '9' || hdrbuf[5] != 'a'))
+    TRACEMS3(cinfo, 1, JTRC_GIF_BADVERSION, hdrbuf[3], hdrbuf[4], hdrbuf[5]);
+
+  /* Read and decipher Logical Screen Descriptor */
+  if (! ReadOK(source->pub.input_file, hdrbuf, 7))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  width = LM_to_uint(hdrbuf[0],hdrbuf[1]);
+  height = LM_to_uint(hdrbuf[2],hdrbuf[3]);
+  colormaplen = 2 << (hdrbuf[4] & 0x07);
+  /* we ignore the color resolution, sort flag, and background color index */
+  aspectRatio = hdrbuf[6] & 0xFF;
+  if (aspectRatio != 0 && aspectRatio != 49)
+    TRACEMS(cinfo, 1, JTRC_GIF_NONSQUARE);
+
+  /* Read global colormap if header indicates it is present */
+  if (BitSet(hdrbuf[4], COLORMAPFLAG))
+    ReadColorMap(source, colormaplen, source->colormap);
+
+  /* Scan until we reach start of desired image.
+   * We don't currently support skipping images, but could add it easily.
+   */
+  for (;;) {
+    c = ReadByte(source);
+
+    if (c == ';')		/* GIF terminator?? */
+      ERREXIT(cinfo, JERR_GIF_IMAGENOTFOUND);
+
+    if (c == '!') {		/* Extension */
+      DoExtension(source);
+      continue;
+    }
+    
+    if (c != ',') {		/* Not an image separator? */
+      WARNMS1(cinfo, JWRN_GIF_CHAR, c);
+      continue;
+    }
+
+    /* Read and decipher Local Image Descriptor */
+    if (! ReadOK(source->pub.input_file, hdrbuf, 9))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    /* we ignore top/left position info, also sort flag */
+    width = LM_to_uint(hdrbuf[4],hdrbuf[5]);
+    height = LM_to_uint(hdrbuf[6],hdrbuf[7]);
+    source->is_interlaced = BitSet(hdrbuf[8], INTERLACE);
+
+    /* Read local colormap if header indicates it is present */
+    /* Note: if we wanted to support skipping images, */
+    /* we'd need to skip rather than read colormap for ignored images */
+    if (BitSet(hdrbuf[8], COLORMAPFLAG)) {
+      colormaplen = 2 << (hdrbuf[8] & 0x07);
+      ReadColorMap(source, colormaplen, source->colormap);
+    }
+
+    /* At most 256 colormap entries are ever read and raw codes index the */
+    /* colormap directly, so a code size above 8 bits is rejected as bogus. */
+    source->input_code_size = ReadByte(source); /* get min-code-size byte */
+    if (source->input_code_size < 2 || source->input_code_size > 8)
+      ERREXIT1(cinfo, JERR_GIF_CODESIZE, source->input_code_size);
+
+    /* Reached desired image; to skip it, SkipDataBlocks and continue instead */
+    break;
+  }
+
+  /* Prepare to read selected image: first initialize LZW decompressor */
+  source->symbol_head = (UINT16 FAR *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				LZW_TABLE_SIZE * SIZEOF(UINT16));
+  source->symbol_tail = (UINT8 FAR *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				LZW_TABLE_SIZE * SIZEOF(UINT8));
+  source->symbol_stack = (UINT8 FAR *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				LZW_TABLE_SIZE * SIZEOF(UINT8));
+  InitLZWCode(source);
+
+  /*
+   * If image is interlaced, we read it into a full-size sample array,
+   * decompressing as we go; then get_interlaced_row selects rows from the
+   * sample array in the proper order.
+   */
+  if (source->is_interlaced) {
+    /* We request the virtual array now, but can't access it until virtual
+     * arrays have been allocated.  Hence, the actual work of reading the
+     * image is postponed until the first call to get_pixel_rows.
+     */
+    source->interlaced_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION) width, (JDIMENSION) height, (JDIMENSION) 1);
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+    source->pub.get_pixel_rows = load_interlaced_image;
+  } else {
+    source->pub.get_pixel_rows = get_pixel_rows;
+  }
+
+  /* Create compressor input buffer: one row of NUMCOLORS samples per pixel */
+  source->pub.buffer = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr) cinfo, JPOOL_IMAGE,
+     (JDIMENSION) width * NUMCOLORS, (JDIMENSION) 1);
+  source->pub.buffer_height = 1;
+
+  /* Return info about the image. */
+  cinfo->in_color_space = JCS_RGB;
+  cinfo->input_components = NUMCOLORS;
+  cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+
+  TRACEMS3(cinfo, 1, JTRC_GIF, width, height, colormaplen);
+}
+
+
+/*
+ * Read one row of pixels.
+ * Noninterlaced case: each pixel is decoded from the GIF data on demand
+ * and expanded through the colormap into the output row buffer.
+ */
+
+METHODDEF(JDIMENSION)
+get_pixel_rows (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr) sinfo;
+  JSAMPARRAY cmap = source->colormap;
+  JSAMPROW outp = source->pub.buffer[0];
+  JDIMENSION remaining = cinfo->image_width; /* pixels still to produce */
+  int pixel;			/* colormap index of the current pixel */
+
+  while (remaining > 0) {
+    pixel = LZWReadByte(source);
+    *outp++ = cmap[CM_RED][pixel];
+    *outp++ = cmap[CM_GREEN][pixel];
+    *outp++ = cmap[CM_BLUE][pixel];
+    remaining--;
+  }
+  return 1;			/* one row is delivered per call */
+}
+
+
+/*
+ * Read one row of pixels.
+ * First-call variant for interlaced files: the complete image is decoded
+ * into the virtual array, then get_interlaced_row takes over from here on.
+ */
+
+METHODDEF(JDIMENSION)
+load_interlaced_image (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr) sinfo;
+  cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
+  JSAMPARRAY rowbuf;
+  JSAMPROW outp;
+  JDIMENSION row, remaining;
+
+  /* Decode every row, storing them in the order the file supplies them. */
+  row = 0;
+  while (row < cinfo->image_height) {
+    if (progress != NULL) {	/* report rows done so far out of the total */
+      progress->pub.pass_counter = (long) row;
+      progress->pub.pass_limit = (long) cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr) cinfo);
+    }
+    rowbuf = (*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr) cinfo, source->interlaced_image,
+       row, (JDIMENSION) 1, TRUE);
+    outp = rowbuf[0];
+    for (remaining = cinfo->image_width; remaining > 0; remaining--)
+      *outp++ = (JSAMPLE) LZWReadByte(source);
+    row++;
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Later calls must bypass this routine and go straight to the row reader. */
+  source->pub.get_pixel_rows = get_interlaced_row;
+  /* Set up the row reader's state, then let it serve this very call. */
+  source->cur_row_number = 0;
+  source->pass2_offset = (cinfo->image_height + 7) / 8;
+  source->pass3_offset = source->pass2_offset + (cinfo->image_height + 3) / 8;
+  source->pass4_offset = source->pass3_offset + (cinfo->image_height + 1) / 4;
+
+  return get_interlaced_row(cinfo, sinfo);
+}
+
+
+/*
+ * Read one row of pixels.
+ * Interlaced case: the image was stored in the virtual array in file order,
+ * so locate the stored row for the next output row and colormap-expand it.
+ */
+
+METHODDEF(JDIMENSION)
+get_interlaced_row (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr) sinfo;
+  JSAMPARRAY cmap = source->colormap;
+  JSAMPARRAY rowbuf;
+  JSAMPROW inp, outp;
+  JDIMENSION stored_row;	/* row's index within the virtual array */
+  JDIMENSION remaining;
+  int phase, pixel;
+
+  /* The low three bits of the output row number select the interlace pass; */
+  /* the remaining bits, plus that pass's offset, give the stored position. */
+  phase = (int) (source->cur_row_number & 7);
+  if (phase == 0) {		/* first-pass row */
+    stored_row = source->cur_row_number >> 3;
+  } else if (phase == 4) {	/* second-pass row */
+    stored_row = (source->cur_row_number >> 3) + source->pass2_offset;
+  } else if (phase == 2 || phase == 6) { /* third-pass row */
+    stored_row = (source->cur_row_number >> 2) + source->pass3_offset;
+  } else {			/* fourth-pass row (odd row numbers) */
+    stored_row = (source->cur_row_number >> 1) + source->pass4_offset;
+  }
+
+  /* Fetch that single row; nothing is written to it here. */
+  rowbuf = (*cinfo->mem->access_virt_sarray)
+    ((j_common_ptr) cinfo, source->interlaced_image,
+     stored_row, (JDIMENSION) 1, FALSE);
+  /* Walk the stored indexes, writing an R,G,B triple for each one. */
+  inp = rowbuf[0];
+  outp = source->pub.buffer[0];
+  remaining = cinfo->image_width;
+  while (remaining > 0) {
+    pixel = GETJSAMPLE(*inp++);
+    *outp++ = cmap[CM_RED][pixel];
+    *outp++ = cmap[CM_GREEN][pixel];
+    *outp++ = cmap[CM_BLUE][pixel];
+    remaining--;
+  }
+  source->cur_row_number++;	/* for next time */
+  return 1;
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_gif (j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work: the body is intentionally empty and neither argument is used */
+}
+
+
 /*
  * The module selection routine for GIF format input.
  */
@@ -30,9 +666,18 @@
 GLOBAL(cjpeg_source_ptr)
 jinit_read_gif (j_compress_ptr cinfo)
 {
-  fprintf(stderr, "GIF input is unsupported for legal reasons.  Sorry.\n");
-  exit(EXIT_FAILURE);
-  return NULL;			/* keep compiler happy */
+  gif_source_ptr source;	/* the new reader object handed back to caller */
+
+  /* Create module interface object (allocated from JPOOL_IMAGE) */
+  source = (gif_source_ptr)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				  SIZEOF(gif_source_struct));
+  source->cinfo = cinfo;	/* make back link for subroutines */
+  /* Fill in method ptrs; start_input_gif itself installs get_pixel_rows */
+  source->pub.start_input = start_input_gif;
+  source->pub.finish_input = finish_input_gif;
+
+  return (cjpeg_source_ptr) source;
 }
 
 #endif /* GIF_SUPPORTED */
diff --git a/simd_README.ja.txt b/simd_README.ja.txt
new file mode 100644
index 0000000..bc10b63
--- /dev/null
+++ b/simd_README.ja.txt
@@ -0,0 +1,145 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == README ==
+-----------------------------------------------------------
+
+    ** Note **
+The accompanying documents related to x86 SIMD extension are written in
+Japanese. The English version of these documents is currently unavailable.
+I apologize for this inconvenience to international programmers.
+
+Most of the source code of the extension part is written in assembly
+language. To compile the source, you need NASM (netwide assembler).
+NASM is available from http://nasm.sourceforge.net/ or
+http://sourceforge.net/project/showfiles.php?group_id=6208 .
+
+At present, the x86 SIMD extension doesn't support 64-bit mode of
+AMD64 (x86_64).
+
+The x86 SIMD extension is an unofficial extension to the IJG JPEG
+software. Please do not send any questions about this distribution
+to the Independent JPEG Group.
+
+For conditions of distribution and use, see the IJG's README file.
+The same conditions apply to this SIMD-extended JPEG software.
+
+
+
+¢£¤³¤Î¥½¥Õ¥È¤Ï
+
+  JPEG ¤Î¥µ¥Ý¡¼¥È¥é¥¤¥Ö¥é¥ê¤È¤·¤Æ¹­¤¯»È¤ï¤ì¤Æ¤¤¤ë Independent JPEG Group's
+  JPEG library (libjpeg ¥é¥¤¥Ö¥é¥ê) ¤Ë¡¢Intel x86 ·Ï CPU ¤Î»ý¤Ä SIMD Ì¿Îá¤ò
+  ÍøÍѤ·¤¿¥³¡¼¥É(¥ë¡¼¥Á¥ó)¤ò¿·¤¿¤ËÄɲä·¡¢¹â®²½²þ¤¤·¤¿¤â¤Î¤Ç¤¹¡£
+  MMX ¤ä SSE ¤Ê¤É¤Î SIMD ±é»»µ¡Ç½¤òÁõÈ÷¤·¤Æ¤¤¤ë¥×¥í¥»¥Ã¥µ¾å¤ÇÆ°ºî¤µ¤»¤ë¤È¡¢
+  ¥ª¥ê¥¸¥Ê¥ëÈǤΠlibjpeg ¥é¥¤¥Ö¥é¥ê¤ÈÈæ³Ó¤·¤Æ 2¡Á3 ÇÜÄøÅ٤ήÅÙ¤ÇÆ°ºî¤·¤Þ¤¹¡£
+  ¤Þ¤¿¡¢SIMD ²½¤Ë°Í¤é¤Ê¤¤¹â®²½²þ¤¤â¤¤¤¯¤Ä¤«»Ü¤µ¤ì¤Æ¤ª¤ê¡¢SIMD ±é»»¤Î»È¤¨
+  ¤Ê¤¤µì·¿CPU¤Ë¤ª¤¤¤Æ¤â¡¢¥ª¥ê¥¸¥Ê¥ëÈǤÈÈæ³Ó¤·¤Æ½½¿ô¡óÄøÅٹ⮤ËÆ°ºî¤·¤Þ¤¹¡£
+
+  JPEG °µ½Ì¡¿Å¸³«½èÍý¤Î¹â®²½¤òÌÜŪ¤È¤·¤Æ¤¤¤Þ¤¹¤¬¡¢Æ°ºî®ÅÙºÇÍ¥Àè¤Ç¤Ï¤Ê¤¯¡¢
+  ¥ª¥ê¥¸¥Ê¥ëÈǤÈƱÅù°Ê¾å¤Î·×»»ÀºÅÙ¤ò»ý¤Ä¤³¤È¤òºÇÍ¥Àè¤Ë¹Í¤¨¤¿¥³¡¼¥É¤òºÎÍÑ
+  ¤·¤Æ¤¤¤Þ¤¹¡£¼ÂºÝ¡¢DCT±é»»¤ËÉâÆ°¾®¿ôÅÀDCT¤ò»È¤Ã¤¿¾ì¹ç¡¢¤ª¤è¤Ó¡¢¤ä¤äÆüì¤Ê
+  ¥µ¥ó¥×¥ê¥ó¥°Èæ(h1v2)¤ò»ý¤ÄJPEG¥Õ¥¡¥¤¥ë¤òŸ³«¤¹¤ë¾ì¹ç¤ò½ü¤¤¤Æ¤Ï¡¢
+  ¥ª¥ê¥¸¥Ê¥ëÈǤȣ±¥Ó¥Ã¥È¤â°ã¤ï¤Ê¤¤·ë²Ì¤ò½Ð¤·¤Þ¤¹¡£¾åµ­¤Î£²¤Ä¤ÎÎã³°¤Î¾ì¹ç¤â
+  ¥ª¥ê¥¸¥Ê¥ëÈǤè¤ê¤Ï¹â²è¼Á²½(¹âÀºÅÙ²½)¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  SIMD Âбþ²½¤ËºÝ¤·¤Æ¤Ï¡¢²Äǽ¤Ê¸Â¤ê¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠlibjpeg ¥é¥¤¥Ö¥é¥ê¤È¤Î
+  ¸ß´¹À­¤¬¼º¤ï¤ì¤Ê¤¤¤è¤¦¤Ë¹Í褵¤ì¤Æ¤¤¤Þ¤¹¤Î¤Ç¡¢¤Û¤È¤ó¤É¤Î¾ì¹ç¡¢¥ª¥ê¥¸¥Ê¥ë
+  ÈǤò¤½¤Î¤Þ¤ÞÃÖ¤­´¹¤¨¤ë¤³¤È¤¬²Äǽ¤Ç¤¹¡£Æäˡ¢¶¦Í­¥é¥¤¥Ö¥é¥ê¤Ë´Ø¤·¤Æ¸À¤¨¤Ð¡¢
+  °ìÉô¤ÎÎã³°(cygwin ¤Î¾ì¹ç)¤ò½ü¤­¡¢¤½¤ì¤Ï¥ª¥ê¥¸¥Ê¥ëÈǤȥХ¤¥Ê¥ê¥ì¥Ù¥ë¤Ç¤Î
+  ¾å°Ì¸ß´¹À­¤¬¤¢¤ê¤Þ¤¹¤Î¤Ç¡¢¤½¤Î¤Þ¤Þ¥ª¥ê¥¸¥Ê¥ëÈǤòÃÖ¤­´¹¤¨¤ë¤³¤È¤¬¤Ç¤­¤Þ¤¹¡£
+
+  SIMD Âбþ²½¤µ¤ì¤Æ¤¤¤ëÉôʬ¤Ï¡¢°Ê²¼¤Î¤È¤ª¤ê¡§
+
+  °µ½Ì½èÍý¡§
+    ¿§¶õ´ÖÊÑ´¹(RGB->YCbCr)  : MMX or SSE2
+    ¥À¥¦¥ó¥µ¥ó¥×¥ê¥ó¥°      : MMX or SSE2
+    DCT½çÊÑ´¹(¹âÀºÅÙÀ°¿ô)   : MMX or SSE2
+    DCT½çÊÑ´¹(¹â®À°¿ô)     : MMX or SSE2
+    DCT½çÊÑ´¹(ÉâÆ°¾®¿ô)     : 3DNow! or SSE (À°¿ô±é»»Éô: MMX or SSE2)
+    DCT·¸¿ôÎ̻Ҳ½(À°¿ô)     : MMX or SSE2
+    DCT·¸¿ôÎ̻Ҳ½(ÉâÆ°¾®¿ô) : 3DNow! or SSE (À°¿ô±é»»Éô: MMX or SSE2)
+
+  Ÿ³«½èÍý¡§
+    ¿§¶õ´ÖÊÑ´¹(YCbCr->RGB)  : MMX or SSE2
+    ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°      : MMX or SSE2
+    DCTµÕÊÑ´¹(¹âÀºÅÙÀ°¿ô)   : MMX or SSE2
+    DCTµÕÊÑ´¹(¹â®À°¿ô)     : MMX or SSE2
+    DCTµÕÊÑ´¹(ÉâÆ°¾®¿ô)     : 3DNow! or SSE (À°¿ô±é»»Éô: MMX or SSE2)
+    DCTµÕÊÑ´¹(½Ì¾®Å¸³«)     : MMX or SSE2
+
+  Ãí¡ËSSE2 ¤Ë¤Ä¤¤¤Æ¤Ï¡¢SIMD À°¿ô±é»»¤Î¤ß¤òÍøÍѤ·¤Æ¤¤¤Þ¤¹¡£SIMD ÇÜÀºÅÙ
+      ÉâÆ°¾®¿ôÅÀ±é»»¤ÏÍøÍѤ·¤Æ¤¤¤Þ¤»¤ó¡£¤Þ¤¿¡¢SSE3 ¤Ï»ÈÍѤµ¤ì¤Æ¤¤¤Þ¤»¤ó¡£
+      ¤³¤Î JPEG ¥é¥¤¥Ö¥é¥ê¤Ë¤ª¤¤¤Æ¤Ï¡¢SSE3 ¤ò»ÈÍѤ·¤Æ¤âÆ°ºî®ÅÙ¸þ¾å¤Î
+      ¸«¹þ¤ß¤¬¤Û¤È¤ó¤É¤Ê¤¤¤¿¤á¤Ç¡¢SSE3 ¤ò¥µ¥Ý¡¼¥È¤¹¤ëͽÄê¤Ï¤¢¤ê¤Þ¤»¤ó¡£
+
+  ¤³¤Î¤Û¤«¤Ë¡¢¥¢¥»¥ó¥Ö¥ê¸À¸ìÈÇDCT¥ë¡¼¥Á¥ó(ÈóSIMD; ½çÊÑ´¹£³¼ï¡¿µÕÊÑ´¹£´¼ï)
+  ¤Ë¤è¤ê¡¢SIMDÌ¿Îá¤Î»È¤¨¤Ê¤¤µì·¿CPU¤Ç¤â½½¿ô¡óÄøÅ٤ι⮲½¤¬´üÂԤǤ­¤Þ¤¹¡£
+  ¤µ¤é¤Ë¡¢Å¸³«½èÍý¤Ç¤Î¥Ï¥Õ¥Þ¥ó¥Ç¥³¡¼¥É¥ë¡¼¥Á¥ó¤Ï¡¢SIMD ²½¤Ë°Í¤é¤Ê¤¤ÊýË¡¤Ç
+  ¹â®²½²þ¤¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+
+¢£Âбþ¤·¤Æ¤¤¤ë¥×¥é¥Ã¥È¥Õ¥©¡¼¥à
+
+  Intel x86 CPU ¤Ë¸ÇÍ­¤Îµ¡Ç½¤òÍøÍѤ·¤Æ¤¤¤Þ¤¹¤Î¤Ç¡¢¥ª¥ê¥¸¥Ê¥ëÈǤȤϰۤʤꡢ
+  Intel x86 CPU ¤ª¤è¤Ó¤½¤Î¸ß´¹ CPU ¤òºÎÍѤ·¤Æ¤¤¤ë¥·¥¹¥Æ¥à¤Ë¸Â¤é¤ì¤Þ¤¹¡£
+  PowerPC ¤Ê¤É¤Î Intel x86 ·Ï°Ê³°¤Î¥·¥¹¥Æ¥à¤Ë¤ÏÂбþ¤·¤Æ¤¤¤Þ¤»¤ó¡£
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ¶ñÂÎŪ¤Ë¤Ï¡¢80386 °Ê¹ß¤Î Intel x86 CPU ¤ª¤è¤Ó¤½¤Î¸ß´¹ CPU ¤òºÎÍѤ·¤Æ¤¤¤ë
+  ¥Ï¡¼¥É¥¦¥§¥¢¤Ç¡¢¤«¤Ä¡¢32bit¥Õ¥é¥Ã¥È¥¢¥É¥ì¥¹¥â¡¼¥É(Êݸî¥â¡¼¥É)¤ò»ÈÍѤ·¤Æ
+  ¤¤¤ë¥×¥é¥Ã¥È¥Õ¥©¡¼¥à(OS)¤¬ÂоݤǤ¹¡£¤³¤ì¤Ë¤Ï¡¢Win32 (Windows 9x·Ï/NT·Ï)
+  ¤ä³Æ¼ï PC-UNIX (linux ¤ä xBSD ¥Õ¥¡¥ß¥ê¤Ê¤É) ¤Ê¤É¤¬³ºÅö¤·¤Þ¤¹¡£¤Ê¤ª¡¢
+  AMD64 (EM64T) ¤Î64bit¥â¡¼¥É´Ä¶­¤Ë¤ÏÂбþ¤·¤Æ¤¤¤Þ¤»¤ó¡£¤´Ãí°Õ¤¯¤À¤µ¤¤¡£
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+¢£¤³¤Î SIMD ³ÈÄ¥ÈÇ IJG JPEG library ¸ÇÍ­¤ÎÀ©¸Â
+
+  ¥ª¥ê¥¸¥Ê¥ëÈǤΠIJG JPEG library ¤Ç¤Ï¡¢¥³¥ó¥Ñ¥¤¥ë»þ¤Î¥ª¥×¥·¥ç¥ó¤Ç¡¢
+  8bitÀºÅÙJPEG ¤È 12bitÀºÅÙJPEG ¤ÎξÊý¤ËÂбþ¤·¤Þ¤¹¤¬¡¢¤³¤Î SIMD ³ÈÄ¥ÈǤÏ
+  8bitÀºÅÙJPEG ¤Î¤ß¤ÎÂбþ¤Ç¡¢12bitÀºÅÙJPEG ¤Ë¤ÏÂбþ¤·¤Þ¤»¤ó¡£¤È¤Ï¤¤¤¨¡¢
+  12bitÀºÅÙJPEG ¤Ï°åÎÅÍѤʤɤÎÆüìʬÌî¤ò½ü¤¤¤ÆËؤɻȤï¤ì¤Æ¤¤¤Ê¤¤¤Î¤Ç¡¢
+  ÌäÂê¤Ï¾¯¤Ê¤¤¤È»×¤¤¤Þ¤¹¡£
+
+
+¢£»È¤¤Êý
+
+  ¥Þ¥Ë¥å¥¢¥ë¤Ï¡¢°Ê²¼¤Î¥Õ¥¡¥¤¥ë¤Ëʬ¤«¤ì¤Æ¤¤¤Þ¤¹¤Î¤Ç¡¢¼ÂºÝ¤Î»È¤¤Êý¤Ê¤É¤Ë
+  ¤Ä¤¤¤Æ¤Ï¡¢¤½¤Á¤é¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+    simd_README.ja.txt   - ¤³¤Î¥Õ¥¡¥¤¥ë
+    simd_filelist.ja.txt - ¼ýÏ¿¥Õ¥¡¥¤¥ë¤Î¥Õ¥¡¥¤¥ë¥ê¥¹¥È
+    simd_install.ja.txt  - ¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý
+    simd_internal.ja.txt - SIMD ³ÈÄ¥Éôʬ¤Î¾ÜºÙ
+    simd_cdjpeg.ja.txt   - SIMD ÈÇ cjpeg/djpeg ¤Ë¸ÇÍ­¤Îµ¡Ç½¤Î²òÀâ
+    simd_changes.ja.txt  - SIMD ³ÈÄ¥Éôʬ¤Î²þÈÇÍúÎò
+
+
+¢£»ÈÍѾò·ï¡¦¥µ¥Ý¡¼¥È
+
+  ¤³¤Î SIMD ³ÈÄ¥ÈÇ IJG JPEG software ¤Î»ÈÍѾò·ï¤Ë¤Ä¤¤¤Æ¤Ï¡¢¥ª¥ê¥¸¥Ê¥ëÈǤÎ
+  IJG JPEG software ¤Î»ÈÍѾò·ï¤¬Å¬ÍѤµ¤ì¤Þ¤¹¡£¾Ü¤·¤¯¤Ï¡¢Æ±º­¤Î README
+  ¥Õ¥¡¥¤¥ë(±Ñʸ)¤Î LEGAL ISSUES ¤Î¹à¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+  ¾åµ­¤Î»ÈÍѾò·ï¤ÎÆâÍƤ竤êÊÖ¤·¤Ë¤Ê¤ê¤Þ¤¹¤¬¡¢¤³¤Î¥½¥Õ¥È¥¦¥§¥¢¤Ï¡Ö¸½¾õ¤Î
+  ¤Þ¤Þ¤Ç¡×Ä󶡤µ¤ì¤Æ¤¤¤ë¤â¤Î¤Ç¡¢¾¦¶ÈŪ¤Ê»ÈÍѲÄǽÀ­¡¢¤ª¤è¤ÓÆÃÄê¤ÎÌÜŪ¤Ë
+  ÂФ¹¤ëŬ¹çÀ­¤Ê¤É¤â´Þ¤á¡¢¤¤¤«¤Ê¤ëÊݾڤ⤢¤ê¤Þ¤»¤ó¡£
+  ¤Þ¤¿¡¢¸¶ºî¼Ô(The Independent JPEG Group)¤â²þ¤¼Ô(MIYASAKA Masaru)¤â¡¢
+  »öͳ¤Î¤¤¤«¤ó¤òÌä¤ï¤º¡¢ËÜ¥½¥Õ¥È¥¦¥§¥¢¤Î»ÈÍѤˤè¤Ã¤ÆȯÀ¸¤·¤¿Ç¡²¿¤Ê¤ë»³²¤Ë
+  ¤Ä¤¤¤Æ¤â¡¢°ìÀÚ¤½¤ÎÀÕǤ¤òÉé¤ï¤Ê¤¤¤â¤Î¤È¤·¤Þ¤¹¡£
+
+  ¤³¤Î SIMD ³ÈÄ¥ÈÇ IJG JPEG software ¤Ï¡¢¥ª¥ê¥¸¥Ê¥ë³«È¯¸µ¤Î IJG ¤È¤Ï´Ø·¸
+  ¤Ê¤¯¡¢Æȼ«¤Ë³ÈÄ¥¤ò¹Ô¤Ê¤Ã¤¿¤â¤Î¤Ç¤¹¡£¤Ç¤¹¤Î¤Ç¡¢¤³¤Î SIMD ³ÈÄ¥ÈÇ IJG JPEG
+  software ¤Ë´Ø¤¹¤ë¼ÁÌä¤ò¡¢¥ª¥ê¥¸¥Ê¥ë³«È¯¸µ (The Independent JPEG Group)
+  ¤ËÁ÷¤é¤Ê¤¤¤Ç¤¯¤À¤µ¤¤¡£
+
+  ¤³¤Î SIMD ³ÈÄ¥ÈÇ IJG JPEG software ¤Ë´Ø¤·¤Æ¤Ï¡¢¸¶Â§¤È¤·¤Æ¥Î¡¼¥µ¥Ý¡¼¥È¤È
+  ¤µ¤»¤Æ¤¤¤¿¤À¤­¤Þ¤¹¡£¥á¡¼¥ë¤Ê¤É¤Ç¤´¼ÁÌä¤Ê¤É¤ò¤¤¤¿¤À¤­¤Þ¤·¤Æ¤â¡¢¾ï¤Ë²¿¤é¤«
+  ¤ÎÊÖÅú¤¬¤Ç¤­¤ë¤ï¤±¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¤Î¤Ç¡¢¤´¾µÃΤª¤­¤¯¤À¤µ¤¤¡£
+  Æäˡ¢¡Ê¥ª¥ê¥¸¥Ê¥ë¤Î±Ñʸ¥Þ¥Ë¥å¥¢¥ë¤ò´Þ¤á¡ËƱº­¤Î¥Þ¥Ë¥å¥¢¥ëÎà¤Ë²óÅú¤¬
+  ½ñ¤¤¤Æ¤¢¤ë¼ÁÌä¤ä¡¢»ÈÍѼԤΥ½¥Õ¥È¥¦¥§¥¢µ»½Ñ¼Ô¤È¤·¤Æ¤Îµ»ÎÌÉÔ­¡¦·Ð¸³ÉÔ­¤Ë
+  ´Ø¤ï¤ë¼ÁÌä¡¢¼ÁÌä¤ÎÍ×ÎΤòÆÀ¤Ê¤¤¼ÁÌä¤Ê¤É¤Ë¤Ä¤¤¤Æ¤Ï¡¢²óÅú¤ò¤¤¤¿¤·¤Þ¤»¤ó¤Î¤Ç¡¢
+  ¤¢¤·¤«¤é¤º¤´Î»¾µ¤¯¤À¤µ¤¤¡£
+
+
+
+           E-Mail Address : alkaid@coral.ocn.ne.jp (µÜºä ¸­/MIYASAKA Masaru)
+[EOF]
diff --git a/simd_cdjpeg.ja.txt b/simd_cdjpeg.ja.txt
new file mode 100644
index 0000000..941a15a
--- /dev/null
+++ b/simd_cdjpeg.ja.txt
@@ -0,0 +1,75 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == CDJPEG ==
+-----------------------------------------------------------
+
+¢£¤³¤Î¥Õ¥¡¥¤¥ë¤Ï
+
+  ¤³¤Î¥Õ¥¡¥¤¥ë¤Ç¤Ï¡¢SIMD ÈǤΠcjpeg / djpeg ¤Ë¸ÇÍ­¤Îµ¡Ç½¤ò²òÀ⤷¤Þ¤¹¡£
+
+    ¢£ SIMD Æ°ºî¥â¡¼¥É¾ðÊó (-v ¥ª¥×¥·¥ç¥ó)
+    ¢£ ÆÃÄê¤Î SIMD ±é»»¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤¹¤ë (-noXXX ¥ª¥×¥·¥ç¥ó)
+    ¢£ GIF ·Á¼°¤ÎÆɤ߹þ¤ß¡¿½ñ¤­½Ð¤· (djpeg ¤Î -gif ¥ª¥×¥·¥ç¥ó)
+
+
+¢£ SIMD Æ°ºî¥â¡¼¥É¾ðÊó (-v ¥ª¥×¥·¥ç¥ó)
+
+  ¤³¤Î SIMD ÈÇ cjpeg / djpeg ¤Ç¤Ï¡¢-v ¥ª¥×¥·¥ç¥ó¤ò¤Ä¤±¤Æµ¯Æ°¤¹¤ë¤È¡¢°Ê²¼
+  ¤Î¤è¤¦¤Ê SIMD Æ°ºî¥â¡¼¥É¾ðÊ󤬥С¼¥¸¥ç¥ó¾ðÊó¤È¶¦¤Ëɽ¼¨¤µ¤ì¤Þ¤¹¡£
+
+    Independent JPEG Group's DJPEG, version 6b  27-Mar-1998
+    Copyright (C) 1998, Thomas G. Lane
+
+    x86 SIMD extension for IJG JPEG library, version 1.02
+
+    SIMD instructions supported by the system : MMX 3DNow! SSE SSE2
+
+          === SIMD Operation Modes ===
+    Accurate integer DCT  (-dct int)   : SSE2
+    Fast integer DCT      (-dct fast)  : SSE2
+    Floating-point DCT    (-dct float) : SSE
+    Reduced-size DCT      (-scale M/N) : SSE2
+    High-quality upsampling (default)  : SSE2
+    Low-quality upsampling (-nosmooth) : SSE2
+    Colorspace conversion (YCbCr->RGB) : SSE2
+
+  "SIMD instructions supported by the system" ¤Î¹àÌܤÇÎóµó¤µ¤ì¤ë¤Î¤Ï¡¢
+  ¥·¥¹¥Æ¥à(CPU/OS)¤Ç¥µ¥Ý¡¼¥È¤µ¤ì¤Æ¤¤¤ë SIMD ±é»»¤Î¼ïÎà¤Ç¤¹¡£¤Ê¤ª¡¢¤³¤Î
+  ¥½¥Õ¥È¤Ç¤Ï SSE3 ¤Ï»ÈÍѤµ¤ì¤Æ¤¤¤Þ¤»¤ó¤· SSE3 ¤Î¥µ¥Ý¡¼¥È¤Î¸¡½Ð¤â¹Ô¤Ê¤ï¤ì
+  ¤Þ¤»¤ó¤Î¤Ç¡¢SSE3 ¤¬¥µ¥Ý¡¼¥È¤µ¤ì¤Æ¤¤¤Æ¤â¤³¤Î¹àÌܤˤϸ½¤ì¤Þ¤»¤ó¡£
+
+  ¤½¤Î²¼¤Î "SIMD Operation Modes" ¤Ï¡¢³Æ½èÍýÃʳ¬¤Ç»ÈÍѤµ¤ì¤ë SIMD ±é»»¤Î
+  ¼ïÎà¤Ç¤¹¡£Floating-point DCT ¤Ç¤Ï 3DNow! ¤« SSE ¡¢¤½¤ì°Ê³°¤Î¤È¤³¤í¤Ç¤Ï
+  MMX ¤« SSE2 ¤¬ÁªÂò¤µ¤ì¤Þ¤¹¡£°ìÈ̤ˡ¢SSE/SSE2 ¤ÎÊý¤¬ MMX/3DNow! ¤è¤ê¤â
+  ¹âÀ­Ç½¤È¤µ¤ì¤ë¤Î¤Ç¡¢¤³¤ÎÁÐÊý¤¬ÍøÍѲÄǽ¤Ê¾ì¹ç¤Ï SSE/SSE2 ¤¬Í¥ÀèŪ¤ËÁªÂò
+  ¤µ¤ì¤Þ¤¹¡£
+
+
+¢£ ÆÃÄê¤Î SIMD ±é»»¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤¹¤ë (-noXXX ¥ª¥×¥·¥ç¥ó)
+
+  ¤³¤Î SIMD ÈÇ cjpeg / djpeg ¤Ç¤Ï¡¢°Ê²¼¤Î¤è¤¦¤Ê¥ª¥×¥·¥ç¥ó¤ò»ØÄꤹ¤ë¤³¤È¤Ç¡¢
+  ÆÃÄê¤Î SIMD ±é»»¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤Ç¤­¤Þ¤¹¡£
+
+    -nommx         MMX ¤ò»ÈÍѤ·¤Ê¤¤
+    -no3dnow       3DNow! ¤ò»ÈÍѤ·¤Ê¤¤
+    -nosse         SSE ¤ò»ÈÍѤ·¤Ê¤¤
+    -nosse2        SSE2 ¤ò»ÈÍѤ·¤Ê¤¤
+    -nosimd        ¤¹¤Ù¤Æ¤Î SIMD ±é»»¤ò»ÈÍѤ·¤Ê¤¤
+
+  ¤³¤ì¤é¤Î¥ª¥×¥·¥ç¥ó¤Ï¡¢cjpeg/djpeg ¤Î¥³¥Þ¥ó¥É¥é¥¤¥ó¤ÎÀèƬ¤Ë»ØÄꤹ¤ë¤è¤¦
+  ¤Ë¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+
+¢£ GIF ·Á¼°¤ÎÆɤ߹þ¤ß¡¿½ñ¤­½Ð¤· (djpeg ¤Î -gif ¥ª¥×¥·¥ç¥ó)
+
+  ¥ª¥ê¥¸¥Ê¥ëÈÇ cjpeg/djpeg ¤Î version 6b ¤Ç¤Ï¡¢Æõö¾å¤ÎÌäÂ꤫¤é¡¢GIF ·Á¼°
+  ²èÁü¤ÎÆɤ߹þ¤ß¡¿½ñ¤­½Ð¤·¤¬¥µ¥Ý¡¼¥È¤µ¤ì¤Ê¤¯¤Ê¤Ã¤Æ¤¤¤Þ¤·¤¿¡£¤Ç¤¹¤¬¡¢
+  GIF ¤Ë´Ø¤¹¤ëÆõö¤¬ 2003¡Á2004 ǯ¤Ë¤«¤±¤ÆÀ¤³¦Åª¤Ë´ü¸ÂÀÚ¤ì¤Ë¤Ê¤Ã¤¿¤¿¤á¡¢
+  Åö SIMD ÈÇ¤Ç¤Ï GIF ·Á¼°¤ÎÆɤ߹þ¤ß¡¿½ñ¤­½Ð¤·¤òÉü³è¤µ¤»¤Æ¤¢¤ê¤Þ¤¹¡£
+
+  ;Ã̤Ǥ¹¤¬¡¢Åö SIMD ÈÇ¤Ç GIF ·Á¼°¤ÎÆɤ߹þ¤ß¡¿½ñ¤­½Ð¤·¤Ë»ÈÍѤ·¤Æ¤¤¤ë
+  ¥â¥¸¥å¡¼¥ë¤Ï¡¢µìÈǤǤ¢¤ë version 6a ¤Î¤â¤Î¤òήÍѤ·¤Æ¤¤¤Þ¤¹¡£
+
+
+
+[EOF]
diff --git a/simd_changes.ja.txt b/simd_changes.ja.txt
new file mode 100644
index 0000000..b256002
--- /dev/null
+++ b/simd_changes.ja.txt
@@ -0,0 +1,24 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == CHANGES ==
+-----------------------------------------------------------
+
+IJG R6b with x86SIMD V1.02 (2006-02-04)
+---------------------------------------
+* x86 ÈÇ Darwin ¤ËÂбþ¤·¤Þ¤·¤¿¡£Æ°ºî³Îǧ¤Ï Darwin 8.0.1 for x86 ¤Ë¤Æ
+  ¹Ô¤Ê¤¤¤Þ¤·¤¿¡£x86 ÈÇ Mac OS X ¤Ç¤â¡¢Æ°ºî¤¹¤ë¤â¤Î¤È»×¤ï¤ì¤Þ¤¹¡£
+  ¤Þ¤¿¡¢Solaris 10 ¤Ç¤ÎÆ°ºî³Îǧ¤â¹Ô¤Ê¤¤¤Þ¤·¤¿¡£
+
+IJG R6b with x86SIMD V1.01 (2006-01-26)
+---------------------------------------
+* jsimdgcc.c ¤ò»È¤Ã¤¿¤È¤­¡¢NEED_SHORT_EXTERNAL_NAMES ¤¬ÄêµÁ¤µ¤ì¤Æ¤¤¤ë¤È
+  Àµ¾ï¤Ë¥ê¥ó¥¯¤Ç¤­¤Ê¤¤¥Ð¥°¤ò½¤Àµ¤·¤Þ¤·¤¿¡£
+* °µ½Ì¦¤Î¥³¡¼¥É¤Î°ìÉô(jcsammmx.asm, jcsamss2.asm, jcqnt3dn.asm)¤Ë¤ä¤ä
+  ¾éĹ¤Ê²Õ½ê¤¬¤¢¤Ã¤¿¤Î¤Ç¡¢¤³¤ì¤ò½¤Àµ¤·¤Þ¤·¤¿¡£
+
+IJG R6b with x86SIMD V1.0 (2006-01-10)
+--------------------------------------
+* ºÇ½é¤Î¸ø³«ÈÇ¡£
+
+
+[EOF]
diff --git a/simd_filelist.ja.txt b/simd_filelist.ja.txt
new file mode 100644
index 0000000..4bee431
--- /dev/null
+++ b/simd_filelist.ja.txt
@@ -0,0 +1,261 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == FILELIST ==
+-----------------------------------------------------------
+
+¢£¤³¤Î¥Õ¥¡¥¤¥ë¤Ï
+
+  ¤³¤Î¥Õ¥¡¥¤¥ë¤Ç¤Ï¡¢SIMD ÈÇ IJG JPEG software ¤ÎÇÛÉÛ¥¢¡¼¥«¥¤¥Ö¤Ë¼ý¤á¤é¤ì¤Æ
+  ¤¤¤ë³Æ¥Õ¥¡¥¤¥ë¤Î³µÍפò²òÀ⤷¤Þ¤¹¡£¤Ê¤ª¡¢¤³¤³¤Ç¤Ï x86 SIMD extension ¤Ç
+  ¿·¤¿¤ËÄɲ䵤줿¥Õ¥¡¥¤¥ë¤È¡¢x86 SIMD extension ¤Ç²þÊѤ¬²Ã¤¨¤é¤ì¤Æµ¡Ç½¡¿
+  Ìò³ä¤¬¥ª¥ê¥¸¥Ê¥ëÈǤȤϰۤʤë¥Õ¥¡¥¤¥ë¤Î¤ß¤ò²òÀ⤷¤Þ¤¹¡£¤½¤ì°Ê³°¤Î¥Õ¥¡¥¤¥ë
+  ¤Ë¤Ä¤¤¤Æ¤Ï¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠfilelist.doc (±Ñʸ) ¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+    ¢£¥Þ¥Ë¥å¥¢¥ëÎà
+    ¢£configure ¥¹¥¯¥ê¥×¥È¤Ë´Ø·¸¤¹¤ë¥Õ¥¡¥¤¥ë
+    ¢£Microsoft Visual C++ 6.0 ÍÑ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë·² (vc6proj/)
+    ¢£ÆÃÄê¤Î¥³¥ó¥Ñ¥¤¥éÍѤΠjconfig.h ¤È Makefile
+    ¢£¥½¡¼¥¹¥Õ¥¡¥¤¥ë
+      ¡ü x86 SIMD extension ¤Ç¿·¤¿¤ËÄɲ䵤줿¥Õ¥¡¥¤¥ë
+      ¡ü x86 SIMD extension ¤Ç²þÊѤ¬²Ã¤¨¤é¤ì¤¿¥Õ¥¡¥¤¥ë
+    ¢£Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg/djpeg (altui/)
+    ¢£SIMD ÈǤǤϻȤï¤ì¤Ê¤¤¥Õ¥¡¥¤¥ë·² (unused/)
+
+
+¢£¥Þ¥Ë¥å¥¢¥ëÎà
+
+  °Ê²¼¤Î SIMD ÈǤΥޥ˥奢¥ë¤Î¾¤Ë¡¢¥ª¥ê¥¸¥Ê¥ëÈǤαÑʸ¥Þ¥Ë¥å¥¢¥ë¤â
+  ¤½¤Î¤Þ¤Þ¼ýÏ¿¤·¤Æ¤¢¤ê¤Þ¤¹¡£Ê»¤»¤Æ»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+  simd_README.ja.txt    ¼ç¥Þ¥Ë¥å¥¢¥ë(x86 SIMD extension ¤Î³µÍפʤÉ)
+  simd_filelist.ja.txt  SIMD ÈÇ IJG JPEG software ¤Î¥Õ¥¡¥¤¥ë¥ê¥¹¥È
+  simd_install.ja.txt   SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý
+  simd_internal.ja.txt  SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î¡¢SIMD ³ÈÄ¥Éôʬ¤Î¾ÜºÙ
+  simd_cdjpeg.ja.txt    SIMD ÈǤΠcjpeg / djpeg ¤Ë¸ÇÍ­¤Îµ¡Ç½¤Î²òÀâ
+  simd_changes.ja.txt   SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î²þÈÇÍúÎò
+
+
+¢£ configure ¥¹¥¯¥ê¥×¥È¤Ë´Ø·¸¤¹¤ë¥Õ¥¡¥¤¥ë
+
+  UNIX ´Ä¶­¤Ç configure ¥¹¥¯¥ê¥×¥È¤òÁö¤é¤»¤ë¾ì¹ç¤ËɬÍפȤʤë¥Õ¥¡¥¤¥ë·²
+  ¤Ç¤¹¡£Èó UNIX ´Ä¶­¤Ç¤Ï¡¢ºï½ü¤·¤Æ¤â¤«¤Þ¤¤¤Þ¤»¤ó¡£
+
+  configure     configure ¥¹¥¯¥ê¥×¥ÈËÜÂÎ
+  config.ver    configure ¤«¤é¸Æ¤Ó½Ð¤µ¤ì¤ë¥¹¥¯¥ê¥×¥È¤Ç¡¢¶¦Í­¥é¥¤¥Ö¥é¥ê¤Î
+                ¥Ð¡¼¥¸¥ç¥óÈÖ¹æ¤òÄêµÁ¤·¤Æ¤¤¤Þ¤¹
+  ltmain.sh     configure ¤Î¥µ¥Ý¡¼¥È¥¹¥¯¥ê¥×¥È (from GNU libtool)
+  config.guess          ¡·
+  config.sub            ¡·
+  install-sh    install ¥³¥Þ¥ó¥É¤¬¤Ê¤¤¾ì¹ç¤ÎÂåÍÑ¥¹¥¯¥ê¥×¥È
+  nasm_lt.sh    nasm ¤ò GNU libtool ¤Ç»È¤¦¾ì¹ç¤Î¥é¥Ã¥Ñ¡¦¥¹¥¯¥ê¥×¥È
+  jconfig.cfg   configure ¤¬À¸À®¤¹¤ë jconfig.h ¤Î¿÷·¿¥Õ¥¡¥¤¥ë
+  makefile.cfg  configure ¤¬À¸À®¤¹¤ë Makefile ¤Î¿÷·¿¥Õ¥¡¥¤¥ë
+  configure.in  configure ¥¹¥¯¥ê¥×¥È¤Î¥½¡¼¥¹¥Õ¥¡¥¤¥ë (for GNU autoconf)
+  aclocal.m4            ¡·
+  libjpeg.spec  RPM ¤òºÎÍѤ·¤Æ¤¤¤ë linux ¥·¥¹¥Æ¥à¸þ¤±¤Î spec ¥Õ¥¡¥¤¥ë
+
+
+¢£ Microsoft Visual C++ 6.0 ÍÑ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë·² (vc6proj/)
+
+  ¤³¤ì¤é¤Î¥Õ¥¡¥¤¥ë¤ò»È¤¦¾ì¹ç¤Ï¡¢¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤¬¤¢¤ë°ì¤Ä¾å¤Î¥Õ¥©¥ë¥À¤Ë
+  ¤³¤ì¤é¤Î¥Õ¥¡¥¤¥ë¤ò¤¹¤Ù¤Æ°ÜÆ°¤·¤Æ¤¯¤À¤µ¤¤¡£¾Ü¤·¤¯¤Ï¡¢simd_install.ja.txt
+  ¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+  Visual C++ 6.0 °Ê¹ß¤ÎÅý¹ç³«È¯´Ä¶­(DevStudio)¤Î¾ì¹ç¤Ï¡¢¤³¤ì¤é¤Î¥Õ¥¡¥¤¥ë
+  ¤òÊÑ´¹(¥¤¥ó¥Ý¡¼¥È)¤·¤Æ»È¤Ã¤Æ¤¯¤À¤µ¤¤¡£
+
+  vc6proj/libjpeg.dsw   ¥×¥í¥¸¥§¥¯¥È¡¦¥ï¡¼¥¯¥¹¥Ú¡¼¥¹
+  vc6proj/makecfg.dsp   libjpeg.dsp (libjpeg.lib) ¤Î¥Ó¥ë¥É¤ËɬÍפÊ
+                        ÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤òºî¤ë
+  vc6proj/libjpeg.dsp   libjpeg.lib ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/cjpeg.dsp     cjpeg.exe ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/djpeg.dsp     djpeg.exe ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/jpegtran.dsp  jpegtran.exe ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/rdjpgcom.dsp  rdjpgcom.exe ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/wrjpgcom.dsp  wrjpgcom.exe ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+  vc6proj/apptest.dsp   cjpeg, djpeg, jpegtran ¤ÎÆ°ºî¥Æ¥¹¥È(make test)
+  vc6proj/jconfig.h     VC++ ÍѤΠjconfig.h (jconfig.vc ¤ÈƱ¤¸¤â¤Î)
+
+
+¢£ÆÃÄê¤Î¥³¥ó¥Ñ¥¤¥éÍѤΠjconfig.h ¤È Makefile
+
+  ¾Ü¤·¤¯¤Ï¡¢simd_install.ja.txt ¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£¤³¤ì¤é¤Î¥Õ¥¡¥¤¥ë¤Ë¤Ï¡¢
+  SIMD ²½¤Ëȼ¤Ã¤Æ¿·¤¿¤ËÄɲ䵤줿¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤Ë´Ø¤¹¤ëµ­½Ò¤¬Äɲ䵤ì¤Æ
+  ¤¤¤Þ¤¹¤«¤é¡¢¥ª¥ê¥¸¥Ê¥ëÈǤËÉÕ°¤Î jconfig.* ¤È Makefile.* ¤Ï¡¢
+  ¤³¤Î SIMD ÈǤǤϻÈÍѤǤ­¤Þ¤»¤ó¡£
+
+  jconfig.bc5       Borland C++ Compiler 5.5 (win32) ÍѤΠjconfig.h
+  makefile.bc5      Borland C++ Compiler 5.5 (win32) ÍѤΠMakefile
+  jconfig.dj        DJGPP v2.0 ÍѤΠjconfig.h
+  makefile.dj       DJGPP v2.0 ÍѤΠMakefile
+  jconfig.mgw       MinGW ÍѤΠjconfig.h
+  makefile.mgw      MinGW ÍѤΠMakefile (ÀÅŪ JPEG ¥é¥¤¥Ö¥é¥ê¤òºîÀ®)
+  makefile.mgwdll   MinGW ÍѤΠMakefile (DLL ÈÇ JPEG ¥é¥¤¥Ö¥é¥ê¤òºîÀ®)
+  jconfig.vc        VC++ ÍѤΠjconfig.h
+  makefile.vc       VC++ ÍѤΠMakefile (ÀÅŪ JPEG ¥é¥¤¥Ö¥é¥ê¤òºîÀ®)
+  makefile.vcdll    VC++ ÍѤΠMakefile (DLLÈÇ JPEG ¥é¥¤¥Ö¥é¥ê¤òºîÀ®)
+  jconfig.linux     linux ÍѤΠjconfig.h (Ãí¡§configure ¤Î»ÈÍѤò¿ä¾©)
+  makefile.linux    linux ÍѤΠMakefile (Ãí¡§configure ¤Î»ÈÍѤò¿ä¾©)
+
+  °Ê²¼¤Î¥Õ¥¡¥¤¥ë¤Ï¡¢¾åµ­°Ê³°¤Î¥³¥ó¥Ñ¥¤¥é¤ËÂбþ¤¹¤ë jconfig.h ¤È Makefile
+  ¤ò¿·¤¿¤ËºîÀ®¤¹¤ë¾ì¹ç¤Î¿÷·¿¤È¤Ê¤ë¥Õ¥¡¥¤¥ë¤Ç¤¹¡£¥ª¥ê¥¸¥Ê¥ëÈǤˤ⸺ߤ·¤Þ¤¹
+  ¤¬¡¢SIMD ²½¤Ëȼ¤¦Êѹ¹¤¬»Ü¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  ckconfig.c        jconfig.h ¤òÀ¸À®¤¹¤ë¥×¥í¥°¥é¥à
+  makefile.ansi     Makefile ¤Î¿÷·¿¥Õ¥¡¥¤¥ë (ANSI ¥³¥ó¥Ñ¥¤¥éÍÑ)
+  makefile.unix     Makefile ¤Î¿÷·¿¥Õ¥¡¥¤¥ë (Èó ANSI ¥³¥ó¥Ñ¥¤¥éÍÑ)
+
+
+¢£¥½¡¼¥¹¥Õ¥¡¥¤¥ë
+
+  ¡ü x86 SIMD extension ¤Ç¿·¤¿¤ËÄɲ䵤줿¥Õ¥¡¥¤¥ë
+
+  jccolmmx.asm  RGB->YCbCr ¿§¶õ´ÖÊÑ´¹ (MMX)
+  jccolss2.asm  RGB->YCbCr ¿§¶õ´ÖÊÑ´¹ (SSE2)
+  jcsammmx.asm  ¥À¥¦¥ó¥µ¥ó¥×¥ê¥ó¥° (MMX)
+  jcsamss2.asm  ¥À¥¦¥ó¥µ¥ó¥×¥ê¥ó¥° (SSE2)
+
+  jdcolmmx.asm  YCbCr->RGB ¿§¶õ´ÖÊÑ´¹ (MMX)
+  jdcolss2.asm  YCbCr->RGB ¿§¶õ´ÖÊÑ´¹ (SSE2)
+  jdsammmx.asm  ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥° (MMX)
+  jdsamss2.asm  ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥° (SSE2)
+  jdmermmx.asm  ¿§¶õ´ÖÊÑ´¹¡¿¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°Åý¹ç (MMX)
+  jdmerss2.asm  ¿§¶õ´ÖÊÑ´¹¡¿¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°Åý¹ç (SSE2)
+
+  jcqntint.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (ÈóSIMD, À°¿ô)
+  jcqntflt.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (ÈóSIMD, ÉâÆ°¾®¿ôÅÀ)
+  jcqntmmx.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (MMX, À°¿ô)
+  jcqnts2i.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (SSE2, À°¿ô)
+  jcqnt3dn.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (3DNow! & MMX, ÉâÆ°¾®¿ôÅÀ)
+  jcqntsse.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (SSE & MMX, ÉâÆ°¾®¿ôÅÀ)
+  jcqnts2f.asm  ¥Ç¡¼¥¿ÊÑ´¹¤ÈÎ̻Ҳ½ (SSE & SSE2, ÉâÆ°¾®¿ôÅÀ)
+
+  jfdctint.asm  ¹âÀºÅÙÀ°¿ô(½çÊý¸þ)DCT (ÈóSIMD)
+  jfmmxint.asm  ¹âÀºÅÙÀ°¿ô(½çÊý¸þ)DCT (MMX)
+  jfss2int.asm  ¹âÀºÅÙÀ°¿ô(½çÊý¸þ)DCT (SSE2)
+  jfdctfst.asm  ¹â®À°¿ô(½çÊý¸þ)DCT (ÈóSIMD)
+  jfmmxfst.asm  ¹â®À°¿ô(½çÊý¸þ)DCT (MMX)
+  jfss2fst.asm  ¹â®À°¿ô(½çÊý¸þ)DCT (SSE2)
+  jfdctflt.asm  ÉâÆ°¾®¿ôÅÀ(½çÊý¸þ)DCT (ÈóSIMD)
+  jf3dnflt.asm  ÉâÆ°¾®¿ôÅÀ(½çÊý¸þ)DCT (3DNow!)
+  jfsseflt.asm  ÉâÆ°¾®¿ôÅÀ(½çÊý¸þ)DCT (SSE)
+
+  jidctint.asm  ¹âÀºÅÙÀ°¿ô(µÕÊý¸þ)DCT (ÈóSIMD)
+  jimmxint.asm  ¹âÀºÅÙÀ°¿ô(µÕÊý¸þ)DCT (MMX)
+  jiss2int.asm  ¹âÀºÅÙÀ°¿ô(µÕÊý¸þ)DCT (SSE2)
+  jidctfst.asm  ¹â®À°¿ô(µÕÊý¸þ)DCT (ÈóSIMD)
+  jimmxfst.asm  ¹â®À°¿ô(µÕÊý¸þ)DCT (MMX)
+  jiss2fst.asm  ¹â®À°¿ô(µÕÊý¸þ)DCT (SSE2)
+  jidctflt.asm  ÉâÆ°¾®¿ôÅÀ(µÕÊý¸þ)DCT (ÈóSIMD)
+  ji3dnflt.asm  ÉâÆ°¾®¿ôÅÀ(µÕÊý¸þ)DCT (3DNow! & MMX)
+  jisseflt.asm  ÉâÆ°¾®¿ôÅÀ(µÕÊý¸þ)DCT (SSE & MMX)
+  jiss2flt.asm  ÉâÆ°¾®¿ôÅÀ(µÕÊý¸þ)DCT (SSE & SSE2)
+  jidctred.asm  ½Ì¾®Å¸³«ÍÑ(µÕÊý¸þ)DCT (ÈóSIMD)
+  jimmxred.asm  ½Ì¾®Å¸³«ÍÑ(µÕÊý¸þ)DCT (MMX)
+  jiss2red.asm  ½Ì¾®Å¸³«ÍÑ(µÕÊý¸þ)DCT (SSE2)
+
+  jsimdcpu.asm  CPU ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯
+  jsimddjg.asm  OS ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯ (for DJGPP V.2)
+  jsimdw32.asm  OS ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯ (for Win32)
+  jsimdgcc.c    OS ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯ (for gcc)
+
+  makecfg.c     ¥¢¥»¥ó¥Ö¥ê¸À¸ìÍÑÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤òºîÀ®¤¹¤ë
+
+  jsimdext.inc  ¥¢¥»¥ó¥Ö¥ê¸À¸ì¥½¡¼¥¹ÍѤζ¦Ḁ̈إåÀ¥Õ¥¡¥¤¥ë
+  jdct.inc      DCT ´ØÏ¢¥Õ¥¡¥¤¥ëÍѤΥإåÀ¥Õ¥¡¥¤¥ë
+  jcolsamp.inc  ¿§¶õ´ÖÊÑ´¹¡¿¥µ¥ó¥×¥ê¥ó¥°´ØÏ¢¥Õ¥¡¥¤¥ëÍѤΥإåÀ¥Õ¥¡¥¤¥ë
+
+  jcolsamp.h    ¿§¶õ´ÖÊÑ´¹¡¿¥µ¥ó¥×¥ê¥ó¥°´ØÏ¢¥Õ¥¡¥¤¥ëÍѤΥإåÀ¥Õ¥¡¥¤¥ë
+                ¥ª¥ê¥¸¥Ê¥ëÈǤˤϸºß¤·¤Ê¤¤£Ã¸À¸ì¥Ø¥Ã¥À¥Õ¥¡¥¤¥ë¤Ç¡¢
+                SIMD ²½¤ÇƳÆþ¤µ¤ì¤¿´Ø¿ô¤ÎÀë¸À¤¬µ­½Ò¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  jpegdll.def   DLL ÈÇ JPEG Library ÍѤδؿô¥¨¥¯¥¹¥Ý¡¼¥ÈÄêµÁ¥Õ¥¡¥¤¥ë
+  jpegdll.rc    DLL ÈÇ JPEG Library ÍѤΥС¼¥¸¥ç¥ó¥ê¥½¡¼¥¹ÄêµÁ¥Õ¥¡¥¤¥ë
+                ¤³¤ì¤é¤Î¥Õ¥¡¥¤¥ë¤Ï¡¢IJG JPEG Library ¤ò¤½¤Î¤Þ¤Þ DLL ¤Ë
+                ¤¹¤ë¾ì¹ç¤Ë»ÈÍѤ·¤Þ¤¹(makefile.vcdll/makefile.mgwdll)¡£
+
+  ¡ü x86 SIMD extension ¤Ç²þÊѤ¬²Ã¤¨¤é¤ì¤¿¥Õ¥¡¥¤¥ë
+
+  Êѹ¹ÆâÍƤˤĤ¤¤ÆÆä˵­½Ò¤Î¤Ê¤¤¥Õ¥¡¥¤¥ë¤Ë¤Ï¡¢SIMD ²½¤Ç¿·¤¿¤ËƳÆþ¤µ¤ì¤¿
+  ´Ø¿ô¤ÎÀë¸À¤ä¤½¤Î¸Æ¤Ó½Ð¤·µ­½Ò¡¢SIMD ²½¤Ë´ØÏ¢¤¹¤ë¥Þ¥¯¥íÄêµÁ¤Ê¤É¤¬ÄɲÃ
+  ¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  jpeglib.h     JPEG ¥é¥¤¥Ö¥é¥ê¤Î¥á¥¤¥ó¥Ø¥Ã¥À¥Õ¥¡¥¤¥ë
+  jpegint.h     JPEG ¥é¥¤¥Ö¥é¥ê¤ÎÆâÉôÍѥإåÀ¥Õ¥¡¥¤¥ë
+  jmorecfg.h    JPEG ¥é¥¤¥Ö¥é¥ê¤Î¾ÜºÙÀßÄê¥Ø¥Ã¥À¥Õ¥¡¥¤¥ë
+
+  jdct.h        DCT ´ØÏ¢¥Õ¥¡¥¤¥ëÍѤΥإåÀ¥Õ¥¡¥¤¥ë
+                SIMD ½èÍý¤ËŬ¤¹¤ë¤è¤¦¤Ë¡¢´ö¤Ä¤«¤ÎÊÑ¿ô¤Î·¿¤âÊѹ¹¤µ¤ì¤Æ
+                ¤¤¤Þ¤¹¡£
+
+  jcdctmgr.c    ½çÊý¸þDCT¤Î¥Þ¥Í¡¼¥¸¥á¥ó¥È½èÍý
+  jddctmgr.c    µÕÊý¸þDCT¤Î¥Þ¥Í¡¼¥¸¥á¥ó¥È½èÍý
+
+  jccolor.c     RGB->YCbCr ¿§¶õ´ÖÊÑ´¹ (ÈóSIMD)
+  jdcolor.c     YCbCr->RGB ¿§¶õ´ÖÊÑ´¹ (ÈóSIMD)
+  jdmerge.c     ¿§¶õ´ÖÊÑ´¹¡¿¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°Åý¹ç (ÈóSIMD)
+  jcsample.c    ¥À¥¦¥ó¥µ¥ó¥×¥ê¥ó¥° (ÈóSIMD)
+  jdsample.c    ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥° (ÈóSIMD)
+                jdsample.c ¤Ë¤Ï¡¢ÈóSIMDÈǤΠh1v2 ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°´Ø¿ô
+                (h1v2_upsample, h1v2_fancy_upsample) ¤âÄɲ䵤ì¤Æ¤¤¤Þ¤¹¡£
+
+  jdhuff.h      ¥Ï¥Õ¥Þ¥óÉä¹æ¥Ç¥³¡¼¥É½èÍý (¥Ø¥Ã¥À)
+  jdhuff.c      ¥Ï¥Õ¥Þ¥óÉä¹æ¥Ç¥³¡¼¥É½èÍý (¥·¡¼¥±¥ó¥·¥ã¥ë)
+  jdphuff.c     ¥Ï¥Õ¥Þ¥óÉä¹æ¥Ç¥³¡¼¥É½èÍý (¥×¥í¥°¥ì¥Ã¥·¥Ö)
+                ¤³¤ì¤é¤Î£³¤Ä¤Î¥Õ¥¡¥¤¥ë¤ÎÊѹ¹ÅÀ¤Ï¡¢SIMD ²½¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£
+                ¥Ç¥³¡¼¥É½èÍý¤ÎÊýË¡¤ò¸úΨ²½¤µ¤»¤Æ¤¢¤ê¤Þ¤¹¡£
+
+  jdcoefct.c    DCT¥Ç¡¼¥¿¥Ö¥í¥Ã¥¯¤Î¥Þ¥Í¡¼¥¸¥á¥ó¥È
+                SIMD ²½¤È¤Ï´Ø·¸¤Ê¤¯¡¢°ìÉô¤Î¥³¡¼¥É¤ò¸úΨ²½¤µ¤»¤Æ¤¢¤ê¤Þ¤¹¡£
+
+  jcomapi.c     °µ½Ì/Ÿ³« ¶¦ÄÌ API ´Ø¿ôÄêµÁ
+                SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯´Ø¿ô¤Ê¤É¤¬Äɲ䵤ì¤Æ¤¤¤Þ¤¹¡£
+
+  jmemmgr.c     JPEG library ÍÑ¥á¥â¥ê¥Þ¥Í¡¼¥¸¥ã (¥á¥¤¥ó)
+                SIMD ²½¤Ëȼ¤¤¡¢16¥Ð¥¤¥È¥¢¥É¥ì¥¹¶­³¦¤Ë¹ç¤Ã¤¿¥á¥â¥êÎΰè¤ò
+                ¾ï¤Ë16¥Ð¥¤¥Èñ°Ì¤Ç³ÎÊݤ¹¤ë¤è¤¦¤ËÊѹ¹¤·¤Æ¤¢¤ê¤Þ¤¹¡£
+
+  cjpeg.c       JPEG °µ½ÌÍÑ ¥³¥Þ¥ó¥É¥é¥¤¥ó¡¦¥æ¡¼¥Æ¥£¥ê¥Æ¥£
+  djpeg.c       JPEG Ÿ³«ÍÑ ¥³¥Þ¥ó¥É¥é¥¤¥ó¡¦¥æ¡¼¥Æ¥£¥ê¥Æ¥£
+                -v ¥ª¥×¥·¥ç¥ó¤Ç¤Î SIMD ´ØÏ¢¾ðÊó¤Îɽ¼¨¤ä¡¢-nosimd ¤Ê¤É¤Î
+                ¥ª¥×¥·¥ç¥ó¥¹¥¤¥Ã¥Á¤¬Äɲ䵤ì¤Æ¤¤¤Þ¤¹¡£
+
+  rdbmp.c       BMP ¥Õ¥¡¥¤¥ëÆɤ߹þ¤ß¥â¥¸¥å¡¼¥ë
+  wrbmp.c       BMP ¥Õ¥¡¥¤¥ë½ñ¤­½Ð¤·¥â¥¸¥å¡¼¥ë
+                SIMD ²½¤È¤Ï´Ø·¸¤Ê¤¯¡¢°ìÉô¤Î¥³¡¼¥É¤ò¸úΨ²½¤µ¤»¤Æ¤¢¤ê¤Þ¤¹¡£
+
+  rdgif.c       GIF ¥Õ¥¡¥¤¥ëÆɤ߹þ¤ß¥â¥¸¥å¡¼¥ë(version 6a)
+  wrgif.c       GIF ¥Õ¥¡¥¤¥ë½ñ¤­½Ð¤·¥â¥¸¥å¡¼¥ë(version 6a)
+                Unisys ¤Î GIF (LZW) Æõö¼º¸ú¤Ëȼ¤¤¡¢version 6a ¤Ç¥µ¥Ý¡¼¥È
+                ¤µ¤ì¤Æ¤¤¤¿ cjpeg/djpeg ¤Ç¤Î GIF ¤ÎÆþ½ÐÎϤòÉü³è¤µ¤»¤Þ¤·¤¿¡£
+                ¤³¤Î GIF ¥â¥¸¥å¡¼¥ë¤Ï version 6a ¤Î¤â¤Î¤òήÍѤ·¤Æ¤¤¤Þ¤¹¡£
+                version 6b ¤Î GIF ¥â¥¸¥å¡¼¥ë¤Ï unused/ ¤Ë¤¢¤ê¤Þ¤¹¡£
+
+
+¢£Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg/djpeg (altui/)
+
+  altui/ ¤Ë¤¢¤ë¥Õ¥¡¥¤¥ë¤Ï¡¢¸µ¡¹ jpegaltui.v6b.tar.gz ¤È¤¤¤¦¥Õ¥¡¥¤¥ë̾¤Ç
+  Ê̤ËÇÛÉÛ¤µ¤ì¤Æ¤¤¤¿¤â¤Î¤Ç¤¹¡£¤³¤Î SIMD ÈǤǤϡ¢£±¥Õ¥¡¥¤¥ëÈÇ cjpeg/djpeg
+  ¤ÈƱÍͤΠSIMD Âбþ²½¤Ë´Ø¤¹¤ë½¤Àµ¤È¡¢Borland C++ / Microsoft VC++ ¤Ë
+  ¤ª¤¤¤Æ¥ï¡¼¥ë¥É¥«¡¼¥ÉŸ³«½èÍý¤òÍ­¸ú²½¤¹¤ë¤¿¤á¤Î¥³¡¼¥É¤ò½ñ¤­²Ã¤¨¤¿¤â¤Î¤Ç¤¹¡£
+
+  altui/cjpeg.c         Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg
+  altui/djpeg.c         Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠdjpeg
+  altui/README.alt      jpegaltui.v6b.tar.gz ¤ËƱº­¤µ¤ì¤Æ¤¤¤¿ README
+  altui/usage.alt       Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈÇ cjpeg/djpeg ¤Î¥Þ¥Ë¥å¥¢¥ë(º¹Ê¬)
+
+
+¢£SIMD ÈǤǤϻȤï¤ì¤Ê¤¤¥Õ¥¡¥¤¥ë·² (unused/)
+
+  unused/ ¤Ë¤¢¤ë¥Õ¥¡¥¤¥ë¤Ï¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠIJG JPEG software ¤Ë¼ýÏ¿¤µ¤ì¤Æ
+  ¤¤¤¿¤¬¡¢¤³¤Î SIMD ÈǤǤϻȤï¤ì¤Ê¤¤/»È¤¨¤Ê¤¤¥Õ¥¡¥¤¥ë·²¤¬¼ý¤á¤é¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  unused/j?dct???.c     ¥ª¥ê¥¸¥Ê¥ë¤Î£Ã¸À¸ìÈÇ DCT ´Ø¿ô
+  unused/jmem*.*        ¥·¥¹¥Æ¥à°Í¸¥á¥â¥ê¥Þ¥Í¡¼¥¸¥ã(for MS-DOS/Macintosh)
+  unused/??gif.c        version 6b ¤Î GIF ¥â¥¸¥å¡¼¥ë
+  unused/jconfig.*      ¥ª¥ê¥¸¥Ê¥ëÈǤËÉÕ°¤Î jconfig.*
+  unused/mak*.*         ¥ª¥ê¥¸¥Ê¥ëÈǤËÉÕ°¤Î Makefile.*
+
+
+
+[EOF]
diff --git a/simd_install.ja.txt b/simd_install.ja.txt
new file mode 100644
index 0000000..ef8f825
--- /dev/null
+++ b/simd_install.ja.txt
@@ -0,0 +1,436 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == INSTALL ==
+-----------------------------------------------------------
+
+¢£¤³¤Î¥Õ¥¡¥¤¥ë¤Ï
+
+  ¤³¤Î¥Õ¥¡¥¤¥ë¤Ç¤Ï¡¢SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý¤ò²òÀâ
+  ¤·¤Þ¤¹¡£¾¡¼ê¤Ê¤¬¤é¡¢¤³¤³¤Ç¤Ï¥ª¥ê¥¸¥Ê¥ëÈǤΠlibjpeg ¥é¥¤¥Ö¥é¥ê¤Î°·¤¤Êý
+  (¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý¡¿¥×¥í¥°¥é¥à¤ÎÃæ¤Ç¤Î»È¤¤Êý)¤ò¤¢¤ëÄøÅÙ¿´ÆÀ¤Æ¤¤¤ë¤È¤¤¤¦
+  ¿Í¤òÂоݤˤµ¤»¤Æ¤¤¤¿¤À¤­¤Þ¤¹¡£¥ª¥ê¥¸¥Ê¥ëÈǤλÈÍÑË¡¤Ë¤Ä¤¤¤Æ¤Ï¡¢
+  install.doc (±Ñʸ) ¤ò»²¾È¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+    ¢£¥¢¥»¥ó¥Ö¥é NASM ¤ÎÆþ¼ê¡¿¥¤¥ó¥¹¥È¡¼¥ë
+    ¢£¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý
+      ¡ü Microsoft Visual C++ 6.0 °Ê¹ß¤ÎÅý¹ç³«È¯´Ä¶­(DevStudio)¤Î¾ì¹ç
+      ¡ü jconfig.h ¤È Makefile ¤òÁªÂò¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¤¹¤ë
+      ¡ü UNIX ´Ä¶­¤Ç configure ¥¹¥¯¥ê¥×¥È¤ò»È¤¦
+    ¢£Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg/djpeg (altui/)
+    ¢£¥³¡¼¥É¥µ¥¤¥º¤ò¸º¤é¤¹¤Ë¤Ï
+    ¢£ÆÃÄê¤Î SIMD Ì¿Îá¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤¹¤ë¤Ë¤Ï
+
+
+¢£¥¢¥»¥ó¥Ö¥é NASM ¤ÎÆþ¼ê¡¿¥¤¥ó¥¹¥È¡¼¥ë
+
+  ¤³¤Î x86 SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î SIMD ³ÈÄ¥Éôʬ¤Ï¡¢¤½¤Î¤Û¤È¤ó¤É¤¬
+  x86 ¤Î¥¢¥»¥ó¥Ö¥ê¸À¸ì¤Ç½ñ¤«¤ì¤Æ¤¤¤Þ¤¹¡£¤³¤Î¥¢¥»¥ó¥Ö¥ê¸À¸ì¥½¡¼¥¹¥³¡¼¥É¤ò
+  ¥¢¥»¥ó¥Ö¥ë¤¹¤ë¤Ë¤Ï¡¢NASM (Netwide Assembler) ¤È¤¤¤¦¥¢¥»¥ó¥Ö¥é¤¬É¬ÍפǤ¹¡£
+  Microsoft ¤Î MASM ¤ä¤½¤Î¸ß´¹¥¢¥»¥ó¥Ö¥é¤Ç¤Ï°·¤¨¤Þ¤»¤ó¤Î¤ÇÃí°Õ¤·¤Æ¤¯¤À¤µ¤¤¡£
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  NASM (Netwide Assembler) ¤Ï¡¢¸ø¼°¥µ¥¤¥È http://nasm.sourceforge.net/ or
+  http://sourceforge.net/project/showfiles.php?group_id=6208 ¤«¤é¥À¥¦¥ó
+  ¥í¡¼¥É¤Ç¤­¤Þ¤¹¡£Ver.0.98.25 °Ê¹ß¤ÎÈæ³ÓŪ¿·¤·¤¤¥Ð¡¼¥¸¥ç¥ó¤Î¤â¤Î¤¬É¬ÍפǤ¹¡£
+  ¸½»þÅÀ(2006/02)¤Ç¤ÎºÇ¿·ÈÇ¤Ï Ver.0.98.39 ¤Ç¤¹¡£
+
+  ¤Ê¤ª¡¢x86 ÈǤΠDarwin ¤ä Mac OS X ¤Ê¤É¤Ç»ÈÍѤ¹¤ë¾ì¹ç¤Ï¡¢¸½»þÅÀ¤Ç¤Ï¤Þ¤À
+  Àµ¼°¤Ë¥ê¥ê¡¼¥¹¤µ¤ì¤Æ¤¤¤Ê¤¤ Ver.0.98.40 °Ê¹ß¤Î¥Ð¡¼¥¸¥ç¥ó¤Î¤â¤Î¤¬É¬ÍפǤ¹¡£
+  ¸½»þÅÀ¤Ç¤Ï¡¢Ver.0.98.40 ¤Ï¾åµ­¤Î¸ø¼°¥µ¥¤¥È¤Î CVS ¥ê¥Ý¥¸¥È¥ê¤«¤é¥½¡¼¥¹
+  ¥³¡¼¥É¤ò¥À¥¦¥ó¥í¡¼¥É¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¡¿¥¤¥ó¥¹¥È¡¼¥ë¤¹¤ëɬÍפ¬¤¢¤ê¤Þ¤¹¡£
+
+  Windows ·Ï¤Î¾ì¹ç¤Ï¡¢nasm-0.XX.YY-win32.zip (XX.YY ¤Ë¤Ï¥Ð¡¼¥¸¥ç¥óÈֹ椬Æþ¤ë)
+  ¤È¤¤¤¦Ì¾Á°¤Î¥Õ¥¡¥¤¥ë¤ò¥À¥¦¥ó¥í¡¼¥É¤·¤Æ¡¢¤½¤ì¤Ë´Þ¤Þ¤ì¤ë nasmw.exe ¤ò
+  £Ã¥³¥ó¥Ñ¥¤¥é¤Î¼Â¹Ô¥Õ¥¡¥¤¥ë·²¤¬¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ì¤Æ¤¤¤ë¾ì½ê¤Ë¥³¥Ô¡¼¤·¤Þ¤¹¡£
+
+  ³Æ¼ï PC-UNIX ¤Î¾ì¹ç¤Ï¡¢OS ¤ÎÇÛÉÛ¸µ¤Ë¤Æ°Ü¿¢ºÑ¤ß¥Ñ¥Ã¥±¡¼¥¸¤¬Ä󶡤µ¤ì¤Æ¤¤¤ë
+  ¾ì¹ç¤¬¤¢¤ê¤Þ¤¹¤Î¤Ç¡¢¤Þ¤ººÇ½é¤Ë¤½¤Á¤é¤ò³Îǧ¤·¤Æ¤ß¤Æ¤¯¤À¤µ¤¤¡£¤½¤ì¤¬¤Ê¤¤
+  ¾ì¹ç¤Ï¡¢¾åµ­¸ø¼°¥µ¥¤¥È¤«¤é¥½¡¼¥¹¥³¡¼¥É(nasm-0.XX.YY.tar.gz)¤ò¥À¥¦¥ó¥í¡¼¥É
+  ¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¡¿¥¤¥ó¥¹¥È¡¼¥ë¤·¤Æ¤¯¤À¤µ¤¤¡£linux ¤Î¾ì¹ç¤Ç rpm ¥Ñ¥Ã¥±¡¼¥¸¤ò
+  °·¤¨¤ë¥·¥¹¥Æ¥à¤Î¾ì¹ç¤Ï¡¢¾åµ­¸ø¼°¥µ¥¤¥È¤Ë¤Æ rpm ¥Ð¥¤¥Ê¥ê¥Ñ¥Ã¥±¡¼¥¸¤âÆþ¼ê
+  ¤Ç¤­¤Þ¤¹¡£
+
+  Ãí°ÕÅÀ¤È¤·¤Æ¡¢YASM (http://www.tortall.net/projects/yasm/) ¤Ï»È¤ï¤Ê¤¤¤Ç
+  ¤¯¤À¤µ¤¤¡£YASM ¤Ï NASM ¸ß´¹¤òëð¤Ã¤Æ¤¤¤Þ¤¹¤¬¡¢¸½ºß¤Î¥Ð¡¼¥¸¥ç¥ó(0.4.0)¤Ç¤Ï
+  ¤Þ¤À¸ß´¹ÅÙ¤¬Ä㤤¾å¤Ë¥Ð¥°¤¬¤¢¤ë(¥¢¥É¥ì¥¹·×»»¤¬¤Þ¤Ã¤¿¤¯¥Ç¥¿¥é¥á¤Ê¥³¡¼¥É¤ò
+  À¸À®¤·¤Æ¤¤¤ë)¤¿¤á¡¢ÅöÊý¤Î¥Æ¥¹¥È¤Ç¤Ï YASM ¤Ç¥¢¥»¥ó¥Ö¥ë¤·¤¿¥³¡¼¥É¤Ï¤Þ¤Ã¤¿¤¯
+  Æ°¤­¤Þ¤»¤ó¤Ç¤·¤¿¡£¡ÊÃí¡§¤³¤Î x86 SIMD extension for IJG JPEG library
+  ¤Ç¤Ï¡¢¤¿¤È¤¨ YASM ¤òÍѤ¤¤Æ¤â AMD64 ¤Î 64bit Âбþ¤Ë¤Ï¤Ê¤ê¤Þ¤»¤ó¡£¡Ë
+
+
+¢£¥³¥ó¥Ñ¥¤¥ë¤Î»ÅÊý
+
+  ¤Û¤È¤ó¤É¤ÎÉôʬ¤Ç¥ª¥ê¥¸¥Ê¥ëÈǤÈÊѤï¤ê¤¢¤ê¤Þ¤»¤ó¤Î¤Ç¡¢°Ê²¼¤ÎÀâÌÀ¤Ç¤Ï¡¢
+  ¤³¤Î SIMD ³ÈÄ¥ÈǤËÆÃÍ­¤ÎÃí°ÕÅÀ¤òÃæ¿´¤Ë½Ò¤Ù¤Þ¤¹¡£
+
+
+  ¡ü Microsoft Visual C++ 6.0 °Ê¹ß¤ÎÅý¹ç³«È¯´Ä¶­(DevStudio)¤Î¾ì¹ç
+
+    ¤³¤ÎÇÛÉÛ¥»¥Ã¥È¤Ë¤Ï¡¢Microsoft Visual C++ 6.0 ÍÑ¤Î¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë
+    ¤¬ÉÕ°¤·¤Æ¤¤¤Þ¤¹¡£V6.0 °Ê¹ß¤Î VC++ ¤Î¾ì¹ç¤Ï¡¢VC++ 6.0 ¤Î¥Õ¥¡¥¤¥ë¤ò
+    ÊÑ´¹(¥¤¥ó¥Ý¡¼¥È)¤·¤Æ»È¤Ã¤Æ¤¯¤À¤µ¤¤¡£Microsoft Visual C++ 2005 Express
+    Edition ¤Ë¤Æ¡¢¥¤¥ó¥Ý¡¼¥È¡¿¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤ë¤³¤È¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£
+
+    ÊýË¡¤Ï¡¢¤Þ¤º vc6proj ¥Õ¥©¥ë¥À¤ÎÃæ¤Ë¤¢¤ë¥Õ¥¡¥¤¥ë¤ò¡¢¥½¡¼¥¹¥Õ¥¡¥¤¥ë·²
+    (*.c) ¤Î¤¢¤ë¥Õ¥©¥ë¥À¤Ë¤¹¤Ù¤Æ°ÜÆ°¤·¤Þ¤¹¡£¤½¤·¤Æ libjpeg.dsw ¤ò³«¤¤¤Æ
+    (¤â¤·¤¯¤Ï¥¤¥ó¥Ý¡¼¥È¤·¤Æ)¡¢¥á¥Ë¥å¡¼¤Î ¥Ó¥ë¥É¢ª¥Ð¥Ã¥Á¥Ó¥ë¥É ¤Ç¤¹¤Ù¤Æ¤Ë
+    ¥Á¥§¥Ã¥¯¤òÆþ¤ì¤Æ¥Ó¥ë¥É¤¹¤ì¤Ð£Ï£Ë¤Ç¤¹¡£
+
+    ¥ï¡¼¥¯¥¹¥Ú¡¼¥¹ libjpeg.dsw ¤Ë¤Ï¡¢°Ê²¼¤Î¥×¥í¥¸¥§¥¯¥È¤¬¼ý¤á¤é¤ì¤Æ¤¤¤Þ¤¹¡£
+
+    ¢¡ makecfg.dsp
+
+      ¤³¤Î¥×¥í¥¸¥§¥¯¥È¤Ï¡¢makecfg.c ¤ò¥³¥ó¥Ñ¥¤¥ë¡¿¥ê¥ó¥¯¤·¤Æ¡¢¥«¥¹¥¿¥à
+      ¥Ó¥ë¥É¥¹¥Æ¥Ã¥×¤Ç¤½¤ì(makecfg.exe)¤ò¼Â¹Ô¤·¡¢libjpeg.dsp ¤Î¥Ó¥ë¥É¤Ë
+      ɬÍפÊÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤òºî¤ëƯ¤­¤ò¤·¤Æ¤¤¤Þ¤¹¡£
+
+      ¤³¤ì¤Ï¡¢libjpeg.dsp ¤¬¥³¥ó¥Ñ¥¤¥ë¤µ¤ì¤ëÁ°¤Ëɬ¤º¥³¥ó¥Ñ¥¤¥ë(¼Â¹Ô)
+      ¤µ¤ì¤Ê¤±¤ì¤Ð¤Ê¤ê¤Þ¤»¤ó¡£
+
+    ¢¡ libjpeg.dsp
+
+      JPEG ¥é¥¤¥Ö¥é¥ê libjpeg.lib ¤ò¥Ó¥ë¥É¤·¤Þ¤¹¡£makecfg.dsp ¤¬½ÐÎϤ¹¤ë
+      ÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤¬É¬ÍפǤ¹¡£
+
+    ¢¡ cjpeg.dsp, djpeg.dsp, jpegtran.dsp, rdjpgcom.dsp, wrjpgcom.dsp
+
+      IJG JPEG library ¤ËÉÕ°¤·¤Æ¤¤¤ë¥µ¥ó¥×¥ë¡¦¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¤Ç¤¹¡£
+      cjpeg ¤Ï JPEG °µ½Ì¤ò¡¢djpeg ¤Ï JPEG Ÿ³«¤ò¡¢jpegtran ¤Ï JPEG
+      ¥Õ¥¡¥¤¥ë¤ÎÊÑ´¹¤ò¤·¤Þ¤¹¡£rdjpgcom ¤È wrjpgcom ¤Ï¡¢JPEG¥Õ¥¡¥¤¥ë
+      Ãæ¤Î¥Æ¥­¥¹¥È¥³¥á¥ó¥Èʸ¤òÁàºî¤·¤Þ¤¹¡£
+
+    ¢¡ apptest.dsp
+
+      ¤Ç¤­¤¢¤¬¤Ã¤¿ cjpeg, djpeg, jpegtran ¤ËÂФ·¤Æ´Êñ¤ÊÆ°ºî¥Æ¥¹¥È¤ò
+      ¤·¤Þ¤¹(make test ¤ÈƱÅù)¡£¤Þ¤º¡¢¤³¤ì¤é¤Î¥½¥Õ¥È¤Ç¼ÂºÝ¤Ë²èÁü¥Õ¥¡¥¤¥ë
+      ¤òÊÑ´¹¤µ¤»¡¢¤½¤Î½ÐÎϤò fc.exe ¤ÇÈæ³Ó¤·¤Þ¤¹¡£"FC: Áê°ãÅÀ¤Ï¸¡½Ð
+      ¤µ¤ì¤Þ¤»¤ó¤Ç¤·¤¿" ¤È¤¤¤¦¥á¥Ã¥»¡¼¥¸¤¬£¶²óɽ¼¨¤µ¤ì¤ì¤Ð¡¢¥×¥í¥°¥é¥à¤Ï
+      Àµ¤·¤¯Æ°¤¤¤Æ¤¤¤Þ¤¹¡£
+
+    JPEG ¥é¥¤¥Ö¥é¥ê libjpeg.lib ¤ò¾¤Î¥½¥Õ¥È¤Ë¥ê¥ó¥¯¤¹¤ë¤¿¤á¤Ë libjpeg.dsp
+    ¤ò¾¤Î¥ï¡¼¥¯¥¹¥Ú¡¼¥¹¡¿¥½¥ê¥å¡¼¥·¥ç¥ó¤Ë´Þ¤á¤ë¾ì¹ç¤Ï¡¢makecfg.dsp ¤â
+    ɬ¤ºÆ±¤¸¥ï¡¼¥¯¥¹¥Ú¡¼¥¹¡¿¥½¥ê¥å¡¼¥·¥ç¥ó¤Ë´Þ¤á¡¢¾ï¤Ë makecfg.dsp ¤¬
+    libjpeg.dsp ¤è¤ê¤âÀè¤Ë¥Ó¥ë¥É¤µ¤ì¤ë¤è¤¦¤Ë¡¢°Í¸´Ø·¸¤òÀßÄꤷ¤Æ¤¯¤À¤µ¤¤¡£
+    ¤³¤ì¤Ï¡¢Àè¤Ë½Ò¤Ù¤¿¤è¤¦¤Ë¡¢libjpeg.dsp ¤Ï makecfg.dsp ¤¬½ÐÎϤ¹¤ëÀßÄê
+    ¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤òɬÍפȤ·¤Æ¤¤¤ë¤¿¤á¤Ç¤¹¡£
+
+    ÀßÄêË¡¤Ï¡¢(VC++ 6.0¤Î¾ì¹ç) ¥á¥Ë¥å¡¼¤Î ¥×¥í¥¸¥§¥¯¥È ¢ª °Í¸´Ø·¸ ¤Ç
+    libjpeg ¤òÁªÂò¤·¡¢²¼¤Î°ìÍ÷¤ÎÃæ¤Î makecfg ¤Ë¥Á¥§¥Ã¥¯¤òÆþ¤ì¤Þ¤¹¡£
+
+
+  ¡ü jconfig.h ¤È Makefile ¤òÁªÂò¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¤¹¤ë
+
+    Windows ·Ï¤Î¥³¥ó¥Ñ¥¤¥é¤Î¾ì¹ç¤Ç¡¢¥³¥Þ¥ó¥É¥é¥¤¥ó¤«¤é¥³¥ó¥Ñ¥¤¥ë¤¹¤ë¾ì¹ç
+    ¤Ï¡¢¤³¤ÎÊýË¡¤ò¤È¤Ã¤Æ¤¯¤À¤µ¤¤¡£
+
+    ÊýË¡¤Ï¡¢ÉÕ°¤Î jconfig.* ¤È Makefile.* ¤ÎÃ椫¤éŬÀڤʤâ¤Î¤ò¤½¤ì¤¾¤ì
+    °ì¤Ä¤º¤ÄÁª¤Ó¡¢¤½¤ì¤¾¤ì jconfig.h ¤È Makefile ¤Ë̾Á°¤òÊѤ¨¤Þ¤¹¡£
+    ¤½¤·¤Æ¡¢¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤Î¤¢¤ë¥Ç¥£¥ì¥¯¥È¥ê(¥Õ¥©¥ë¥À)¤ò¥«¥ì¥ó¥È¥Ç¥£¥ì
+    ¥¯¥È¥ê¤Ë¤·¤Æ¡¢¥³¥Þ¥ó¥É¥é¥¤¥ó¤Ç make (VC++ ¤Î¾ì¹ç¤Ï nmake)¤È¥¿¥¤¥×
+    ¤¹¤ì¤Ð£Ï£Ë¤Ç¤¹¡£¸å¤Ï¼«Æ°Åª¤Ë¡¢libjpeg ¥é¥¤¥Ö¥é¥ê¤È¡¢¥µ¥ó¥×¥ë¥¢¥×¥ê
+    ¥±¡¼¥·¥ç¥ó(cjpeg ¤ä djpeg ¤Ê¤É)¤¬¥³¥ó¥Ñ¥¤¥ë¤µ¤ì¤Þ¤¹¡£
+
+    ÉÕ°¤·¤Æ¤¤¤ë jconfig.* ¤È Makefile.* ¤Ï¡¢°Ê²¼¤Î½èÍý·Ï¤ËÂбþ¤·¤Æ¤¤¤Þ¤¹¡£
+    ¤Ê¤ª¡¢¥ª¥ê¥¸¥Ê¥ëÈǤËÉÕ°¤·¤Æ¤¤¤ë jconfig.* ¤È Makefile.* ¤Ï¡¢¤³¤Î
+    SIMD ÂбþÈǤǤϻÈÍѤǤ­¤Þ¤»¤ó(¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤Î¹½À®¤¬ÊѤï¤Ã¤Æ¤¤¤ë¤¿¤á)¡£
+
+    ¡¦ jconfig.dj  & makefile.dj  -- DJGPP v2.0 or later
+    ¡¦ jconfig.bc5 & makefile.bc5 -- Borland C++ Compiler 5.5 (win32)
+    ¡¦ jconfig.mgw & makefile.mgw, makefile.mgwdll -- MinGW
+    ¡¦ jconfig.vc  & makefile.vc , makefile.vcdll  -- VC++ 6.0 or later
+    ¡¦ jconfig.linux & makefile.linux -- linux
+
+    ¤³¤ì¤é°Ê³°¤Î¥³¥ó¥Ñ¥¤¥é¤ËÂбþ¤µ¤»¤ë¾ì¹ç¤Ï¡¢install.doc ¤Ë½ñ¤¤¤Æ¤¢¤ë
+    ¤è¤¦¤Ë¡¢¤Þ¤º ckconfig.c ¤ò¥³¥ó¥Ñ¥¤¥ë¡¿¼Â¹Ô¤·¤Æ jconfig.h ¤òÀ¸À®¤·¡¢
+    makefile.unix ¤â¤·¤¯¤Ï makefile.ansi ¤ò¼êºî¶È¤ÇÊÔ½¸¤·¤Æ Makefile ¤ò
+    ºîÀ®¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+    °Ê²¼¡¢³Æ¥³¥ó¥Ñ¥¤¥é¤Ë¸ÇÍ­¤ÎÃí°ÕÅÀ¤ò½Ò¤Ù¤Þ¤¹¡£
+
+    ¢¡ jconfig.dj  & makefile.dj  -- DJGPP v2.0 or later.
+
+      ²Äǽ¤Ê¸Â¤ê¿·¤·¤¤¥Ð¡¼¥¸¥ç¥ó¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£DJGPP 2.03 + gcc 3.4.4
+      + binutils 2.16.1 ¤È¤¤¤¦ÁȤ߹ç¤ï¤»¤ÇÆ°ºî¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£
+
+      makefile.dj ¤Ë¤Ï¡¢gcc 3.4.x ¸þ¤±¤Î¥³¥ó¥Ñ¥¤¥ë¥ª¥×¥·¥ç¥ó¤¬½ñ¤«¤ì¤Æ
+      ¤¤¤Þ¤¹¤Î¤Ç¡¢gcc 3.4.x °Ê³°¤Ç¤¦¤Þ¤¯¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Ê¤¤¾ì¹ç¤Ï¡¢Å¬Åö¤Ë
+      ½ñ¤­Ä¾¤·¤Æ»È¤Ã¤Æ¤¯¤À¤µ¤¤¡£
+
+    ¢¡ jconfig.bc5 & makefile.bc5 -- Borland C++ Compiler 5.5 (win32)
+
+      ²¿¤ÎÌäÂê¤â¤Ê¤¯¥³¥ó¥Ñ¥¤¥ë¤Ï¤Ç¤­¤ë¤Ï¤º¤Ç¤¹¤¬¡¢¤³¤Î BCC 5.5 ¤Î¾ì¹ç¤Ï¡¢
+      ¤Û¤È¤ó¤É¤Î¥±¡¼¥¹¤Ç SSE/SSE2 ¤¬Æ°ºî¤·¤Þ¤»¤ó¡£Íýͳ¤Ï¡¢BCC 5.5 ¤ËÉÕ°¤Î
+      ¥ê¥ó¥«(ilink32.exe)¤¬¸Å¤¯¡¢SSEÄê¿ô¤ò16¥Ð¥¤¥È¶­³¦¥¢¥É¥ì¥¹¤ËÇÛÃÖ¤¹¤ë
+      ¤³¤È¤¬¤Ç¤­¤Ê¤¤¤¿¤á¤Ç¤¹¡£
+
+      ¤½¤Î¤¿¤á¡¢BCC 5.5 ¤ò»È¤¦¾ì¹ç¤Ï SSE/SSE2 ¤Î¥µ¥Ý¡¼¥È¤òºï½ü(¸å½Ò)¤·¤Æ
+      ¥³¥ó¥Ñ¥¤¥ë¤¹¤ë¤³¤È¤ò¤ª´«¤á¤¤¤¿¤·¤Þ¤¹¡£
+
+    ¢¡ jconfig.mgw & makefile.mgw, makefile.mgwdll -- MinGW
+
+      ²Äǽ¤Ê¸Â¤ê¿·¤·¤¤¥Ð¡¼¥¸¥ç¥ó¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£gcc 3.4.4 + binutils
+      2.16.91 ¤È¤¤¤¦ÁȤ߹ç¤ï¤»¤ÇÆ°ºî¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£
+
+      ¤Þ¤¿¡¢GNU make ¤¬¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ì¤Æ¤¤¤ë¤³¤È¤âɬÍפǤ¹¡£MinGW ¤Î¾ì¹ç¡¢
+      (make.exe ¤Ç¤Ï¤Ê¤¯) mingw32-make.exe ¤È¤¤¤¦Ì¾Á°¤Ç¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ì
+      ¤Þ¤¹¤Î¤Ç¡¢make ¤Ç¤Ï¤Ê¤¯ mingw32-make ¤È¥¿¥¤¥×¤¹¤ëɬÍפ¬¤¢¤ê¤Þ¤¹¡£
+
+      ¤³¤ì¤é¤Î makefile ¤Ë¤Ï¡¢gcc 3.4.x ¸þ¤±¤Î¥³¥ó¥Ñ¥¤¥ë¥ª¥×¥·¥ç¥ó¤¬½ñ¤«
+      ¤ì¤Æ¤¤¤Þ¤¹¤Î¤Ç¡¢gcc 3.4.x °Ê³°¤Ç¤¦¤Þ¤¯¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Ê¤¤¾ì¹ç¤Ï¡¢
+      ŬÅö¤Ë½ñ¤­Ä¾¤·¤Æ»È¤Ã¤Æ¤¯¤À¤µ¤¤¡£
+
+      makefile.mgw ¤Ï¡¢libjpeg ¥é¥¤¥Ö¥é¥ê¤òÀÅŪ¥é¥¤¥Ö¥é¥ê(libjpeg.a)¤Ë
+      ¥³¥ó¥Ñ¥¤¥ë¤·¤¿¾å¤Ç¡¢¤³¤Î libjpeg.a ¤ò¥ê¥ó¥¯¤·¤¿ cjpeg, djpeg,
+      jpegtran ¤òºîÀ®¤·¤Þ¤¹¡£makefile.mgwdll ¤Ï¡¢libjpeg ¥é¥¤¥Ö¥é¥ê¤ò
+      DLL(jpeg62.dll)¤Ë¥³¥ó¥Ñ¥¤¥ë¤·¤¿¾å¤Ç¡¢¤³¤Î jpeg62.dll ¤ò¥ê¥ó¥¯¤·¤¿
+      cjpeg, djpeg, jpegtran ¤òºîÀ®¤·¤Þ¤¹¡£
+
+    ¢¡ jconfig.vc  & makefile.vc , makefile.vcdll  -- VC++ 6.0 or later
+
+      VC++ 6.0 °Ê¹ß¤Ê¤é²¿¤ÎÌäÂê¤â¤Ê¤¯¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤ë¤Ï¤º¤Ç¤¹¡£¥Õ¥ê¡¼¤Ç
+      ¸ø³«¤µ¤ì¤Æ¤¤¤ë Microsoft Visual C++ Toolkit 2003 ¤Ç¤â¡¢nmake.exe
+      ¤¬¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ì¤Æ¤¤¤ì¤Ð¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Þ¤¹¡£
+
+      makefile.vc ¤Ï¡¢libjpeg ¥é¥¤¥Ö¥é¥ê¤òÀÅŪ¥é¥¤¥Ö¥é¥ê(libjpeg.lib)¤Ë
+      ¥³¥ó¥Ñ¥¤¥ë¤·¤¿¾å¤Ç¡¢¤³¤Î libjpeg.lib ¤ò¥ê¥ó¥¯¤·¤¿ cjpeg, djpeg,
+      jpegtran ¤òºîÀ®¤·¤Þ¤¹¡£makefile.vcdll ¤Ï¡¢libjpeg ¥é¥¤¥Ö¥é¥ê¤ò
+      DLL(jpeg62.dll)¤Ë¥³¥ó¥Ñ¥¤¥ë¤·¤¿¾å¤Ç¡¢¤³¤Î jpeg62.dll ¤ò¥ê¥ó¥¯¤·¤¿
+      cjpeg, djpeg, jpegtran ¤òºîÀ®¤·¤Þ¤¹¡£
+
+    ¢¡ jconfig.linux & makefile.linux -- linux
+
+      ³Æ¼ï¤Î linux ¤ËÂбþ¤·¤Þ¤¹¤¬¡¢linux ¤Î¾ì¹ç¤Ï configure ¥¹¥¯¥ê¥×¥È¤ò
+      »È¤¦¤³¤È¤ò¶¯¤¯¤ª´«¤á¤¤¤¿¤·¤Þ¤¹¡£
+
+
+  ¡ü UNIX ´Ä¶­¤Ç configure ¥¹¥¯¥ê¥×¥È¤ò»È¤¦
+
+    ³Æ¼ï¤Î PC-UNIX ¤Î¾ì¹ç¤Ê¤É¡¢¥·¥§¥ë¡¦¥¹¥¯¥ê¥×¥È¤òÁö¤é¤»¤ë¤³¤È¤Î¤Ç¤­¤ë
+    ´Ä¶­¤Î¾ì¹ç¤Ï¡¢configure ¥¹¥¯¥ê¥×¥È¤ò»È¤¦¤È´Êñ¤Ë¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Þ¤¹¡£
+
+      $ ./configure --enable-shared --enable-static
+      $ make
+      $ make test      (¢«Æ°ºî¥Æ¥¹¥È¡¨É¬Íפ˱þ¤¸¤Æ)
+      # make install
+
+    ¤³¤ÎÊýË¡¤Ç¤Ï¡¢°Ê²¼¤Î¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Ç¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤ë¤³¤È¤ò³Îǧ¤·¤Æ
+    ¤¤¤Þ¤¹¡£¤³¤ì°Ê³°¤Î UNIX ·Ï OS ¤Ç¤â¥³¥ó¥Ñ¥¤¥ë¤Ï²Äǽ¤À¤È»×¤ï¤ì¤Þ¤¹¤¬¡¢
+    ¼ã´³¤Î¼êľ¤·¤¬É¬Íפˤʤë¾ì¹ç¤â¤¢¤ë¤«¤È»×¤¤¤Þ¤¹¡£
+
+    ¡¦Vine Linux 2.6r4 (gcc 2.95.3) ¤ª¤è¤Ó Vine Linux 3.2 (gcc 3.4.4)
+    ¡¦Fedora core 3 (gcc 3.4.2) ¤ª¤è¤Ó Fedora core 4 (gcc 4.0.0)
+    ¡¦FreeBSD 5.4 (gcc 3.4.2) ¤ª¤è¤Ó FreeBSD 6.0 (gcc 3.4.4)
+    ¡¦NetBSD 2.0 (gcc 3.3.3) ¤ª¤è¤Ó NetBSD 3.0 (gcc 3.3.3)
+    ¡¦Solaris 10 1/06 for x64/x86 (i386-pc-solaris2.10)
+    ¡¦Darwin 8.0.1 for x86 (i386-apple-darwin8.0.1; gcc 3.3)
+    ¡¦MinGW & MSYS (gcc 3.4.4)
+    ¡¦cygwin (gcc 3.4.4)
+
+    Ʊº­¤Î configure ¥¹¥¯¥ê¥×¥È¤Ï GNU autoconf ¤ÇÀ¸À®¤µ¤ì¤¿¤â¤Î¤Ç¤¹¡£
+    ¤³¤Î configure ¥¹¥¯¥ê¥×¥È¤Ï¡¢°ìÈÌŪ¤Ê configure ¥¹¥¯¥ê¥×¥È¤¬Ç§¼±¤¹¤ë
+    ¥¹¥¤¥Ã¥Á¥ª¥×¥·¥ç¥ó¤Î¾¤Ë¡¢°Ê²¼¤Î¥¹¥¤¥Ã¥Á¤òǧ¼±¤·¤Þ¤¹¡£
+
+    ¡û --enable-shared / --enable-static
+
+      --enable-shared ¤ò»ØÄꤹ¤ë¤È¡¢GNU libtool ¤ò»È¤Ã¤Æ¶¦Í­¥é¥¤¥Ö¥é¥ê¤ò
+      ºîÀ®¤·¤Þ¤¹¡£--enable-static ¤ò»ØÄꤹ¤ë¤È¡¢Æ±¤¸¤¯ GNU libtool ¤ò
+      »È¤Ã¤ÆÀÅŪ¥é¥¤¥Ö¥é¥ê¤òºîÀ®¤·¤Þ¤¹¡£Î¾Êý¤ò»ØÄꤹ¤ë¤È¡¢¶¦Í­¥é¥¤¥Ö¥é¥ê
+      ¤ÈÀÅŪ¥é¥¤¥Ö¥é¥ê¤ÎξÊý¤òºîÀ®¤·¤Þ¤¹¡£µÕ¤Ë¤³¤ÎξÊý¤È¤â»ØÄꤷ¤Ê¤¤¤È¡¢
+      GNU libtool ¤ò»È¤ï¤º¤ËÀÅŪ¥é¥¤¥Ö¥é¥ê¤Î¤ß¤òºîÀ®¤·¤Þ¤¹¡£
+
+      ¥·¥¹¥Æ¥à¤Ë¥¤¥ó¥¹¥È¡¼¥ë¤¹¤ë¥é¥¤¥Ö¥é¥ê¤òºîÀ®¤¹¤ë¾ì¹ç¤ÏξÊý¤ò»ØÄꤹ¤ë
+      ɬÍפ¬¤¢¤ë¤Ç¤·¤ç¤¦¡£
+
+    ¡û --disable-mmx / --disable-3dnow / --disable-sse / --disable-sse2
+
+      ÆÃÄê¤Î SIMD Ì¿Î᥻¥Ã¥È¤Î¥µ¥Ý¡¼¥È(¥³¡¼¥É)¤òºï½ü¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¤·¤Þ¤¹¡£
+
+    ¡û --enable-uchar-boolean
+
+      ¥½¡¼¥¹¥³¡¼¥ÉÃæ¤Î bool ·¿¤ÎÄêµÁ¤ò int ·¿¤Ç¤Ï¤Ê¤¯ unsigned char ·¿¤Ë
+      Êѹ¹¤·¤Þ¤¹¡£¤³¤Î¥ª¥×¥·¥ç¥ó¤ÏÄ̾ï¤Ï(ÆäËɬÍפǤʤ¤¸Â¤ê)»ÈÍѤ·¤Ê¤¤¤Ç
+      ¤¯¤À¤µ¤¤¡£¤³¤ì¤Ï¡¢MinGW ¤Ê¤É¤Î Windows ¾å¤Î UNIX ´Ä¶­¤Ë¤Æ¡¢bool ·¿
+      ¤ÎÄêµÁ¤ò Windows ¤Î½¬´·¤Ë¹ç¤ï¤»¤ë¤¿¤á¤ËÍÑ°Õ¤µ¤ì¤Æ¤¤¤ë¤â¤Î¤Ç¤¹¡£
+
+    °Ê²¼¡¢³Æ¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Ë¸ÇÍ­¤ÎÃí°ÕÅÀ¤ò½Ò¤Ù¤Þ¤¹¡£
+
+    ¢¡ ³Æ¼ï linux ¥Ç¥£¥¹¥È¥ê¥Ó¥å¡¼¥·¥ç¥ó
+
+      ¤è¤Û¤É¸Å¤¤¤â¤Î¤Ç¤Ê¤¤¸Â¤ê¡¢¾åµ­¤Î¼ê½ç¤Ç²¿¤ÎÌäÂê¤â¤Ê¤¯¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤ë
+      ¤Ï¤º¤Ç¤¹¡£
+
+      ¾åµ­¤Î¼ê½ç¤Ç¶¦Í­¥é¥¤¥Ö¥é¥ê¤òºîÀ®¤·¤¿¾ì¹ç¡¢¤½¤Î¥Ð¡¼¥¸¥ç¥óÈÖ¹æ¤Ï
+      62.1.0 (¥Õ¥¡¥¤¥ë̾¡§libjpeg.so.62.1.0) ¤È¤Ê¤ê¤Þ¤¹¡£¤³¤ì¤ÏÁ°½Ò¤·¤¿
+      ¤È¤ª¤ê¡¢¥ª¥ê¥¸¥Ê¥ëÈÇ(¥Õ¥¡¥¤¥ë̾¡§libjpeg.so.62.0.0)¤È¥Ð¥¤¥Ê¥ê¥ì¥Ù¥ë
+      ¤Ç¤Î¾å°Ì¸ß´¹À­¤¬¤¢¤ê¤Þ¤¹¤Î¤Ç¡¢¥ª¥ê¥¸¥Ê¥ëÈǤȤ½¤Î¤Þ¤ÞÃÖ¤­´¹¤¨¤ë¤³¤È
+      ¤¬¤Ç¤­¤Þ¤¹¡£
+
+      rpm ¤ò»È¤Ã¤¿¥Ñ¥Ã¥±¡¼¥¸´ÉÍý¤òºÎÍѤ·¤Æ¤¤¤ë¥Ç¥£¥¹¥È¥ê¥Ó¥å¡¼¥·¥ç¥ó¤Ç¤Ï¡¢
+      Ʊº­¤Î spec ¥Õ¥¡¥¤¥ë (libjpeg.spec) ¤â¤´ÍøÍѤ¤¤¿¤À¤±¤Þ¤¹¡£¤³¤ì¤Ï¡¢
+      Vine Linux 3.2 ¤ª¤è¤Ó Fedora core 4 ¤Ç¤ÎÆ°ºî¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£
+
+    ¢¡ xBSD ¥Õ¥¡¥ß¥ê (FreeBSD/NetBSD/OpenBSD)
+
+      ºÇ¶á¤Î¥Ð¡¼¥¸¥ç¥ó¤Î FreeBSD ¤È NetBSD ¤Ë´Ø¤·¤Æ¤Ï¡¢¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤ë
+      ¤³¤È¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£OpenBSD ¤Ë´Ø¤·¤Æ¤âÌäÂê¤Ï¤Ê¤¤¤È»×¤¤¤Þ¤¹¡£
+      ¤¿¤À¡¢¥ª¥Ö¥¸¥§¥¯¥È¥Õ¥©¡¼¥Þ¥Ã¥È¤Ë a.out ¤ò»È¤Ã¤¿¸Å¤¤¤â¤Î¤Ë´Ø¤·¤Æ¤Ï¡¢
+      ¥Õ¥¡¥¤¥ë¥Õ¥©¡¼¥Þ¥Ã¥È¤Î¼ïÎà¤â°ì±þ configure ¥¹¥¯¥ê¥×¥È¤Ë¤Æ¸¡½Ð¤Ç¤­¤ë
+      ¤è¤¦¤Ë¤·¤Æ¤¢¤ê¤Þ¤¹¤¬¡¢Æ°ºî¤Ï̤³Îǧ¤Ç¤¹¡£
+
+      FreeBSD ¤Î¾ì¹ç¡¢¾åµ­¤Î¼ê½ç¤Ç¶¦Í­¥é¥¤¥Ö¥é¥ê¤òºîÀ®¤·¤¿¾ì¹ç¡¢¤½¤Î
+      ¥Ð¡¼¥¸¥ç¥óÈÖ¹æ¤Ï 9 (¥Õ¥¡¥¤¥ë̾¡§libjpeg.so.9) ¤È¤Ê¤ê¤Þ¤¹¡£¤³¤ì¤Ï¡¢
+      ports collection ¤ÎÃæ¤Ë¤¢¤ë¸ø¼°ÈǤΥС¼¥¸¥ç¥óÈÖ¹æ¤Ë½à¤¸¤¿¤â¤Î¤Ç¡¢
+      (¥Ð¥¤¥Ê¥ê¾å°Ì¸ß´¹¤Ê¤Î¤Ç)¸ø¼°ÈǤȤ½¤Î¤Þ¤ÞÃÖ¤­´¹¤¨¤ë¤³¤È¤¬¤Ç¤­¤Þ¤¹¡£
+      NetBSD/OpenBSD ¤Î¾ì¹ç¤Î¥Ð¡¼¥¸¥ç¥óÈÖ¹æ¤Ï 62.1.0 (¥Õ¥¡¥¤¥ë̾¡§
+      libjpeg.so.62.1.0) ¤Ë¤Ê¤ê¤Þ¤¹(¥Ð¥¤¥Ê¥ê¾å°Ì¸ß´¹)¡£
+
+    ¢¡ Solaris 10
+
+      ºî¼Ô¤Î¥Æ¥¹¥È¤Ç¤Ï¡¢Àµ¾ï¤Ë¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Æ¡¢¶¦Í­¥é¥¤¥Ö¥é¥ê¤¬¥Ð¥¤¥Ê¥ê
+      ¸ß´¹¤Ë¤Ê¤ë¤³¤È¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹(¥Õ¥¡¥¤¥ë̾¡§libjpeg.so.62.1.0)¡£
+      ¤Ç¤¹¤¬¡¢¤³¤Î SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Ï AMD64 ¤Ë¤Ï¸½»þÅÀ¤Ç¤ÏÂбþ
+      ¤·¤Æ¤¤¤Ê¤¤¤¿¤á¡¢32bitÈǤΥ饤¥Ö¥é¥ê¤·¤«ºî¤ì¤Þ¤»¤ó¡£
+
+    ¢¡ Darwin for x86
+
+      ºî¼Ô¤Î¥Æ¥¹¥È¤Ç¤Ï¡¢Àµ¾ï¤Ë¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Æ¡¢¶¦Í­¥é¥¤¥Ö¥é¥ê¤¬¥Ð¥¤¥Ê¥ê
+      ¸ß´¹¤Ë¤Ê¤ë¤³¤È¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹(¥Õ¥¡¥¤¥ë̾¡§libjpeg.62.1.0.dylib)¡£
+      x86 ÈÇ Mac OS X ¤Ç¤â¡¢¥³¥ó¥Ñ¥¤¥ë¤µ¤¨¤Ç¤­¤ì¤ÐÆ°ºî¤¹¤ë¤â¤Î¤È»×¤ï¤ì¤Þ¤¹¡£
+
+      ¸½»þÅÀ¤Ç¤Ï¡¢¥¢¥»¥ó¥Ö¥é nasm ¤Î Darwin ¤Ø¤Î¥µ¥Ý¡¼¥È¤¬½½Ê¬¤Ç¤Ê¤¤¤¿¤á¡¢
+      ¤ä¤ä¥È¥ê¥Ã¥­¡¼¤Ê¥³¡¼¥É¤Ç Darwin / Mac OS X ¤ËÂбþ¤µ¤»¤Æ¤¤¤Þ¤¹¤¬¡¢
+      Æ°ºî¤Ë¤Ï¤Þ¤Ã¤¿¤¯ÌäÂê¤Ê¤¤¤Ï¤º¤Ç¤¹¡£
+
+    ¢¡ MinGW & MSYS (gcc 3.4.4)
+
+      MinGW ¤Î¾ì¹ç¤Ï¡¢ÉÕ°¤Î makefile.mgw / makefile.mgwdll ¤ò»È¤¦¤³¤È¤ò
+      ¿ä¾©¤·¤Þ¤¹¤¬¡¢MSYS ¤¬¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ì¤Æ¤¤¤ì¤Ð configure ¥¹¥¯¥ê¥×¥È
+      ¤â»È¤¨¤Þ¤¹¡£¤³¤Î¾ì¹ç¤Ï¡¢configure ¥¹¥¯¥ê¥×¥È¤òÁö¤é¤»¤ëºÝ¤Ë
+      --enable-uchar-boolean ¤òɬ¤º»ØÄꤷ¤Æ¤¯¤À¤µ¤¤¡£¤³¤¦¤¹¤ë¤³¤È¤Ç¡¢
+      ¾¤Î Windows ·Ï¤Î½èÍý·Ï(VC++¤Ê¤É)¤¬½ÐÎϤ¹¤ë¥³¡¼¥É¤È¥Ð¥¤¥Ê¥ê¸ß´¹¤Ë
+      ¤Ê¤ê¤Þ¤¹¡£
+
+    ¢¡ cygwin (gcc 3.4.4)
+
+      ¤Þ¤ºÃí°Õ¤¹¤Ù¤­ÅÀ¤Ï¡¢cygwin ¤Î¾ì¹ç¡¢cygwin ¤«¤é¸ø¼°¤Ë¥ê¥ê¡¼¥¹¤µ¤ì¤Æ
+      ¤¤¤ë DLL (cygjpeg-62.dll) ¤È¤Ï¥Ð¥¤¥Ê¥ê¸ß´¹¤Ë¤Ï¤Ê¤ê¤Þ¤»¤ó¡£¤³¤ì¤Ï¡¢
+      ¸ø¼°ÈǤΥХ¤¥Ê¥ê¤Ë¤Ï lossless jpeg patch (ljpeg-6b.tar.gz) ¤È¤¤¤¦
+      ½¤Àµ¥Ñ¥Ã¥Á¤¬´Þ¤Þ¤ì¤Æ¤¤¤ë¤¿¤á¤Ç¡¢ÅöSIMDÈǤËÂФ·¤Æ¤³¤Î¥Ñ¥Ã¥Á¤òŬÍÑ
+      ¤¹¤ë¤³¤È¤Ïº¤Æñ¤À¤«¤é¤Ç¤¹¡£
+
+      ¥Ð¥¤¥Ê¥ê¸ß´¹¤Ç¤Ï¤Ê¤¤¤¿¤á¡¢¸ø¼°¥ê¥ê¡¼¥¹ÈǤΠDLL ¤ò¤³¤ÎSIMDÈǤÇÃÖ¤­
+      ´¹¤¨¤ë¤³¤È¤Ï¤Ç¤­¤Þ¤»¤ó¡£¤½¤Î¤¿¤áÅöSIMDÈǤΠDLL ¤Ï cygjpeg-162.dll
+      ¤È¤¤¤¦Ì¾Á°¤Ë¤Ê¤ë¤è¤¦¤Ë¤·¤Æ¤¢¤ê¤Þ¤¹¡£¤³¤ì¤ò¥·¥¹¥Æ¥à¤Ë¥¤¥ó¥¹¥È¡¼¥ë¤¹¤ë
+      ¤³¤È¤â¤Ç¤­¤Þ¤¹¤¬¡¢¤³¤ÎSIMDÈǤΠDLL ¤òÍøÍѤ¹¤ë¤Ë¤Ï¡¢JPEG ¥é¥¤¥Ö¥é¥ê¤ò
+      »ÈÍѤ·¤Æ¤¤¤ë¥½¥Õ¥È¤òºÆ¥³¥ó¥Ñ¥¤¥ë¡¿ºÆ¥ê¥ó¥¯¤¹¤ëɬÍפ¬¤¢¤ê¤Þ¤¹¡£
+
+      ¤Ê¤ª¡¢¤³¤Î DLL ¤Ë¤Ä¤±¤é¤ì¤ë¥Ð¡¼¥¸¥ç¥óÈÖ¹æ¤òÊѤ¨¤¿¤±¤ì¤Ð¡¢configure
+      ¥¹¥¯¥ê¥×¥È¤òÁö¤é¤»¤ëÁ°¤Ë config.ver ¤ÎÆâÍƤòÊѹ¹¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+
+¢£Ê£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg/djpeg (altui/)
+
+  ¥µ¥ó¥×¥ë¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¤Î cjpeg ¤È djpeg ¤Ë¤Ä¤¤¤Æ¤Ç¤¹¤¬¡¢¥Ç¥Õ¥©¥ë¥È¤Î
+  ¾õÂ֤ǥ³¥ó¥Ñ¥¤¥ë¤µ¤ì¤ë¤â¤Î(¾¤Î¥½¡¼¥¹¥Õ¥¡¥¤¥ë·²¤ÈƱ¤¸¾ì½ê¤Ë¤¢¤ë cjpeg.c
+  ¤È djpeg.c) ¤Ï¡¢°ìÅ٤˰ì¤Ä¤Î¥Õ¥¡¥¤¥ë¤·¤«ÊÑ´¹¤Ç¤­¤Ê¤¤¤â¤Î¤Ç¤¹¡£¤Ä¤Þ¤ê¡¢
+  ÆþÎÏ¥Õ¥¡¥¤¥ë¤Ï¥³¥Þ¥ó¥É¥é¥¤¥ó¾å¤Ë°ì¤Ä¤·¤«»ØÄê¤Ç¤­¤º¡¢½ÐÎϤÏɸ½à½ÐÎϤ«
+  -outfile ¥ª¥×¥·¥ç¥ó¤Ç»ØÄꤷ¤¿¥Õ¥¡¥¤¥ë¤Ë½ñ¤­½Ð¤µ¤ì¤Þ¤¹¡£³Æ¼ï¤Î UNIX ·Ï
+  OS ¤Ë¥¤¥ó¥¹¥È¡¼¥ë¤µ¤ìÍøÍѤµ¤ì¤Æ¤¤¤ë cjpeg/djpeg ¤Ï¤³¤Î¥¿¥¤¥×¤Î¤â¤Î¤Ç¤¹¡£
+
+  °ìÊý¡¢IJG ¤«¤é¸ø¼°¤Ë¥ê¥ê¡¼¥¹¤µ¤ì¤Æ¤¤¤ë MS-DOS ÈǤΠcjpeg/djpeg
+  (ftp://ftp.simtel.net/.2/simtelnet/msdos/graphics/jpeg6_b.zip) ¤Ç¤Ï¡¢
+  Ê£¿ô¤ÎÆþÎÏ¥Õ¥¡¥¤¥ë¤ò»ØÄê¤Ç¤­¡¢½ÐÎϤÏÆþÎÏ¥Õ¥¡¥¤¥ë¤ÈƱ¤¸¾ì½ê¤Ë¼«Æ°Åª¤Ë
+  ºî¤é¤ì¤Þ¤¹¡£¤³¤Î¥¿¥¤¥×¤Î cjpeg/djpeg ¤òºî¤ê¤¿¤±¤ì¤Ð¡¢altui/ ¤ÎÃæ¤Ë¤¢¤ë
+  cjpeg.c ¤È djpeg.c ¤ò¡¢¸µ¤«¤é¤¢¤ë(£±¥Õ¥¡¥¤¥ëÈǤÎ) cjpeg.c / djpeg.c ¤È
+  Æþ¤ìÂؤ¨¤Æ¥³¥ó¥Ñ¥¤¥ë¤·¤Æ¤¯¤À¤µ¤¤¡£¤³¤Î altui ÈǤΠcjpeg.c / djpeg.c ¤Ï¡¢
+  ¸µ¡¹ jpegaltui.v6b.tar.gz ¤È¤¤¤¦¥Õ¥¡¥¤¥ë̾¤ÇÇÛÉÛ¤µ¤ì¤Æ¤¤¤¿¤â¤Î¤ËÂФ·¤Æ
+  SIMD Âбþ²½¤Ë´Ø¤¹¤ë½¤Àµ¤ò²Ã¤¨¤¿¤â¤Î¤Ç¤¹¡£
+
+  ¤³¤ÎÊ£¿ô¥Õ¥¡¥¤¥ëÂбþÈǤΠcjpeg/djpeg ¤ò Windows ·Ï¤Ê¤É¤ÎÈó UNIX ´Ä¶­¤Ç
+  »ÈÍѤ¹¤ë¾ì¹ç¡¢ÆþÎÏ¥Õ¥¡¥¤¥ë̾¤ò¥ï¥¤¥ë¥É¥«¡¼¥É¤Ç»ØÄê¤Ç¤­¤ë¤è¤¦¤Ë¤¹¤ë¤Ë¤Ï¡¢
+  ³Æ¥³¥ó¥Ñ¥¤¥é¤Ë¸ÇÍ­¤ÎÆÃÊ̤ÊÀßÄ꤬ɬÍפˤʤë¾ì¹ç¤¬¤¢¤ê¤Þ¤¹¡£¤Ê¤¼¤Ê¤é¡¢
+  MS-DOS·Ï¡¿Windows·Ï¤Î´Ä¶­¤Ç¤Ï°ìÈ̤ˡ¢¥ï¥¤¥ë¥É¥«¡¼¥É¤ÎŸ³«½èÍý¤Ï¥³¥ó¥Ñ¥¤¥é
+  ¤ËÉÕ°¤Î¥¹¥¿¡¼¥È¥¢¥Ã¥×¥³¡¼¥ÉÆâ¤Ç¹Ô¤Ê¤ï¤ì¤ë¤¿¤á¤Ç¤¹¡£
+
+  MinGW ¤ä DJGPP V.2 ¤Ê¤É¤Î¾ì¹ç¤Ï¡¢¥ï¥¤¥ë¥É¥«¡¼¥É¤ÎŸ³«½èÍý¤ÏºÇ½é¤«¤éÍ­¸ú
+  ¤Ë¤Ê¤Ã¤Æ¤¤¤ë¤¿¤á¡¢ÆÃÊ̤ʤ³¤È¤ò¤·¤Ê¤¯¤Æ¤â¥ï¥¤¥ë¥É¥«¡¼¥É¤Ë¤è¤ë¥Õ¥¡¥¤¥ë»ØÄê
+  ¤Ï¤Ç¤­¤Þ¤¹¡£Microsoft Visual C++ ¤ä Borland C++ ¤Î¾ì¹ç¤Ï¡¢ÉáÄÌ¡¢¥ï¥¤¥ë¥É
+  ¥«¡¼¥ÉŸ³«¤òÍ­¸ú²½¤¹¤ë¥ª¥Ö¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë¤Ç¤¢¤ë setargv.obj ¤ä
+  wildargs.obj ¤ò EXE ¥Õ¥¡¥¤¥ë¤Î¥ê¥ó¥¯»þ¤Ë¾¤Î¥ª¥Ö¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë¤È°ì½ï¤Ë
+  ¥ê¥ó¥¯¤¹¤ë¤³¤È¤Ç¡¢¥ï¥¤¥ë¥É¥«¡¼¥ÉŸ³«¤òÍ­¸ú²½¤Ç¤­¤Þ¤¹¤¬¡¢¤³¤Î SIMD ÈÇ
+  cjpeg/djpeg ¤Î¾ì¹ç¤Ï¡¢setargv.obj ¤ä wildargs.obj ¤ÎÆâÍƤËÁêÅö¤¹¤ë¥³¡¼¥É¤ò
+  cjpeg.c/djpeg.c ¤ËľÀܽñ¤­¹þ¤ó¤Ç¤¢¤ë¤¿¤á¡¢¥×¥í¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë¤ä Makefile
+  ¤Ë¾åµ­¤Î¥ª¥Ö¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë¤òÄɲ䷤ʤ¯¤Æ¤â¡¢¥ï¥¤¥ë¥É¥«¡¼¥É¤Ë¤è¤ë¥Õ¥¡¥¤¥ë
+  »ØÄ꤬¤Ç¤­¤ë¤è¤¦¤Ë¤·¤Æ¤¢¤ê¤Þ¤¹¡£¤³¤ì¤é°Ê³°¤Î£Ã¥³¥ó¥Ñ¥¤¥é¤ò»ÈÍѤ·¤¿¾ì¹ç¤Ç¡¢
+  ¥ï¥¤¥ë¥É¥«¡¼¥É¤Ë¤è¤ë¥Õ¥¡¥¤¥ë»ØÄ꤬¤Ç¤­¤Ê¤¤¾ì¹ç¤Ï¡¢¥³¥ó¥Ñ¥¤¥é¤Î¥Þ¥Ë¥å¥¢¥ë
+  ¤ò»²¾È¤·¤Æ¡¢¥ï¥¤¥ë¥É¥«¡¼¥ÉŸ³«¤òÍ­¸ú²½¤¹¤ëÀßÄê¤Ç¥³¥ó¥Ñ¥¤¥ë¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+  °ìÊý¡¢linux ¤Ê¤É¤Î UNIX ´Ä¶­¤Ç¤Ï¡¢¥ï¥¤¥ë¥É¥«¡¼¥É¤ÎŸ³«½èÍý¤Ï¥×¥í¥°¥é¥à¤¬
+  µ¯Æ°¤µ¤ì¤ëÁ°¤Ë¥³¥Þ¥ó¥É¥·¥§¥ë¤Ë¤è¤Ã¤Æ¹Ô¤Ê¤ï¤ì¤ë¤¿¤á¡¢¥³¥ó¥Ñ¥¤¥é¤ÎÀßÄê¤Ê¤É
+  ¤ÏɬÍפ¢¤ê¤Þ¤»¤ó¡£¥ï¥¤¥ë¥É¥«¡¼¥É¤Ë¤è¤ë¥Õ¥¡¥¤¥ë̾»ØÄê¤Ï¾ï¤Ë»È¤¨¤Þ¤¹¡£
+
+
+¢£¥³¡¼¥É¥µ¥¤¥º¤ò¸º¤é¤¹¤Ë¤Ï
+
+  SIMD ¥³¡¼¥É¤òÉղä·¤¿¤¿¤á¡¢¤½¤Îʬ¤À¤±¥³¡¼¥É¥µ¥¤¥º¤¬Áý¤¨¤Æ¤¤¤Þ¤¹¡£¤Ç¤¹¤¬¡¢
+  JPEG library ¤ò°Ê²¼¤Î¤è¤¦¤Ê¥Ç¥Õ¥©¥ë¥È¤Î¾õÂ֤Ǿï¤Ë»ÈÍѤ·¤Æ¤¤¤ë¤Î¤Ê¤é¤Ð¡¢
+  jmorecfg.h ¤ÎÃæ¤Ë¤¢¤ëÀßÄê¹àÌÜ(¥Þ¥¯¥í)¤ò°Ê²¼¤Î¤è¤¦¤ËÊѹ¹¤·¤Æ¥³¥ó¥Ñ¥¤¥ë
+  ¤¹¤ë¤³¤È¤Ç¡¢»ÈÍѤµ¤ì¤Ê¤¤¥³¡¼¥É¤ò½ü³°¤¹¤ë¤³¤È¤¬¤Ç¤­¡¢¥³¡¼¥É¥µ¥¤¥º¤ò¸º¤é¤¹
+  ¤³¤È¤¬¤Ç¤­¤Þ¤¹¡£
+
+  ¡û cinfo.dct_method ¤ÎÃͤòÊѹ¹¤·¤Æ¤¤¤Ê¤¤¾ì¹ç
+
+    ¤³¤ÎÊÑ¿ô¤Ï DCT±é»»¤ÎÊýË¡¤ò»ØÄꤷ¡¢cjpeg/djpeg ¤Ç¤Ï -dct ¥ª¥×¥·¥ç¥ó¤Ë
+    Âбþ¤·¤Þ¤¹¡£¤³¤ÎÊÑ¿ô¤ÎÃͤϥǥե©¥ë¥È¤Ç¤Ï JDCT_ISLOW ¤Ç¡¢ÆäËÍýͳ¤Î
+    ¤Ê¤¤¸Â¤ê¤³¤Î¥Ç¥Õ¥©¥ë¥È¤Î¾õÂ֤ǻÈÍѤ¹¤ë¤³¤È¤ò¶¯¤¯¿ä¾©¤·¤Þ¤¹¡£°Ê²¼¤Î
+    ¥Þ¥¯¥í¤ò #undef ¤Ë¤¹¤ë¤³¤È¤Ç¡¢JDCT_ISLOW ¤Î¾õÂ֤ǤϷ褷¤Æ»ÈÍѤµ¤ì¤Ê¤¤
+    ¥³¡¼¥É¤ò½ü³°¤Ç¤­¤Þ¤¹¡£
+
+    #define DCT_IFAST_SUPPORTED  ->  #undef DCT_IFAST_SUPPORTED
+    #define DCT_FLOAT_SUPPORTED  ->  #undef DCT_FLOAT_SUPPORTED
+
+    ¤³¤ì¤À¤±¤Ç¤â¤«¤Ê¤ê¤Î¥³¡¼¥É¥µ¥¤¥º¤¬ºï¸º¤Ç¤­¤Þ¤¹¡£ÆÃ¤Ë DCT_FLOAT_SUPPORTED
+    ¤ò #undef ¤Ë¤¹¤ë¤È¡¢3DNow! ¤È SSE ¤Î¥µ¥Ý¡¼¥È¤â¼«Æ°Åª¤Ë̵¸ú¤Ë¤Ê¤ê¤Þ¤¹¡£
+
+  ¡ûŸ³«½èÍý¤Ç cinfo.do_fancy_upsampling ¤ÎÃͤòÊѹ¹¤·¤Æ¤¤¤Ê¤¤¾ì¹ç
+
+    ¤³¤ÎÊÑ¿ô¤Ï djpeg ¤Ç¤Ï -nosmooth ¥ª¥×¥·¥ç¥ó¤ËÁêÅö¤·¡¢-nosmooth ¤ò»ØÄê
+    ¤¹¤ë¤È FALSE ¤ËÀßÄꤵ¤ì¤Þ¤¹¡£¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï TRUE ¤Ç¡¢¤³¤ì¤âÆäËÍýͳ¤Î
+    ¤Ê¤¤¸Â¤ê¤³¤Î¥Ç¥Õ¥©¥ë¥È¤Î¾õÂ֤ǻÈÍѤ¹¤ë¤³¤È¤ò¶¯¤¯¿ä¾©¤·¤Þ¤¹¡£°Ê²¼¤Î
+    ¥Þ¥¯¥í¤ò #undef ¤Ë¤¹¤ë¤³¤È¤Ç¡¢TRUE ¤Î¾õÂ֤ǤϷ褷¤Æ»ÈÍѤµ¤ì¤Ê¤¤¥³¡¼¥É¤ò
+    ½ü³°¤Ç¤­¤Þ¤¹¡£
+
+    #define UPSAMPLE_MERGING_SUPPORTED  ->  #undef UPSAMPLE_MERGING_SUPPORTED
+
+  ¡ûŸ³«½èÍý¤Ç cinfo.scale_num, cinfo.scale_denom ¤ÎÃͤòÊѹ¹¤·¤Æ¤¤¤Ê¤¤¾ì¹ç
+
+    ¤³¤ì¤ÏÍפ¹¤ë¤Ë¡ÖJPEG½Ì¾®Å¸³«¡×¤Îµ¡Ç½¤Ç¡¢djpeg ¤Ç¤Ï -scale M/N ¥ª¥×
+    ¥·¥ç¥ó¤ËÁêÅö¤·¤Þ¤¹¡£¥µ¥à¥Í¥¤¥ëºîÀ®¤Ê¤É¤Î¾ì¹ç¤ËÍøÍѤµ¤ì¤ë¤³¤È¤¬Â¿¤¤
+    µ¡Ç½¤Ç¤¹¤¬¡¢¤³¤ì¤ò¤Þ¤Ã¤¿¤¯»ÈÍѤ·¤Æ¤¤¤Ê¤¤¾ì¹ç¤Ï¡¢°Ê²¼¤Î¥Þ¥¯¥í¤ò #undef
+    ¤Ë¤¹¤ë¤³¤È¤Ç¡¢¥³¡¼¥ÉÎ̤òºï¸º¤Ç¤­¤Þ¤¹¡£
+
+    #define IDCT_SCALING_SUPPORTED  ->  #undef IDCT_SCALING_SUPPORTED
+
+  Ãí°ÕÅÀ¤È¤·¤Æ¡¢¤³¤ì¤é¤ÎÀßÄêÊÑ¿ô¤¬¤É¤Î¤è¤¦¤Ê¾õÂ֤ǻȤï¤ì¤ë¤«Í½Â¬¤Ç¤­¤Ê¤¤
+  ¾ì¹ç¡¢¤¿¤È¤¨¤Ð¡¢¥·¥¹¥Æ¥à¤Ë¥¤¥ó¥¹¥È¡¼¥ë¤¹¤ë¶¦Í­¥é¥¤¥Ö¥é¥ê¤òºî¤ë¾ì¹ç¤Ê¤É
+  ¤Ï¡¢¤³¤¦¤¤¤Ã¤¿¥³¡¼¥Éºï¸º¤Ï¹Ô¤Ê¤¦¤Ù¤­¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£¥³¡¼¥Éºï¸º¤ò¹Ô¤Ê¤¦
+  ¤Î¤Ï¡¢JPEG ¥é¥¤¥Ö¥é¥ê¤Î»È¤ï¤ìÊý¤¬¤è¤¯¤ï¤«¤Ã¤Æ¤¤¤ëÆÃÄê¤Î¥¢¥×¥ê¥±¡¼¥·¥ç¥ó
+  ¤Ë¥ê¥ó¥¯¤¹¤ë¾ì¹ç¤Î¤ß¤Ë¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+
+¢£ÆÃÄê¤Î SIMD Ì¿Îá¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤¹¤ë¤Ë¤Ï
+
+  ¤³¤ì¤Ï jconfig.h ¤ÎÃæÄø¤Ë¤¢¤ë¡¢#undef JSIMD_***_NOT_SUPPORTED ¤È¤¤¤¦
+  ¥Þ¥¯¥í¤ò #define ¤Ë¤¹¤ë¤³¤È¤Ç¼Â¸½¤Ç¤­¤Þ¤¹¡£configure ¥¹¥¯¥ê¥×¥È¤Ç
+  --disable-mmx ¤Ê¤É¤Î¥ª¥×¥·¥ç¥ó¤ò»ØÄꤷ¤¿¾ì¹ç¤Ï¡¢¤³¤Î¥Þ¥¯¥í¤Ï¼«Æ°Åª¤Ë
+  #define ¤µ¤ì¤Þ¤¹¡£
+
+  3DNow! ¤È SSE ¤Ï¸µ¡¹¡¢ÉâÆ°¾®¿ôÅÀDCT¤Ë¤·¤«ÍøÍѤµ¤ì¤Æ¤¤¤Þ¤»¤ó¤Î¤Ç¡¢¾å½Ò¤Î
+  DCT_FLOAT_SUPPORTED ¤ò #undef ¤Ë¤·¤¿¤À¤±¤Ç¤Þ¤È¤á¤Æ̵¸ú¤Ë¤µ¤ì¤Þ¤¹¡£
+  MMX ¤È SSE2 ¤Ï¡¢°µ½ÌŸ³«½èÍý¤Î³Æ½ê¤ËÍøÍѤµ¤ì¤Æ¤¤¤Æ¡¢¹â®²½¤Ø¤Î¹×¸¥ÅÙ¤¬
+  ¹â¤¤¤Î¤Ç¡¢Í­¸ú¤Ë¤·¤Æ¤ª¤¯¤³¤È¤ò¤ªÁ¦¤á¤·¤Þ¤¹¤¬¡¢ÁȤ߹þ¤ßÍÑÅӤʤɡ¢¥³¡¼¥É
+  ¤òÁö¤é¤»¤ë¥×¥í¥»¥Ã¥µ¤Î¼ïÎब¤ï¤«¤Ã¤Æ¤¤¤ë¾ì¹ç¤Ï¡¢¡ÖÄ̾ïÍøÍѤµ¤ì¤Ê¤¤Êý¡×
+  ¤Î¥µ¥Ý¡¼¥È¤ò³°¤¹¤³¤È¤Ç¡¢¥³¡¼¥É¥µ¥¤¥º¤Îºï¸º¤¬¤Ç¤­¤Þ¤¹¡£
+
+
+
+[EOF]
diff --git a/simd_internal.ja.txt b/simd_internal.ja.txt
new file mode 100644
index 0000000..d234901
--- /dev/null
+++ b/simd_internal.ja.txt
@@ -0,0 +1,293 @@
+Independent JPEG Group's JPEG software release 6b
+  with x86 SIMD extension for IJG JPEG library version 1.02
+    == INTERNAL ==
+-----------------------------------------------------------
+
+¢£¤³¤Î¥Õ¥¡¥¤¥ë¤Ï
+
+  ¤³¤Î¥Õ¥¡¥¤¥ë¤Ç¤Ï¡¢SIMD ÈÇ libjpeg ¥é¥¤¥Ö¥é¥ê¤Î¡¢SIMD ³ÈÄ¥Éôʬ¤Î¾ÜºÙ¤ò
+  ²òÀ⤷¤Þ¤¹¡£SIMD ³ÈÄ¥Éôʬ¤ËÂФ·¤Æ²¿¤é¤«¤Î¼ê¤ò²Ã¤¨¤¿¤¤¾ì¹ç¤ä¡¢É¸½à¤Ç¤Ï
+  Âбþ¤·¤Æ¤¤¤Ê¤¤¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤ËÂбþ¤µ¤»¤¿¤¤¾ì¹ç¤Ê¤É¤Ï¡¢¤³¤³¤òÆɤó¤Ç
+  ¤¯¤À¤µ¤¤¡£
+
+    ¢£¥Õ¥¡¥¤¥ë¥Õ¥©¡¼¥Þ¥Ã¥È¡¿¸Æ¤Ó½Ð¤·µ¬Ìó(ABI)¤Î»ØÄê
+    ¢£OS ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯ (jsimdgcc.c / jsimddjg.asm / jsimdw32.asm)
+    ¢£¥¢¥»¥ó¥Ö¥ê¸À¸ìÍÑÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤ÎºîÀ® (makecfg.c)
+    ¢£SIMD Ì¿Îá¤Î¼Â¹Ô»þ¤ÎÁªÂò¡¿SIMD Æ°ºî¥â¡¼¥É¾ðÊó
+    ¢£¤½¤Î¤Û¤«¤ÎÀßÄê¹àÌÜ¥Þ¥¯¥í
+      ¡û RGB_RED / RGB_GREEN / RGB_BLUE / RGB_PIXELSIZE
+      ¡û RGBX_FILLER_0XFF
+      ¡û JFDCT_INT_QUANTIZE_WITH_DIVISION
+      ¡û UPSAMPLE_H1V2_SUPPORTED
+
+
+¢£¥Õ¥¡¥¤¥ë¥Õ¥©¡¼¥Þ¥Ã¥È¡¿¸Æ¤Ó½Ð¤·µ¬Ìó(ABI)¤Î»ØÄê
+
+  ¥¢¥»¥ó¥Ö¥ê¸À¸ì¤Ç½ñ¤«¤ì¤¿¥³¡¼¥É¤ò¡¢£Ã¸À¸ì¤Ê¤É¤Î¹âµé¸À¸ì¤Ç½ñ¤«¤ì¤¿¥³¡¼¥É
+  ¤È¥ê¥ó¥¯¤¹¤ë¤Ë¤Ï¡¢¥ª¥Ö¥¸¥§¥¯¥È¥Õ¥¡¥¤¥ë¤Î¥Õ¥©¡¼¥Þ¥Ã¥È¤ò°ìÃפµ¤»¤ë¤³¤È¡¢
+  ¤ª¤è¤Ó¡¢¸Æ¤Ó½Ð¤·µ¬Ìó¤Ê¤É¤Î¥Ð¥¤¥Ê¥ê¥³¡¼¥É¤Îµ¬Ìó(ABI)¤ò°ìÃפµ¤»¤ë¤³¤È¤¬
+  ɬÍפǤ¹¡£¤³¤Î¥½¥Õ¥È¤Ç¤Ï¡¢¤³¤ì¤é¤Î»ØÄê¤ò¡¢¥¢¥»¥ó¥Ö¥é nasm ¤ËÍ¿¤¨¤ë¥ª¥×
+  ¥·¥ç¥ó¤Ç»ØÄꤷ¤Æ¤¤¤Þ¤¹¡£
+
+  ¡¦nasm -fwin32 -DWIN32 ...
+
+    Win32 ¤Î coff ¥Õ¥©¡¼¥Þ¥Ã¥È¡£Microsoft Visual C++ ¤ä MinGW¡¦Cygwin
+    ¤Ê¤É¡¢Win32 ¥³¥ó¥Ñ¥¤¥é¤ÎÂçÉôʬ¤¬³ºÅö¡£
+
+  ¡¦nasm -fobj -DOBJ32 ...
+
+    Win32 ¤Î obj ¥Õ¥©¡¼¥Þ¥Ã¥È¡£¸µ¡¹¤Ï MS-DOS ¤Ç»È¤ï¤ì¤Æ¤¤¤¿ obj ·Á¼°
+    (MSOMF)¤ò 32bit ¤Ë³ÈÄ¥¤·¤¿¤â¤Î¡£Borland C++ Compiler (Win32) ¤Ê¤É¡£
+
+  ¡¦nasm -felf -DELF ...
+
+    ³Æ¼ï¤Î UNIX ¤Ç¹­¤¯ºÎÍѤµ¤ì¤Æ¤¤¤ë ELF ¥Õ¥©¡¼¥Þ¥Ã¥È¡£linux ¤ä xBSD
+    ¥Õ¥¡¥ß¥ê¤Ê¤É¡¢¸½ºß¤Î UNIX ¤ÎÂçÉôʬ¤¬³ºÅö¡£
+
+  ¡¦nasm -faoutb -DAOUT ...
+
+    °ÊÁ°¤Î xBSD ¥Õ¥¡¥ß¥ê¤Ç»È¤ï¤ì¤Æ¤¤¤¿ a.out ¥Õ¥©¡¼¥Þ¥Ã¥È¡£
+
+  ¡¦nasm -fmacho -DMACHO ...
+
+    Darwin (MacOS X) ¤Ê¤É¤ÇºÎÍѤµ¤ì¤Æ¤¤¤ë Mach-O ¥Õ¥©¡¼¥Þ¥Ã¥È¡£
+    Ãí¡Ë-fmacho ¥ª¥×¥·¥ç¥ó¤Ï nasm 0.98.40 °Ê¹ß¤Ç¥µ¥Ý¡¼¥È¤µ¤ì¤Þ¤¹¡£
+
+  ¡¦nasm -fcoff -DDJGPP ...
+
+    MS-DOS ¤Î DJGPP ¥³¥ó¥Ñ¥¤¥é¤Ç»È¤ï¤ì¤ë coff ¥Õ¥©¡¼¥Þ¥Ã¥È¡£
+
+  ¤³¤Î¤¦¤Á¡¢-f ¥ª¥×¥·¥ç¥ó¤Ï nasm ¤¬²ò¼á¤¹¤ë¥Õ¥¡¥¤¥ë¥Õ¥©¡¼¥Þ¥Ã¥È¤Î»ØÄê»Ò¤Ç¡¢
+  -D ¥ª¥×¥·¥ç¥ó(¥Þ¥¯¥í¤ÎÄêµÁ)¤Ï jsimdext.inc ¤ÎÃæ¤Ç²ò¼á¤µ¤ì¤ë¥Ð¥¤¥Ê¥êµ¬Ìó
+  (ABI)¤Î»ØÄê»Ò¤Ç¤¹¡£jsimdext.inc ¤Ç¤Ï¡¢-D ¥ª¥×¥·¥ç¥ó¤Ç¤Î¥Þ¥¯¥íÄêµÁ¤Ë½¾¤Ã¤Æ¡¢
+  ¥»¥°¥á¥ó¥È(¥»¥¯¥·¥ç¥ó)¤ÎÄêµÁ¤ä³°Éô̾̾Á°Áõ¾þ¤ÎÄêµÁ¤ò¹Ô¤Ê¤Ã¤Æ¤¤¤Þ¤¹¡£
+  ¾Ü¤·¤¯¤Ï jsimdext.inc ¤ò¤´Í÷¤¯¤À¤µ¤¤¡£
+
+  ELF ·Á¼° ¤ª¤è¤Ó a.out ·Á¼° ¤Î¾ì¹ç¡¢-DPIC ¤òÄɲ䷤ƻØÄꤹ¤ë¤È¥³¡¼¥É¤¬
+  Position Independent Code (°ÌÃÖÆÈΩ¥³¡¼¥É) ¤Ë¤Ê¤ê¤Þ¤¹¡£-DPIC ¤Ï
+  jsimdext.inc ¤ÎÃæ¤Ç²ò¼á¤µ¤ì¡¢¥³¡¼¥É¤ò PIC ¤Ë¤¹¤ë¤¿¤á¤Î¥Þ¥¯¥í¤òÄêµÁ¤·¤Þ¤¹¡£
+  Mach-O ·Á¼°¤Î¾ì¹ç¤Ï¡¢¥³¡¼¥É¤Ï¾ï¤Ë PIC ¤Ç¤¢¤ëɬÍפ¬¤¢¤ë¤¿¤á¡¢-DPIC ¤ò
+  »ØÄꤷ¤Ê¤¯¤Æ¤â¾ï¤Ë PIC ·Á¼°¤Î¥³¡¼¥É¤òÀ¸À®¤·¤Þ¤¹¡£
+
+  ¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Ë±þ¤¸¤Æ¡¢¤³¤ì¤é¤ÎÃ椫¤éŬÀڤʤâ¤Î¤òÁªÂò¤¹¤ëɬÍפ¬¤¢¤ê
+  ¤Þ¤¹¡£Æ±º­¤Î makefile ¤Ç¤Ï¡¢¤¢¤é¤«¤¸¤áŬÀڤʤâ¤Î¤¬»ØÄꤵ¤ì¤Æ¤¤¤Þ¤¹¡£
+  configure ¥¹¥¯¥ê¥×¥È¤Ç¤Ï¡¢config.guess ¤¬½ÐÎϤ¹¤ë¥Û¥¹¥È¾ðÊó¤ò¸µ¤ËÁªÂò
+  ¤·¤Æ¤¤¤Þ¤¹¡£
+
+
+¢£OS ¤Î SIMD ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯ (jsimdgcc.c / jsimddjg.asm / jsimdw32.asm)
+
+  SIMD Ì¿Îá¤ò¼Â¹Ô¤¹¤ë¤Ë¤Ï¡¢»öÁ°¤Î CPU ¤Î¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤À¤±¤Ç¤Ï¤Ê¤¯¡¢
+  OS ¤Î¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤âɬÍפǤ¹¡£ÆÃ¤Ë SSE/SSE2 ¤Ë¤Ä¤¤¤Æ¤Ï¡¢OS ¦¤Ç
+  SSE/SSE2 Ì¿Îá¤ò¼Â¹Ô¤Ç¤­¤ë¤è¤¦¤Ë»öÁ°¤ËCPU¤òÀßÄꤹ¤ëɬÍפ¬¤¢¤ê¡¢¤½¤ì¤ò
+  ¹Ô¤Ê¤Ã¤Æ¤¤¤Ê¤¤ OS ¤Ç¤Ï¡¢OS ¤¬¥·¥ó¥°¥ë¥¿¥¹¥¯¤«¥Þ¥ë¥Á¥¿¥¹¥¯¤«¤Ë´Ø¤ï¤é¤º¡¢
+  SSE/SSE2 ¤Ï¼Â¹Ô¤Ç¤­¤Þ¤»¤ó¡£¤µ¤é¤Ë¡¢¤¢¤Þ¤êÃΤé¤ì¤Æ¤¤¤Þ¤»¤ó¤¬¡¢CPU ¤Ë
+  Æ⢤µ¤ì¤¿ FPU (¿ôÃͱ黻¥×¥í¥»¥Ã¥µ) ¤ò»ÈÍѤ·¤Ê¤¤(¥¨¥ß¥å¥ì¡¼¥È¤¹¤ë)ÀßÄê
+  ¤Ë¤Ê¤Ã¤Æ¤¤¤ë¤È¡¢MMX ¤ä 3DNow! ¤â´Þ¤á¤¹¤Ù¤Æ¤Î SIMD Ì¿Îá¤ÏÁ´¤¯¼Â¹Ô¤Ç¤­
+  ¤Þ¤»¤ó¡£Íפ¹¤ë¤Ë¡¢SIMD Ì¿Îá¤ò¼Â¹Ô¤Ç¤­¤ë¤«¤É¤¦¤«¤òÄ´¤Ù¤ë¤Ë¤Ï¡¢CPUID ¤Î
+  ¥Õ¥é¥°¤òÄ´¤Ù¤ë¤À¤±¤Ç¤ÏÉÔ½½Ê¬¤È¤¤¤¦¤³¤È¤Ç¤¹¡£
+
+  SIMD Ì¿Îá¤Î OS ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤È¤¤¤Ã¤Æ¤â¡¢Êݸî¥â¡¼¥É¤ÇÆ°¤¤¤Æ¤¤¤ë
+  ¥×¥í¥°¥é¥à¤Î¾ì¹ç¡¢CPU ¤ÎÀßÄê¥Õ¥é¥°¤Î¾õÂÖ¤òľÀܥ桼¥¶¡¦¥×¥í¥°¥é¥à¤«¤é
+  ÆɤߤȤ뤳¤È¤¬¤Ç¤­¤Ê¤¤¤¿¤á¡¢SIMD Ì¿Îá¤ò»î¤·¤Ë¼Â¹Ô¤·¤Æ¤ß¤Æ̵¸úÌ¿ÎáÎã³°¤¬
+  ȯÀ¸¤¹¤ë¤«¤É¤¦¤«¤ò³Î¤«¤á¤ë¤È¤¤¤¦¡¢°Ü¿¢À­¤Î°­¤¤´ÖÀÜŪ¤ÊÊýË¡¤òºÎ¤é¤¶¤ë¤ò
+  ÆÀ¤Ê¤¤¤Î¤¬¸½¾õ¤Ç¤¹¡£
+
+  ¤³¤Î SIMD Ì¿Îá¤Î OS ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤ò¤ä¤Ã¤Æ¤¤¤ë¤Î¤¬¡¢jsimdgcc.c /
+  jsimddjg.asm / jsimdw32.asm ¤Î£³¤Ä¤Î¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤Ç¤¹¡£¤½¤ì¤¾¤ì¡¢
+  UNIX/gccÍÑ¡¢DJGPPÍÑ¡¢Win32ÍѤǤ¹¡£jsimdgcc.c ¤Ç¤Ï¡¢Îã³°¤ÎȯÀ¸¤ò
+  signal() ´Ø¿ô¤Î¥·¥°¥Ê¥ë¥Ï¥ó¥É¥é¤ÇÊá¤Þ¤¨¤Æ¤¤¤Þ¤¹¡£¥³¡¼¥É¤Î°ìÉô¤Ë gcc ¤Î
+  ¥¤¥ó¥é¥¤¥ó¥¢¥»¥ó¥Ö¥é¤ò»È¤Ã¤Æ¤¤¤ë¤¿¤á¡¢gcc ÀìÍѤǤ¹¡£gcc °Ê³°¤Ç¤â
+  ¥³¥ó¥Ñ¥¤¥ë¤Ï¤Ç¤­¤Þ¤¹¤¬¡¢¤³¤Î¾ì¹ç¤Ï SIMD Ì¿Îá¤Î¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤Ï
+  ¹Ô¤Ê¤ï¤ì¤Þ¤»¤ó¡£jsimddjg.asm ¤Ï DPMI ¤ÎÎã³°½èÍýµ¡¹½¤òľÀÜÍøÍѤ·¤¿
+  ÊýË¡¤Ç¡¢jsimdw32.asm ¤Ï Win32 ¤ÎÎã³°½èÍýµ¡¹½¤òľÀÜÍøÍѤ·¤¿ÊýË¡¤Ç¤¹¡£
+
+  ¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Ë±þ¤¸¤Æ¡¢¤³¤Î£³¼ïÎà¤ÎÃ椫¤éŬÀڤʤâ¤Î¤òÁªÂò¤¹¤ëɬÍפ¬
+  ¤¢¤ê¤Þ¤¹¡£Æ±º­¤Î makefile ¤Ç¤Ï¡¢¤¢¤é¤«¤¸¤áŬÀڤʤâ¤Î¤¬»ØÄꤵ¤ì¤Æ¤¤¤Þ¤¹¡£
+  configure ¥¹¥¯¥ê¥×¥È¤Ç¤Ï¡¢config.guess ¤¬½ÐÎϤ¹¤ë¥Û¥¹¥È¾ðÊó¤ò¸µ¤ËÁªÂò
+  ¤·¤Æ¤¤¤Þ¤¹¡£
+
+  ¤³¤Î£³¼ïÎà¤Î¤É¤ì¤È¤âŬ¹ç¤·¤Ê¤¤¥×¥é¥Ã¥È¥Õ¥©¡¼¥à¤Î¾ì¹ç¤Ï¡¢¿·¤¿¤Ê¥Á¥§¥Ã¥¯
+  ´Ø¿ô¤ò½ñ¤¯É¬Íפ¬¤¢¤ê¤Þ¤¹¡£¤Ç¤¹¤¬¡¢¤½¤Î OS ¤¬Á´¤Æ¤Î SIMD Ì¿Îá¤ò¥µ¥Ý¡¼¥È
+  ¤·¤Æ¤¤¤ë OS ¤Ç¤¢¤ë¤³¤È¤¬¤ï¤«¤Ã¤Æ¤¤¤ë¾ì¹ç¤Ï¡¢¤³¤Î OS ¥µ¥Ý¡¼¥È¥Á¥§¥Ã¥¯¤Ï
+  ¾Êά¤¹¤ë¤³¤È¤â²Äǽ¤Ç¤¹¡£°Ê²¼¤Î¤è¤¦¤Ê¶õ¤Î´Ø¿ô¤Î¤ß¤Î¥½¡¼¥¹¥Õ¥¡¥¤¥ë¤ò
+  ºî¤Ã¤Æ¾åµ­¤Î£³¤Ä¤Î¥Õ¥¡¥¤¥ë¤ÎÂå¤ï¤ê¤Ë»ÈÍѤ¹¤ë¤«¡¢¤â¤·¤¯¤Ï jcomapi.c ¤ò
+  ²þÊѤ·¤Æ jpeg_simd_os_support ¤Î¸Æ¤Ó½Ð¤·¤ò¥Ð¥¤¥Ñ¥¹¤¹¤ë¤è¤¦¤Ë¤¹¤ì¤Ð£Ï£Ë
+  ¤Ç¤¹¡£
+
+    GLOBAL(unsigned int)
+    jpeg_simd_os_support (unsigned int simd)
+    {
+      return simd;
+    }
+
+
+¢£¥¢¥»¥ó¥Ö¥ê¸À¸ìÍÑÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc ¤ÎºîÀ® (makecfg.c)
+
+  Åö¥½¥Õ¥È¤Î¾ì¹ç¡¢¥¢¥»¥ó¥Ö¥ê¸À¸ì¤Î¥½¡¼¥¹¥³¡¼¥ÉÃ椫¤é£Ã¸À¸ì¤Î¥Ø¥Ã¥À¥Õ¥¡¥¤¥ë
+  ¤Ë¤¢¤ë¾ðÊ󡢤¿¤È¤¨¤Ð¡¢¥×¥ê¥×¥í¥»¥Ã¥µ¥Þ¥¯¥í¤ÎÃͤ乽¤ÂΤÎÃæ¤Ë¤¢¤ëÊÑ¿ô¤Î
+  ¥ª¥Õ¥»¥Ã¥È¤Ê¤É¡¢¤òÃΤëɬÍפ¬¤¢¤ê¤Þ¤¹¡£makecfg.c ¤Ï¡¢JPEG ¥é¥¤¥Ö¥é¥ê¤Î
+  ¥³¥ó¥Ñ¥¤¥ë¤ËÀèΩ¤Ã¤Æ¥³¥ó¥Ñ¥¤¥ë¡¦¥ê¥ó¥¯¡¦¼Â¹Ô¤µ¤ì¡¢¥¢¥»¥ó¥Ö¥ê¸À¸ì¦¤Î
+  ¥½¡¼¥¹¥³¡¼¥É¤ÇɬÍפȤʤë¾ðÊó¤ò¥¢¥»¥ó¥Ö¥ê¸À¸ìÍÑÀßÄê¥Õ¥¡¥¤¥ë jsimdcfg.inc
+  ¤È¤·¤Æ½ÐÎϤ¹¤ëƯ¤­¤ò¤·¤Æ¤¤¤Þ¤¹¡£
+
+  Ãí°ÕÅÀ¤È¤·¤Æ¡¢makecfg.c ¤Ï JPEG ¥é¥¤¥Ö¥é¥ê¤Î¥½¡¼¥¹¥³¡¼¥É¤ÈƱ¤¸¥³¥ó¥Ñ¥¤¥ë
+  ¥ª¥×¥·¥ç¥ó¤Ç¥³¥ó¥Ñ¥¤¥ë¤µ¤ì¤ëɬÍפ¬¤¢¤ê¤Þ¤¹¡£Æäˡ¢¹½Â¤ÂΤβò¼á(¥µ¥¤¥º
+  ¤Ê¤É)¤¬ JPEG ¥é¥¤¥Ö¥é¥êËÜÂΤΤâ¤Î¤È°Û¤Ê¤Ã¤Æ¤·¤Þ¤¦¤È¡¢JPEG ¥é¥¤¥Ö¥é¥ê¤¬
+  ¥¯¥é¥Ã¥·¥å¤·¤Þ¤¹¡£
+
+
+¢£SIMD Ì¿Îá¤Î¼Â¹Ô»þ¤ÎÁªÂò¡¿SIMD Æ°ºî¥â¡¼¥É¾ðÊó
+
+  ¤³¤Î SIMD ³ÈÄ¥ÈÇ JPEG ¥é¥¤¥Ö¥é¥ê¤Ç¤Ï¡¢¥×¥í¥°¥é¥à¤Î¼Â¹Ô»þ¤ËÆÃÄê¤Î SIMD
+  Ì¿Îá¤ò»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë¤·¤¿¤ê¡¢³Æ½èÍýÃʳ¬¤Ç¤É¤Î SIMD Ì¿Îá¤ò»ÈÍѤ·¤ÆÆ°ºî
+  ¤¹¤ë¤Î¤«¤òÇÄ°®¤Ç¤­¤ë»ÅÁȤߤ¬ÍÑ°Õ¤µ¤ì¤Æ¤¤¤Þ¤¹¡£
+
+  jpeg_simd_mask() ¤ò»È¤¦¤È¡¢ÆÃÄê¤Î SIMD Ì¿Îá¤ò¼Â¹Ô»þ¤Ë»ÈÍѤ·¤Ê¤¤¤è¤¦¤Ë
+  ¤Ç¤­¤Þ¤¹¡£
+
+    GLOBAL(unsigned)
+    jpeg_simd_mask (j_common_ptr cinfo, unsigned remove, unsigned add);
+
+  ¤³¤Î´Ø¿ô¤Ï¡¢³Æ SIMD Ì¿Î᥻¥Ã¥È¤ËÂбþ¤¹¤ë¥Þ¥¹¥¯¥Ó¥Ã¥È¤òÀßÄꡦÊѹ¹¤·¤Þ¤¹¡£
+  remove, add ¤½¤·¤ÆÌá¤êÃͤϡ¢³Æ SIMD Ì¿Îá¤ËÂбþ¤¹¤ë¥Ó¥Ã¥ÈÃÍ (JSIMD_MMX,
+  JSIMD_3DNOW, JSIMD_SSE, JSIMD_SSE2) ¤ò OR ±é»»¤ÇÁȤ߹ç¤ï¤»¤¿¤â¤Î¤Ç¤¹¡£
+
+  "¥Þ¥¹¥¯¥Ó¥Ã¥È" ¤Ï¡¢¹½Â¤ÂÎ cinfo ¤ËÊÝ»ý¤µ¤ì¤ëÃͤǡ¢¤½¤Î½é´üÃÍ¤Ï 0 ¤Ç¤¹¡£
+  ¤½¤·¤Æ¤³¤Î´Ø¿ô¤Ï¡¢¤³¤Î¥Þ¥¹¥¯¥Ó¥Ã¥È¤ò°Ê²¼¤Î¤è¤¦¤Ë¹¹¿·(Áàºî)¤·¤Þ¤¹¡£
+
+    (¿·¤·¤¤¥Þ¥¹¥¯¥Ó¥Ã¥È) = ((¸Å¤¤¥Þ¥¹¥¯¥Ó¥Ã¥È) & ~remove) | add;
+
+  ¤½¤·¤Æ¡¢¤³¤Î¥Þ¥¹¥¯¥Ó¥Ã¥È¤¬ 1 ¤Ë¤µ¤ì¤¿ SIMD Ì¿Î᥻¥Ã¥È¤Ï¡¢¤¿¤È¤¨ CPU/OS
+  ¤ÇÂбþ¤·¤Æ¤¤¤Æ¤â»ÈÍѤµ¤ì¤Þ¤»¤ó¡£¤³¤Î´Ø¿ô¤Ï¡¢¤³¤Î´Ø¿ô¤ò¸Æ¤ÖľÁ°¤Þ¤ÇÀßÄê
+  ¤µ¤ì¤Æ¤¤¤¿¥Þ¥¹¥¯¥Ó¥Ã¥È¤òÊÖ¤·¤Þ¤¹¡£¤Ê¤Î¤Ç¡¢remove, add ¶¦¤Ë 0 ¤òÍ¿¤¨¤Æ
+  ´Ø¿ô¤ò¸Æ¤Ù¤Ð¡¢¸½ºßÀßÄꤵ¤ì¤Æ¤¤¤ë¥Þ¥¹¥¯¥Ó¥Ã¥È¤ò¼èÆÀ¤Ç¤­¤Þ¤¹¡£¤³¤Î´Ø¿ô¤Î
+  »ÈÍÑÎã¤Ï¡¢cjpeg.c, djpeg.c, jcomapi.c ¤Ë¤¢¤ê¤Þ¤¹¡£
+
+  ¤³¤Î¥Þ¥¹¥¯¥Ó¥Ã¥È¤ÎÃͤϡ¢¼ÂºÝ¤Ë¤Ï¹½Â¤ÂÎ cinfo ¤Î output_gamma ¤â¤·¤¯¤Ï
+  input_gamma ÊÑ¿ô¤Î²¼°Ì¥Ó¥Ã¥È¤ËÊݸ¤µ¤ì¤Æ¤¤¤Þ¤¹(¾Ü¤·¤¯¤Ï jcomapi.c ¤ò
+  »²¾È)¡£¤³¤ì¤Ï¡¢¹½Â¤ÂÎ cinfo ¤Ë¿·¤¿¤ÊÊÑ¿ô¤òÄɲ䷤Ƥ·¤Þ¤¦¤È¥Ð¥¤¥Ê¥ê¸ß´¹
+  ¤¬Êø¤ì¤Æ¤·¤Þ¤¦¤¿¤á¤Ç¡¢¸½¾õ¤Ç¤Ï̤»ÈÍѤȻפï¤ì¤ë¾åµ­¤ÎÊÑ¿ô¤ò¡Ö´Ö¼Ú¤ê¡×
+  ¤·¤Æ¤¤¤Þ¤¹¡£
+
+  ¤Þ¤¿¡¢°Ê²¼¤Î´Ø¿ô·²¤ò»È¤¦¤È¡¢¥é¥¤¥Ö¥é¥êÆâÉô¤Î³Æ½èÍýÃʳ¬¤Ç¤É¤Î SIMD Ì¿Îá
+  ¤ò»ÈÍѤ·¤ÆÆ°ºî¤¹¤ë¤Î¤«¤òÇÄ°®¤Ç¤­¤Þ¤¹¡£
+
+    jpeg_simd_color_converter();   -> ¿§¶õ´ÖÊÑ´¹(RGB->YCbCr)
+    jpeg_simd_downsampler();       -> ¥À¥¦¥ó¥µ¥ó¥×¥ê¥ó¥°
+    jpeg_simd_forward_dct();       -> DCT½çÊÑ´¹
+    jpeg_simd_color_deconverter(); -> ¿§¶õ´ÖÊÑ´¹(YCbCr->RGB)
+    jpeg_simd_upsampler();         -> ¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°
+    jpeg_simd_inverse_dct();       -> DCTµÕÊÑ´¹
+
+  ÊÖ¤¹Ãͤϡ¢ÉâÆ°¾®¿ôÅÀDCT½çÊÑ´¹/µÕÊÑ´¹ ¤Î¾ì¹ç¤Ï JSIMD_3DNOW ¤« JSIMD_SSE¡¢
+  ¤½¤ì°Ê³°¤Î¾ì¹ç¤Ï JSIMD_MMX ¤« JSIMD_SSE2 ¤òÊÖ¤·¤Þ¤¹¡£¤Þ¤¿¡¢0 ¤¬Ê֤äÆ
+  ¤­¤¿¾ì¹ç¤Ï SIMD Ì¿Î᥻¥Ã¥È¤Ï»È¤ï¤ì¤º¡¢½¾Íè¤Î¥ë¡¼¥Á¥ó¤¬»È¤ï¤ì¤ë¤³¤È¤ò
+  °ÕÌ£¤·¤Þ¤¹¡£
+
+  ¤³¤ì¤é¤Î´Ø¿ô¤Î¾Ü¤·¤¤»È¤¤Êý¤Ë¤Ä¤¤¤Æ¤Ï¡¢cjpeg.c, djpeg.c (»ÈÍÑÎã) ¤ò¤´Í÷
+  ¤¯¤À¤µ¤¤¡£
+
+  ¤Ê¤ª¡¢¤³¤ì¤é¤Î SIMD ¥Þ¥¹¥¯´Ø¿ô¡¿SIMD ¥â¡¼¥É¾ðÊó´Ø¿ô ¤¬É¬Íפʤ¤¾ì¹ç¤Ï¡¢
+  °Ê²¼¤Î¥Þ¥¯¥í¤ò jconfig.h ¤Ê¤É¤Ë´Þ¤á¤ë¤³¤È¤Ç¡¢¶Ï¤«¤Ç¤¹¤¬¥³¡¼¥É¥µ¥¤¥º¤¬
+  ÀáÌó¤Ç¤­¤Þ¤¹¡£
+
+    #define JSIMD_MASKFUNC_NOT_SUPPORTED
+    #define JSIMD_MODEINFO_NOT_SUPPORTED
+
+
+¢£¤½¤Î¤Û¤«¤ÎÀßÄê¹àÌÜ¥Þ¥¯¥í
+
+  ¡û RGB_RED / RGB_GREEN / RGB_BLUE / RGB_PIXELSIZE
+
+    ¤³¤ì¤Ï¡¢jmorecfg.h ¤ÎÃæ¤Ë¤¢¤ëÀßÄê¹àÌÜ¥Þ¥¯¥í¤Ç¡¢¼è¤ê°·¤¦ RGB ·Á¼°²èÁü
+    ¥Ç¡¼¥¿¤Î RGB ¤Îʤӽç¤ä¥Ô¥¯¥»¥ë¥µ¥¤¥º¤òÀßÄꤷ¤Þ¤¹¡£¤³¤Î SIMD ³ÈÄ¥ÈÇ
+    ¤Ç¤âÊѹ¹¤Ç¤­¤ë¤è¤¦¤Ë¤·¤Æ¤¢¤ê¤Þ¤¹¤¬¡¢RGB_PIXELSIZE ¤¬ 3 ¤« 4 ¤Î¾ì¹ç¤Î¤ß¡¢
+    SIMD ÈǤ臨õ´ÖÊÑ´¹¥ë¡¼¥Á¥ó¤¬Í­¸ú¤Ë¤Ê¤ê¤Þ¤¹¡£¤½¤ì°Ê³°¤ÎÃͤˤ·¤¿¾ì¹ç¤Ï¡¢
+    SIMD ÈǤ臨õ´ÖÊÑ´¹¥ë¡¼¥Á¥ó¤Ï¼«Æ°Åª¤Ë̵¸ú²½¤µ¤ì¤Æ¡¢½¾Íè¤Î¿§¶õ´ÖÊÑ´¹
+    ¥ë¡¼¥Á¥ó¤¬»È¤ï¤ì¤Þ¤¹(¤ä¤äÄ㮤ˤʤê¤Þ¤¹)¡£
+
+    ¤³¤ì¤é¤ÎÃͤòÊѹ¹¤¹¤ë¤³¤È¤Ç¡¢½ÐÎϤò 32bit/pixel ·Á¼°¤Ë¤·¤¿¤ê¡¢BMP ·Á¼°¤Ë
+    ¹ç¤ï¤»¤Æ¥Ô¥¯¥»¥ë¤ò BGR ½ç¤Ë¤·¤¿¤ê¤¹¤ë¤³¤È¤¬²Äǽ¤Ë¤Ê¤ê¤Þ¤¹¡£¤Ê¤ª¡¢
+    ¤³¤ì¤é¤ÎÃͤòÌ·½â¤¹¤ëÃÍ¤Ë #define ¤·¤¿¾ì¹ç¤Ï¥³¥ó¥Ñ¥¤¥ë¤Ç¤­¤Ê¤¤¤è¤¦¤Ë¤·¤Æ
+    ¤¢¤ê¤Þ¤¹¤Î¤Ç¡¢Ãí°Õ¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+  ¡û RGBX_FILLER_0XFF
+
+    ¤³¤ì¤â jmorecfg.h ¤ÎÃæ¤Ë¤¢¤ëÀßÄê¹àÌÜ¥Þ¥¯¥í¤Ç¤¹¡£¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï #undef
+    ¤Ë¤Ê¤Ã¤Æ¤¤¤Þ¤¹¡£¾å¤Î RGB_PIXELSIZE ¤ò 4 ¤Ë¤·¤¿¾ì¹ç¡¢£±¤Ä¤Î¥Ô¥¯¥»¥ë
+    ¥Ç¡¼¥¿Ãæ¤Ë(RGB¤Î£³¥Ð¥¤¥È¤Î¾¤Ë);·×¤Ê£±¥Ð¥¤¥È¤¬Â¸ºß¤¹¤ë¤³¤È¤Ë¤Ê¤ê¤Þ¤¹¡£
+    ¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï¡¢¤³¤Î;·×¤Ê£±¥Ð¥¤¥È(filler byte)¤Ë¤Ï 0x00 ¤¬Ëä¤á¤é¤ì¤Æ
+    ½ÐÎϤµ¤ì¤Þ¤¹¤¬¡¢¤³¤Î RGBX_FILLER_0XFF ¤ò #define ¤¹¤ë¤È 0x00 ¤ÎÂå¤ï¤ê
+    ¤Ë 0xFF ¤¬ filler byte ¤ËËä¤á¤é¤ì¤Æ½ÐÎϤµ¤ì¤Þ¤¹¡£
+
+    ½ÐÎϤò 32bit/pixel ·Á¼°¤Ë¤·¤¿¾ì¹ç¤Ç¡¢filler byte ¤ò¥¢¥ë¥Õ¥¡¥Á¥ã¥Í¥ë
+    ¤È¤·¤Æ°·¤¤¤¿¤¤¾ì¹ç¤Ê¤É¤Ï¡¢RGBX_FILLER_0XFF ¤ò #define ¤¹¤ë¤ÈÅԹ礬Îɤ¤
+    ¾ì¹ç¤¬¤¢¤ë¤Ç¤·¤ç¤¦¡£
+
+    ¤Ê¤ª¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠJPEG ¥é¥¤¥Ö¥é¥ê¤Ç¤Ï¡¢¤³¤Î filler byte ¤Ë¤Ï²¿¤â
+    µÍ¤á¤é¤ì¤º¡¢¸µ¤ÎÃͤ¬¤½¤Î¤Þ¤ÞÊÝ»ý¤µ¤ì¤Þ¤¹¡£¤Ç¤¹¤¬¡¢SIMD ÈǤ臨õ´ÖÊÑ´¹
+    ¥ë¡¼¥Á¥ó¤Ç¤Ï¸µ¤ÎÃͤòÊÝ»ý¤¹¤ë¤Ë¤Ï¼ê´Ö¤¬¤«¤«¤ë¤¿¤á¡¢¾ï¤Ë 0x00 ¤« 0xFF
+    ¤ÇËä¤á¤Æ½ÐÎϤ¹¤ë¤è¤¦¤Ë»ÅÍÍÊѹ¹¤ò¹Ô¤Ê¤¤¤Þ¤·¤¿¡£
+
+  ¡û JFDCT_INT_QUANTIZE_WITH_DIVISION
+
+    ¤³¤ì¤Ï¡¢jmorecfg.h ¤ÎÃæ¤Ë¤¢¤ëÀßÄê¹àÌÜ¥Þ¥¯¥í¤Ç¡¢°µ½Ì½èÍý¤Ç¤Î DCT·¸¿ô¤Î
+    Î̻Ҳ½½èÍý¤ÎÊýË¡¤òÊѹ¹¤·¤Þ¤¹¡£¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï #undef ¤Ç¡¢#undef ¤Î¾õÂÖ¤Î
+    Êý¤¬¹â®¤Ê¤Î¤Ç¡¢ÆäËÍýͳ¤Î¤Ê¤¤¸Â¤ê #undef ¤Ç»ÈÍѤ¹¤ë¤³¤È¤ò¤ªÁ¦¤á¤¤¤¿¤·
+    ¤Þ¤¹¡£
+
+    DCT·¸¿ô¤ÎÎ̻Ҳ½½èÍý¤È¤¤¤¦¤Î¤Ï¡¢Ã¼Åª¤Ë¸À¤¨¤Ð²èÁü¥Ç¡¼¥¿¤ËÂФ·¤Æ°ì¤Ä°ì¤Ä
+    ³ä¤ê»»(À°¿ô½ü»»)¤ò¼Â¹Ô¤¹¤ë¤³¤È¤Ç¤¹¡£¤Ç¤¹¤¬¡¢½ü»»¤Ï¸¶ÍýŪ¤Ë¹â®²½¤¬
+    ÉÔ²Äǽ¤Ê¤Î¤Ç¡¢¤³¤Î SIMD ³ÈÄ¥ÈǤǤÏÀ°¿ô½ü»»¤ÎÂå¤ï¤ê¤ËÀ°¿ô¾è»»¤ò»ÈÍѤ·¤Æ
+    Î̻Ҳ½½èÍý¤ò¼Â¹Ô¤·¤Æ¤¤¤Þ¤¹¡£
+
+    ¤³¤ÎÀ°¿ô¾è»»¤òÂåÍѤ¹¤ëÊýË¡¤Ç¤â¡¢¹âÀºÅÙÀ°¿ôDCT/¹â®À°¿ôDCT¤ò»È¤Ã¤¿¾ì¹ç¤Ç¡¢
+    ¤«¤Ä¡¢0¡Á100 ¤Î¤¹¤Ù¤Æ¤Î°µ½Ì¥¯¥ª¥ê¥Æ¥£ÀßÄê¤Ç¥ª¥ê¥¸¥Ê¥ëÈǤÈÁ´¤¯Æ±¤¸·ë²Ì¤ò
+    ½Ð¤¹¤³¤È¤ò³Îǧ¤·¤Æ¤¤¤Þ¤¹¡£¤Ç¤¹¤¬¡¢°µ½Ì²è¼Á¤ò "¥¯¥ª¥ê¥Æ¥£" ¤Î»Øɸ¤ÇÀßÄê
+    ¤»¤º¡¢¥¯¥ª¥ê¥Æ¥£ 0 ¤è¤ê¤âÄã²è¼Á¤ÎÎ̻Ҳ½¥Æ¡¼¥Ö¥ë¤òľÀÜÍ¿¤¨¤Æ°µ½Ì¤·¤¿¾ì¹ç
+    ¤Ê¤É¤Ï¡¢±é»»ÅÓÃæ¤Î¿ôÃÍÈϰϤδط¸¤Ç¡¢¥ª¥ê¥¸¥Ê¥ëÈǤȤϰۤʤë·ë²Ì¤¬½Ð¤ë¤³¤È
+    ¤âÈÝÄê¤Ç¤­¤Þ¤»¤ó¡£¤½¤¦¤¤¤Ã¤¿Æüì¤Ê¶­³¦¾ò·ï²¼¤Ç¤â¥ª¥ê¥¸¥Ê¥ëÈǤȤθߴ¹ÅÙ¤¬
+    ¹â¤¯¤Ê¤é¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¾ì¹ç¤Ê¤É¤Ë¤Ï¡¢¤³¤Î¹àÌܤò #define ¤Ë¤·¤Æ»ÈÍѤ·¤Æ
+    ¤¯¤À¤µ¤¤¡£¼ã´³Â®ÅÙ¤ÏÍî¤Á¤Þ¤¹¤¬¡¢½¾Íè¤É¤ª¤ê¡¢°ì¤Ä°ì¤Ä½ü»»¤ò¹Ô¤Ã¤ÆÎ̻Ҳ½
+    ½èÍý¤ò¹Ô¤¤¤Þ¤¹¡£
+
+    ¤â¤Ã¤È¤â¡¢¥¯¥ª¥ê¥Æ¥£ 0 ¤è¤ê¤âÄã²è¼Á¤ÎÀßÄê¤Ç°µ½Ì¤·¤Æ¤â¡¢¤Û¤È¤ó¤É¼ÂÍѤË
+    ¤Ê¤ê¤Þ¤»¤ó¤Î¤Ç¡¢¤³¤ÎÀ°¿ô¾è»»¤òÂåÍѤ¹¤ëÊýË¡¤Ç¤â¡¢ÌäÂê¤Ë¤Ê¤ë¤³¤È¤Ï¤Ê¤¤¤È
+    »×¤¤¤Þ¤¹¡£
+
+  ¡û UPSAMPLE_H1V2_SUPPORTED
+
+    ¤³¤ì¤Ï¡¢jmorecfg.h ¤ÎÃæ¤Ë¤¢¤ë¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠIJG JPEG library ¤Ë¤Ï
+    ¸ºß¤·¤Ê¤¤ÀßÄê¹àÌÜ¥Þ¥¯¥í¤Ç¤¹¡£¤³¤ì¤Ï¡¢Y:1x2 Cb:1x1 Cr:1x1 (4:2:2) ¤Î
+    ¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°Èæ¤ò»ý¤Ä JPEG ¥Õ¥¡¥¤¥ë¤ò¡¢¥ª¥ê¥¸¥Ê¥ëÈǤΠIJG JPEG
+    library ¤è¤ê¤â¹â®¡¿¹â²è¼Á¤ËŸ³«¤Ç¤­¤ë¤è¤¦¤Ë¤¹¤ë¤â¤Î¤Ç¤¹¡£
+
+    ¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°Èæ Y:1x2 Cb:1x1 Cr:1x1 (4:2:2) ¤ò»ý¤Ä JPEG ¥Õ¥¡¥¤¥ë¤Ï¡¢
+    ¥ª¥ê¥¸¥Ê¥ë¤Î IJG JPEG Library ¤Ç¤âŸ³«¤Ç¤­¤Þ¤¹¤¬¡¢¤³¤Î¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°
+    Èæ¤ËÂбþ¤¹¤ë¥¢¥Ã¥×¥µ¥ó¥×¥ê¥ó¥°¡¦¥ë¡¼¥Á¥ó¤¬´Êñ¤Ê¤â¤Î¤·¤«ÍÑ°Õ¤µ¤ì¤Æ
+    ¤¤¤Ê¤¤¤¿¤á¡¢Å¸³«Â®ÅÙ¤âÃÙ¤¯¡¢¤Þ¤¿¡¢¿§¤Î¶­Ìܤ¬¤Ï¤Ã¤­¤ê¤·¤Æ¤¤¤ë£Ã£Ç²èÁü
+    ¤Ê¤É¤Î¾ì¹ç¤Ï¥¸¥ã¥®¡¼¤¬ÌÜΩ¤Ã¤Æ¤·¤Þ¤¦¤³¤È¤¬¤¢¤ê¤Þ¤¹¡£¤³¤Î¹àÌܤòÍ­¸ú¤Ë
+    ¤¹¤ë¤³¤È¤Ç¡¢¤³¤Î¤è¤¦¤Ê¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°Èæ Y:1x2 Cb:1x1 Cr:1x1 ¤ò»ý¤Ä
+    JPEG ¥Õ¥¡¥¤¥ë¤ò¹â®¤Ë¡¢¤Þ¤¿¡¢¥¸¥ã¥®¡¼¤¬ÌÜΩ¤¿¤Ê¤¤¤è¤¦¤Ë¹â²è¼Á¤ËŸ³«
+    ¤Ç¤­¤ë¤è¤¦¤Ë¤·¤Þ¤¹¡£
+
+    ¤³¤Î¡¢¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°Èæ Y:1x2 Cb:1x1 Cr:1x1 ¤Î JPEG ¥Õ¥¡¥¤¥ë¤Ï¡¢¤¢¤Þ¤ê
+    °ìÈÌŪ¤Ê¤â¤Î¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¤¬¡¢¥Ç¥£¥¸¥¿¥ë¥«¥á¥é¤Ê¤É¤¬½ÐÎϤ¹¤ë¤³¤È¤Î¿¤¤¡¢
+    ¥µ¥Ö¥µ¥ó¥×¥ê¥ó¥°Èæ Y:2x1 Cb:1x1 Cr:1x1 (4:2:2) ¤Î JPEG ¥Õ¥¡¥¤¥ë¤ËÂФ·¤Æ
+    ¡ÖJPEG ¥í¥¹¥ì¥¹²óž¡×½èÍý¤ò¹Ô¤Ê¤¦¤È¡¢¤³¤Î Y:1x2 Cb:1x1 Cr:1x1 ¤Î JPEG
+    ¥Õ¥¡¥¤¥ë¤Ë¤Ê¤ê¤Þ¤¹¡£¥Ç¥£¥¸¥¿¥ë¥«¥á¥é¤Ç¡¢¥«¥á¥é¤ò½Ä¤Ë¤·¤Æ¡Ê½Ä°ÌÃ֤ǡ˻£±Æ
+    ¤·¤¿²èÁü¤ò¡ÖJPEG ¥í¥¹¥ì¥¹²óž¡×¤·¤ÆÀµ¾ï¤Ê¸þ¤­¤Ëľ¤¹¡¢¤Ê¤É¤È¤¤¤¦¤³¤È¤Ï¡¢
+    ¤è¤¯¤ä¤ë¤³¤È¤À¤È»×¤¤¤Þ¤¹¡£¤Ç¤¹¤¬¡¢¤³¤Î¤è¤¦¤Ê¡ÖJPEG ¥í¥¹¥ì¥¹²óž¡×¤µ¤ì¤¿
+    JPEG ¥Õ¥¡¥¤¥ë¤ò¥ª¥ê¥¸¥Ê¥ë¤Î IJG JPEG Library ¤ÇŸ³«¤¹¤ë¤È¡¢¾åµ­¤ÎÍýͳ¤«¤é¡¢
+    JPEG ¥Õ¥¡¥¤¥ë¤òŸ³«¤·¤Æ¤«¤é²èÁü½èÍý¥½¥Õ¥È¤Ç²óž¤µ¤»¤¿²èÁü¤ËÈæ¤Ù¤Æ²è¼Á¤¬
+    Îô¤Ã¤Æ¤·¤Þ¤¤¤Þ¤¹¡£¤³¤Î¹àÌܤòÍ­¸ú¤Ë¤¹¤ë¤³¤È¤Ç¡¢²èÁü½èÍý¥½¥Õ¥È¤Ç²óž¤µ¤»¤¿
+    ²èÁü¤È¤Û¤ÜƱ¤¸¥¯¥ª¥ê¥Æ¥£¤Ç²èÁü¤òŸ³«¤¹¤ë¤³¤È¤¬²Äǽ¤Ë¤Ê¤ê¤Þ¤¹¡£
+
+    ¤³¤Î¹àÌܤϡ¢ÆäËÍýͳ¤Î¤Ê¤¤¸Â¤ê¡¢#define ¤Î¾õÂ֤ˤ·¤Æ¤ª¤¯¤³¤È¤ò¤ªÁ¦¤á
+    ¤¤¤¿¤·¤Þ¤¹¡£¥ª¥ê¥¸¥Ê¥ë¤Î IJG JPEG Library ¤È´°Á´¤ËƱ°ì¤Î·ë²Ì¤¬É¬ÍפÊ
+    ¾ì¹ç¤Î¤ß #undef ¤Ë¤·¤Æ¥³¥ó¥Ñ¥¤¥ë¤·¤Æ¤¯¤À¤µ¤¤¡£
+
+
+
+[EOF]
diff --git a/jconfig.bcc b/unused/jconfig.bcc
similarity index 100%
rename from jconfig.bcc
rename to unused/jconfig.bcc
diff --git a/jconfig.mac b/unused/jconfig.mac
similarity index 100%
rename from jconfig.mac
rename to unused/jconfig.mac
diff --git a/jconfig.manx b/unused/jconfig.manx
similarity index 100%
rename from jconfig.manx
rename to unused/jconfig.manx
diff --git a/jconfig.mc6 b/unused/jconfig.mc6
similarity index 100%
rename from jconfig.mc6
rename to unused/jconfig.mc6
diff --git a/jconfig.sas b/unused/jconfig.sas
similarity index 100%
rename from jconfig.sas
rename to unused/jconfig.sas
diff --git a/jconfig.st b/unused/jconfig.st
similarity index 100%
rename from jconfig.st
rename to unused/jconfig.st
diff --git a/jconfig.vms b/unused/jconfig.vms
similarity index 100%
rename from jconfig.vms
rename to unused/jconfig.vms
diff --git a/jconfig.wat b/unused/jconfig.wat
similarity index 100%
rename from jconfig.wat
rename to unused/jconfig.wat
diff --git a/jfdctflt.c b/unused/jfdctflt.c
similarity index 100%
rename from jfdctflt.c
rename to unused/jfdctflt.c
diff --git a/jfdctfst.c b/unused/jfdctfst.c
similarity index 100%
rename from jfdctfst.c
rename to unused/jfdctfst.c
diff --git a/jfdctint.c b/unused/jfdctint.c
similarity index 100%
rename from jfdctint.c
rename to unused/jfdctint.c
diff --git a/jidctflt.c b/unused/jidctflt.c
similarity index 100%
rename from jidctflt.c
rename to unused/jidctflt.c
diff --git a/jidctfst.c b/unused/jidctfst.c
similarity index 100%
rename from jidctfst.c
rename to unused/jidctfst.c
diff --git a/jidctint.c b/unused/jidctint.c
similarity index 100%
rename from jidctint.c
rename to unused/jidctint.c
diff --git a/jidctred.c b/unused/jidctred.c
similarity index 100%
rename from jidctred.c
rename to unused/jidctred.c
diff --git a/jmemdos.c b/unused/jmemdos.c
similarity index 100%
rename from jmemdos.c
rename to unused/jmemdos.c
diff --git a/jmemdosa.asm b/unused/jmemdosa.asm
similarity index 100%
rename from jmemdosa.asm
rename to unused/jmemdosa.asm
diff --git a/jmemmac.c b/unused/jmemmac.c
similarity index 100%
rename from jmemmac.c
rename to unused/jmemmac.c
diff --git a/makcjpeg.st b/unused/makcjpeg.st
similarity index 100%
rename from makcjpeg.st
rename to unused/makcjpeg.st
diff --git a/makdjpeg.st b/unused/makdjpeg.st
similarity index 100%
rename from makdjpeg.st
rename to unused/makdjpeg.st
diff --git a/makeapps.ds b/unused/makeapps.ds
similarity index 100%
rename from makeapps.ds
rename to unused/makeapps.ds
diff --git a/makefile.bcc b/unused/makefile.bcc
similarity index 100%
rename from makefile.bcc
rename to unused/makefile.bcc
diff --git a/makefile.manx b/unused/makefile.manx
similarity index 100%
rename from makefile.manx
rename to unused/makefile.manx
diff --git a/makefile.mc6 b/unused/makefile.mc6
similarity index 100%
rename from makefile.mc6
rename to unused/makefile.mc6
diff --git a/makefile.mms b/unused/makefile.mms
similarity index 100%
rename from makefile.mms
rename to unused/makefile.mms
diff --git a/makefile.sas b/unused/makefile.sas
similarity index 100%
rename from makefile.sas
rename to unused/makefile.sas
diff --git a/makefile.vms b/unused/makefile.vms
similarity index 100%
rename from makefile.vms
rename to unused/makefile.vms
diff --git a/makefile.wat b/unused/makefile.wat
similarity index 100%
rename from makefile.wat
rename to unused/makefile.wat
diff --git a/makelib.ds b/unused/makelib.ds
similarity index 100%
rename from makelib.ds
rename to unused/makelib.ds
diff --git a/makeproj.mac b/unused/makeproj.mac
similarity index 100%
rename from makeproj.mac
rename to unused/makeproj.mac
diff --git a/makljpeg.st b/unused/makljpeg.st
similarity index 100%
rename from makljpeg.st
rename to unused/makljpeg.st
diff --git a/maktjpeg.st b/unused/maktjpeg.st
similarity index 100%
rename from maktjpeg.st
rename to unused/maktjpeg.st
diff --git a/makvms.opt b/unused/makvms.opt
similarity index 100%
rename from makvms.opt
rename to unused/makvms.opt
diff --git a/unused/rdgif.c b/unused/rdgif.c
new file mode 100644
index 0000000..b27c167
--- /dev/null
+++ b/unused/rdgif.c
@@ -0,0 +1,38 @@
+/*
+ * rdgif.c
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains routines to read input images in GIF format.
+ *
+ *****************************************************************************
+ * NOTE: to avoid entanglements with Unisys' patent on LZW compression,      *
+ * the ability to read GIF files has been removed from the IJG distribution. *
+ * Sorry about that.                                                         *
+ *****************************************************************************
+ *
+ * We are required to state that
+ *    "The Graphics Interchange Format(c) is the Copyright property of
+ *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
+ *    CompuServe Incorporated."
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+
+#ifdef GIF_SUPPORTED
+
+/*
+ * Module selection routine for GIF format input.  This is only a stub: it
+ * reports on stderr that GIF input is unavailable and exits with failure.
+ */
+GLOBAL(cjpeg_source_ptr)
+jinit_read_gif (j_compress_ptr cinfo)
+{
+  fputs("GIF input is unsupported for legal reasons.  Sorry.\n", stderr);
+  exit(EXIT_FAILURE);
+  return NULL;			/* not reached; satisfies the return type */
+}
+
+#endif /* GIF_SUPPORTED */
diff --git a/unused/wrgif.c b/unused/wrgif.c
new file mode 100644
index 0000000..5fe8328
--- /dev/null
+++ b/unused/wrgif.c
@@ -0,0 +1,399 @@
+/*
+ * wrgif.c
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains routines to write output images in GIF format.
+ *
+ **************************************************************************
+ * NOTE: to avoid entanglements with Unisys' patent on LZW compression,   *
+ * this code has been modified to output "uncompressed GIF" files.        *
+ * There is no trace of the LZW algorithm in this file.                   *
+ **************************************************************************
+ *
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume output to
+ * an ordinary stdio stream.
+ */
+
+/*
+ * This code is loosely based on ppmtogif from the PBMPLUS distribution
+ * of Feb. 1991.  That file contains the following copyright notice:
+ *    Based on GIFENCODE by David Rowley <mgardi@watdscu.waterloo.edu>.
+ *    Lempel-Ziv compression based on "compress" by Spencer W. Thomas et al.
+ *    Copyright (C) 1989 by Jef Poskanzer.
+ *    Permission to use, copy, modify, and distribute this software and its
+ *    documentation for any purpose and without fee is hereby granted, provided
+ *    that the above copyright notice appear in all copies and that both that
+ *    copyright notice and this permission notice appear in supporting
+ *    documentation.  This software is provided "as is" without express or
+ *    implied warranty.
+ *
+ * We are also required to state that
+ *    "The Graphics Interchange Format(c) is the Copyright property of
+ *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
+ *    CompuServe Incorporated."
+ */
+
+#include "cdjpeg.h"		/* Common decls for cjpeg/djpeg applications */
+
+#ifdef GIF_SUPPORTED
+
+
+/* Private GIF destination object: public part plus pseudo-compressor state */
+
+typedef struct {
+  struct djpeg_dest_struct pub;	/* public fields; first, so casts work */
+
+  j_decompress_ptr cinfo;	/* back link saves passing separate parm */
+
+  /* State for packing variable-width codes into a bitstream */
+  int n_bits;			/* bits per emitted code (see compress_init) */
+  int maxcode;			/* MAXCODE(n_bits): largest code that wide */
+  INT32 cur_accum;		/* bits not yet output, oldest in low end */
+  int cur_bits;			/* # of pending bits in cur_accum */
+
+  /* State for GIF code assignment */
+  int ClearCode;		/* clear code, 1 << (n_bits-1); never changes */
+  int EOFCode;			/* EOF code, ClearCode + 1 */
+  int code_counter;		/* counts symbols up from ClearCode + 2 */
+
+  /* GIF data packet construction buffer */
+  int bytesinpkt;		/* # of data bytes in current packet */
+  char packetbuf[256];		/* [0] = count byte, [1..255] = data bytes */
+
+} gif_dest_struct;
+
+typedef gif_dest_struct * gif_dest_ptr;
+
+/* Largest value that will fit in n_bits bits, i.e. 2^n_bits - 1 */
+#define MAXCODE(n_bits)	((1 << (n_bits)) - 1)
+
+
+/*
+ * Routines to package finished data bytes into GIF data blocks.
+ * A data block is one count byte (1..255) followed by that many data bytes.
+ */
+LOCAL(void)
+flush_packet (gif_dest_ptr dinfo)
+/* Write out the packet accumulated so far, if any, and start a new one */
+{
+  if (dinfo->bytesinpkt <= 0)	/* never write zero-length packet */
+    return;
+  dinfo->packetbuf[0] = (char) dinfo->bytesinpkt;	/* leading count byte */
+  dinfo->bytesinpkt++;		/* the write length includes the count byte */
+  if (JFWRITE(dinfo->pub.output_file, dinfo->packetbuf, dinfo->bytesinpkt)
+      != (size_t) dinfo->bytesinpkt)
+    ERREXIT(dinfo->cinfo, JERR_FILE_WRITE);
+  dinfo->bytesinpkt = 0;
+}
+
+
+/* Add a character to current packet; flush to disk if necessary */
+#define CHAR_OUT(dinfo,c)  \
+	{ (dinfo)->packetbuf[++(dinfo)->bytesinpkt] = (char) (c);  \
+	    if ((dinfo)->bytesinpkt >= 255)  \
+	      flush_packet(dinfo);  \
+	}
+
+
+/* Routine to convert variable-width codes into a byte stream */
+
+LOCAL(void)
+output (gif_dest_ptr dinfo, int code)
+/* Append one n_bits-wide code to the bit accumulator, above the bits that */
+/* are still pending, and pass every completed 8-bit byte to CHAR_OUT */
+{
+  INT32 shifted = ((INT32) code) << dinfo->cur_bits;
+
+  dinfo->cur_accum |= shifted;
+  for (dinfo->cur_bits += dinfo->n_bits; dinfo->cur_bits >= 8;
+       dinfo->cur_bits -= 8) {
+    CHAR_OUT(dinfo, dinfo->cur_accum & 0xFF);
+    dinfo->cur_accum >>= 8;
+  }
+}
+
+
+/* The pseudo-compression algorithm.
+ *
+ * In this module we simply output each pixel value as a separate symbol;
+ * thus, no compression occurs.  In fact, there is expansion of one bit per
+ * pixel, because we use a symbol width one bit wider than the pixel width.
+ *
+ * GIF ordinarily uses variable-width symbols, and the decoder will expect
+ * to ratchet up the symbol width after a fixed number of symbols.
+ * To simplify the logic and keep the expansion penalty down, we emit a
+ * GIF Clear code to reset the decoder just before the width would ratchet up.
+ * Thus, all the symbols in the output file will have the same bit width.
+ * Note that emitting the Clear codes at the right times is a mere matter of
+ * counting output symbols and is in no way dependent on the LZW patent.
+ *
+ * With a small basic pixel width (low color count), Clear codes will be
+ * needed very frequently, causing the file to expand even more.  So this
+ * simplistic approach wouldn't work too well on bilevel images, for example.
+ * But for output of JPEG conversions the pixel width will usually be 8 bits
+ * (129 to 256 colors), so the overhead added by Clear symbols is only about
+ * one symbol in every 256.
+ */
+
+LOCAL(void)
+compress_init (gif_dest_ptr dinfo, int i_bits)
+/* Reset the pseudo-compressor for codes that are i_bits wide */
+{
+  int clear = 1 << (i_bits - 1);	/* value of the Clear code */
+  /* start with an empty packet and an empty bit accumulator */
+  dinfo->cur_bits = 0;
+  dinfo->cur_accum = 0;
+  dinfo->bytesinpkt = 0;
+  /* code width and the special codes derived from it */
+  dinfo->n_bits = i_bits;
+  dinfo->maxcode = MAXCODE(i_bits);
+  dinfo->ClearCode = clear;
+  dinfo->EOFCode = clear + 1;
+  dinfo->code_counter = clear + 2;
+  output(dinfo, clear);		/* GIF specifies an initial Clear code */
+}
+
+
+LOCAL(void)
+compress_pixel (gif_dest_ptr dinfo, int c)
+/* Emit one pixel value as a literal symbol ("compression" in name only).
+ * The given value must be less than n_bits wide.
+ */
+{
+  output(dinfo, c);		/* the pixel itself, as one symbol */
+  /* Below the limit for this code width: just count the symbol. */
+  if (dinfo->code_counter < dinfo->maxcode) {
+    dinfo->code_counter++;
+    return;
+  }
+  /* Limit reached: send a Clear code so that the reader never ratchets
+   * up its symbol size, then restart the count.
+   */
+  output(dinfo, dinfo->ClearCode);
+  dinfo->code_counter = dinfo->ClearCode + 2;
+}
+
+
+LOCAL(void)
+compress_term (gif_dest_ptr dinfo)
+/* Finish the code stream: EOF code, leftover bits, then the last packet */
+{
+  int leftover;			/* bits still pending after the EOF code */
+
+  output(dinfo, dinfo->EOFCode);
+  leftover = dinfo->cur_bits;
+  if (leftover > 0) {		/* emit the final, partially filled byte */
+    CHAR_OUT(dinfo, dinfo->cur_accum & 0xFF);
+  }
+  flush_packet(dinfo);		/* write out whatever the packet still holds */
+}
+
+
+/* GIF header construction */
+
+LOCAL(void)
+put_word (gif_dest_ptr dinfo, unsigned int w)
+/* Write the low 16 bits of w as two bytes, least significant byte first */
+{
+  FILE * out = dinfo->pub.output_file;
+  putc(w & 0xFF, out);		/* low byte */
+  putc((w >> 8) & 0xFF, out);	/* high byte */
+}
+
+
+LOCAL(void)
+put_3bytes (gif_dest_ptr dinfo, int val)
+/* Write val three times in a row --- used for gray/filler colormap entries */
+{
+  int n;
+  for (n = 0; n < 3; n++)
+    putc(val, dinfo->pub.output_file);
+}
+
+
+LOCAL(void)
+emit_header (gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
+/* Output the GIF file header, including color map (at most 256 entries) */
+/* If colormap==NULL, synthesize a gray-scale colormap of num_colors levels */
+{
+  int BitsPerPixel, ColorMapSize, InitCodeSize, FlagByte, i;
+  int cshift = dinfo->cinfo->data_precision - 8;
+  int graydiv = (num_colors > 1) ? num_colors - 1 : 1; /* gray ramp divisor */
+
+  if (num_colors > 256)
+    ERREXIT1(dinfo->cinfo, JERR_TOO_MANY_COLORS, num_colors);
+  /* Compute bits/pixel and related values */
+  BitsPerPixel = 1;
+  while (num_colors > (1 << BitsPerPixel))
+    BitsPerPixel++;
+  ColorMapSize = 1 << BitsPerPixel;
+  if (BitsPerPixel <= 1)
+    InitCodeSize = 2;
+  else
+    InitCodeSize = BitsPerPixel;
+  /*
+   * Write the GIF header.
+   * Note that we generate a plain GIF87 header for maximum compatibility.
+   */
+  putc('G', dinfo->pub.output_file);
+  putc('I', dinfo->pub.output_file);
+  putc('F', dinfo->pub.output_file);
+  putc('8', dinfo->pub.output_file);
+  putc('7', dinfo->pub.output_file);
+  putc('a', dinfo->pub.output_file);
+  /* Write the Logical Screen Descriptor */
+  put_word(dinfo, (unsigned int) dinfo->cinfo->output_width);
+  put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
+  FlagByte = 0x80;		/* Yes, there is a global color table */
+  FlagByte |= (BitsPerPixel-1) << 4; /* color resolution */
+  FlagByte |= (BitsPerPixel-1);	/* size of global color table */
+  putc(FlagByte, dinfo->pub.output_file);
+  putc(0, dinfo->pub.output_file); /* Background color index */
+  putc(0, dinfo->pub.output_file); /* Reserved (aspect ratio in GIF89) */
+  /* Write the Global Color Map */
+  /* If the color map is more than 8 bits precision, */
+  /* we reduce it to 8 bits by shifting */
+  for (i=0; i < ColorMapSize; i++) {
+    if (i < num_colors) {
+      if (colormap != NULL) {
+	if (dinfo->cinfo->out_color_space == JCS_RGB) {
+	  /* Normal case: RGB color map */
+	  putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file);
+	  putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file);
+	  putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file);
+	} else {
+	  /* Grayscale "color map": possible if quantizing grayscale image */
+	  put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift);
+	}
+      } else {
+	/* Gray-scale map of num_colors values, 0..255; graydiv is never 0 */
+	put_3bytes(dinfo, (i * 255 + graydiv/2) / graydiv);
+      }
+    } else {
+      /* fill out the map to a power of 2 */
+      put_3bytes(dinfo, 0);
+    }
+  }
+  /* Write image separator and Image Descriptor */
+  putc(',', dinfo->pub.output_file); /* separator */
+  put_word(dinfo, 0);		/* left/top offset */
+  put_word(dinfo, 0);
+  put_word(dinfo, (unsigned int) dinfo->cinfo->output_width); /* image size */
+  put_word(dinfo, (unsigned int) dinfo->cinfo->output_height);
+  /* flag byte: not interlaced, no local color map */
+  putc(0x00, dinfo->pub.output_file);
+  /* Write Initial Code Size byte */
+  putc(InitCodeSize, dinfo->pub.output_file);
+
+  /* Initialize for "compression" of image data */
+  compress_init(dinfo, InitCodeSize+1);
+}
+
+
+/*
+ * Startup: write the file header, using the quantizer's colormap when
+ * color quantization is on and a 256-entry gray-scale map otherwise.
+ */
+
+METHODDEF(void)
+start_output_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+{
+  boolean quant = cinfo->quantize_colors;
+
+  emit_header((gif_dest_ptr) dinfo,
+	      quant ? cinfo->actual_number_of_colors : 256,
+	      quant ? cinfo->colormap : (JSAMPARRAY) NULL);
+}
+
+
+/*
+ * Write some pixel data.
+ * In this module rows_supplied will always be 1.
+ */
+
+METHODDEF(void)
+put_pixel_rows (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+		JDIMENSION rows_supplied)
+{
+  gif_dest_ptr dest = (gif_dest_ptr) dinfo;
+  JSAMPROW row = dest->pub.buffer[0];	/* only row 0 is ever used here */
+  JDIMENSION width = cinfo->output_width;
+  JDIMENSION x;
+
+  /* Feed the row to the pseudo-compressor, one sample per GIF symbol */
+  for (x = 0; x < width; x++)
+    compress_pixel(dest, GETJSAMPLE(row[x]));
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_output_gif (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
+{
+  gif_dest_ptr dest = (gif_dest_ptr) dinfo;
+  FILE * out = dest->pub.output_file;
+
+  compress_term(dest);		/* EOF code plus any data still buffered */
+
+  putc(0, out);			/* zero-length block ends the data series */
+  putc(';', out);		/* GIF terminator mark */
+
+  /* Push everything out to the file, then report any write error */
+  fflush(out);
+  if (ferror(out))
+    ERREXIT(cinfo, JERR_FILE_WRITE);
+}
+
+
+/*
+ * The module selection routine for GIF format output.
+ */
+
+GLOBAL(djpeg_dest_ptr)
+jinit_write_gif (j_decompress_ptr cinfo)
+{
+  j_common_ptr common = (j_common_ptr) cinfo;
+  gif_dest_ptr dest;
+
+  /* Allocate the destination object in the image pool and hook up methods */
+  dest = (gif_dest_ptr)
+    (*cinfo->mem->alloc_small) (common, JPOOL_IMAGE, SIZEOF(gif_dest_struct));
+  dest->pub.start_output = start_output_gif;
+  dest->pub.put_pixel_rows = put_pixel_rows;
+  dest->pub.finish_output = finish_output_gif;
+  dest->cinfo = cinfo;		/* back link used by the helper routines */
+
+  /* Only grayscale and RGB output color spaces are accepted */
+  if (!(cinfo->out_color_space == JCS_GRAYSCALE ||
+	cinfo->out_color_space == JCS_RGB))
+    ERREXIT(cinfo, JERR_GIF_COLORSPACE);
+
+  /* Color output, or samples wider than 8 bits, must go through color
+   * quantization, limited to at most 256 colors.
+   */
+  if (cinfo->data_precision > 8 || cinfo->out_color_space != JCS_GRAYSCALE) {
+    cinfo->quantize_colors = TRUE;
+    if (cinfo->desired_number_of_colors > 256)
+      cinfo->desired_number_of_colors = 256;
+  }
+
+  /* The row buffer is sized from the output dimensions, so compute them */
+  jpeg_calc_output_dimensions(cinfo);
+  if (cinfo->output_components != 1) /* safety check: just one component? */
+    ERREXIT(cinfo, JERR_GIF_BUG);
+
+  /* One-row sample array used as the decompressor output buffer */
+  dest->pub.buffer = (*cinfo->mem->alloc_sarray)
+    (common, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION) 1);
+  dest->pub.buffer_height = 1;
+  return (djpeg_dest_ptr) dest;
+}
+
+#endif /* GIF_SUPPORTED */
diff --git a/vc6proj/apptest.dsp b/vc6proj/apptest.dsp
new file mode 100644
index 0000000..0f5c35b
--- /dev/null
+++ b/vc6proj/apptest.dsp
@@ -0,0 +1,242 @@
+# Microsoft Developer Studio Project File - Name="apptest" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** ÊÔ½¸¤·¤Ê¤¤¤Ç¤¯¤À¤µ¤¤ **
+
+# TARGTYPE "Win32 (x86) Generic Project" 0x010a
+
+CFG=apptest - Win32 Debug
+!MESSAGE ¤³¤ì¤ÏÍ­¸ú¤ÊŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£ ¤³¤ÎŽÌŽßŽÛŽ¼ŽÞŽªŽ¸ŽÄ¤òŽËŽÞŽÙŽÄŽÞ¤¹¤ë¤¿¤á¤Ë¤Ï NMAKE ¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£
+!MESSAGE [ŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤ÎŽ´Ž¸Ž½ŽÎŽßŽ°ŽÄ] ŽºŽÏŽÝŽÄŽÞ¤ò»ÈÍѤ·¤Æ¼Â¹Ô¤·¤Æ¤¯¤À¤µ¤¤
+!MESSAGE 
+!MESSAGE NMAKE /f "apptest.mak".
+!MESSAGE 
+!MESSAGE NMAKE ¤Î¼Â¹Ô»þ¤Ë¹½À®¤ò»ØÄê¤Ç¤­¤Þ¤¹
+!MESSAGE ŽºŽÏŽÝŽÄŽÞ Ž×Ž²ŽÝ¾å¤ÇŽÏŽ¸ŽÛ¤ÎÀßÄê¤òÄêµÁ¤·¤Þ¤¹¡£Îã:
+!MESSAGE 
+!MESSAGE NMAKE /f "apptest.mak" CFG="apptest - Win32 Debug"
+!MESSAGE 
+!MESSAGE ÁªÂò²Äǽ¤ÊŽËŽÞŽÙŽÄŽÞ ŽÓŽ°ŽÄŽÞ:
+!MESSAGE 
+!MESSAGE "apptest - Win32 Release" ("Win32 (x86) Generic Project" ÍÑ)
+!MESSAGE "apptest - Win32 Debug" ("Win32 (x86) Generic Project" ÍÑ)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+MTL=midl.exe
+
+!IF  "$(CFG)" == "apptest - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# Begin Special Build Tool
+OutDir=.\Release
+SOURCE="$(InputPath)"
+PostBuild_Cmds=fc /b .\testimg.ppm $(OutDir)\testout.ppm	fc /b .\testimg.bmp $(OutDir)\testout.bmp	fc /b .\testimg.jpg $(OutDir)\testout.jpg	fc /b .\testimg.ppm $(OutDir)\testoutp.ppm	fc /b .\testimgp.jpg $(OutDir)\testoutp.jpg	fc /b .\testorig.jpg $(OutDir)\testoutt.jpg
+# End Special Build Tool
+
+!ELSEIF  "$(CFG)" == "apptest - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# Begin Special Build Tool
+OutDir=.\Debug
+SOURCE="$(InputPath)"
+PostBuild_Cmds=fc /b .\testimg.ppm $(OutDir)\testout.ppm	fc /b .\testimg.bmp $(OutDir)\testout.bmp	fc /b .\testimg.jpg $(OutDir)\testout.jpg	fc /b .\testimg.ppm $(OutDir)\testoutp.ppm	fc /b .\testimgp.jpg $(OutDir)\testoutp.jpg	fc /b .\testorig.jpg $(OutDir)\testoutt.jpg
+# End Special Build Tool
+
+!ENDIF 
+
+# Begin Target
+
+# Name "apptest - Win32 Release"
+# Name "apptest - Win32 Debug"
+# Begin Group "Test Image Files"
+
+# PROP Default_Filter "*.jpg;*.bmp;*.ppm"
+# Begin Source File
+
+SOURCE=.\testimg.bmp
+# End Source File
+# Begin Source File
+
+SOURCE=.\testimg.jpg
+# End Source File
+# Begin Source File
+
+SOURCE=.\testimg.ppm
+
+!IF  "$(CFG)" == "apptest - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build
+InputDir=.
+OutDir=.\Release
+InputPath=.\testimg.ppm
+
+BuildCmds= \
+	echo $(OutDir)\cjpeg -dct int -outfile $(OutDir)\testout.jpg .\testimg.ppm \
+	$(OutDir)\cjpeg -dct int -outfile $(OutDir)\testout.jpg .\testimg.ppm \
+	echo $(OutDir)\cjpeg -dct int -progressive -opt -outfile $(OutDir)\testoutp.jpg .\testimg.ppm \
+	$(OutDir)\cjpeg -dct int -progressive -opt -outfile $(OutDir)\testoutp.jpg .\testimg.ppm \
+	
+
+"$(OutDir)\testout.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testoutp.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "apptest - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build - Testing - $(InputPath)
+InputDir=.
+OutDir=.\Debug
+InputPath=.\testimg.ppm
+
+BuildCmds= \
+	echo $(OutDir)\cjpeg -dct int -outfile $(OutDir)\testout.jpg .\testimg.ppm \
+	$(OutDir)\cjpeg -dct int -outfile $(OutDir)\testout.jpg .\testimg.ppm \
+	echo $(OutDir)\cjpeg -dct int -progressive -opt -outfile $(OutDir)\testoutp.jpg .\testimg.ppm \
+	$(OutDir)\cjpeg -dct int -progressive -opt -outfile $(OutDir)\testoutp.jpg .\testimg.ppm \
+	
+
+"$(OutDir)\testout.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testoutp.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\testimgp.jpg
+# End Source File
+# Begin Source File
+
+SOURCE=.\testorig.jpg
+
+!IF  "$(CFG)" == "apptest - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build
+InputDir=.
+OutDir=.\Release
+InputPath=.\testorig.jpg
+
+BuildCmds= \
+	echo $(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testout.ppm .\testorig.jpg \
+	$(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testout.ppm .\testorig.jpg \
+	echo $(OutDir)\djpeg -dct int -bmp -colors 256 -outfile $(OutDir)\testout.bmp .\testorig.jpg \
+	$(OutDir)\djpeg -dct int -bmp -colors 256 -outfile $(OutDir)\testout.bmp .\testorig.jpg \
+	
+
+"$(OutDir)\testout.ppm" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testout.bmp" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "apptest - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build - Testing - $(InputPath)
+InputDir=.
+OutDir=.\Debug
+InputPath=.\testorig.jpg
+
+BuildCmds= \
+	echo $(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testout.ppm .\testorig.jpg \
+	$(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testout.ppm .\testorig.jpg \
+	echo $(OutDir)\djpeg -dct int -bmp -colors 256 -outfile $(OutDir)\testout.bmp .\testorig.jpg \
+	$(OutDir)\djpeg -dct int -bmp -colors 256 -outfile $(OutDir)\testout.bmp .\testorig.jpg \
+	
+
+"$(OutDir)\testout.ppm" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testout.bmp" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\testprog.jpg
+
+!IF  "$(CFG)" == "apptest - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build
+InputDir=.
+OutDir=.\Release
+InputPath=.\testprog.jpg
+
+BuildCmds= \
+	echo $(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testoutp.ppm .\testprog.jpg \
+	$(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testoutp.ppm .\testprog.jpg \
+	echo $(OutDir)\jpegtran -outfile $(OutDir)\testoutt.jpg .\testprog.jpg \
+	$(OutDir)\jpegtran -outfile $(OutDir)\testoutt.jpg .\testprog.jpg \
+	
+
+"$(OutDir)\testoutp.ppm" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testoutt.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "apptest - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+# Begin Custom Build - Testing - $(InputPath)
+InputDir=.
+OutDir=.\Debug
+InputPath=.\testprog.jpg
+
+BuildCmds= \
+	echo $(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testoutp.ppm .\testprog.jpg \
+	$(OutDir)\djpeg -dct int -ppm -outfile $(OutDir)\testoutp.ppm .\testprog.jpg \
+	echo $(OutDir)\jpegtran -outfile $(OutDir)\testoutt.jpg .\testprog.jpg \
+	$(OutDir)\jpegtran -outfile $(OutDir)\testoutt.jpg .\testprog.jpg \
+	
+
+"$(OutDir)\testoutp.ppm" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+
+"$(OutDir)\testoutt.jpg" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+   $(BuildCmds)
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/cjpeg.dsp b/vc6proj/cjpeg.dsp
new file mode 100644
index 0000000..573e619
--- /dev/null
+++ b/vc6proj/cjpeg.dsp
@@ -0,0 +1,164 @@
+# Microsoft Developer Studio Project File - Name="cjpeg" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** ÊÔ½¸¤·¤Ê¤¤¤Ç¤¯¤À¤µ¤¤ **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=cjpeg - Win32 Debug
+!MESSAGE ¤³¤ì¤ÏÍ­¸ú¤ÊŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£ ¤³¤ÎŽÌŽßŽÛŽ¼ŽÞŽªŽ¸ŽÄ¤òŽËŽÞŽÙŽÄŽÞ¤¹¤ë¤¿¤á¤Ë¤Ï NMAKE ¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£
+!MESSAGE [ŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤ÎŽ´Ž¸Ž½ŽÎŽßŽ°ŽÄ] ŽºŽÏŽÝŽÄŽÞ¤ò»ÈÍѤ·¤Æ¼Â¹Ô¤·¤Æ¤¯¤À¤µ¤¤
+!MESSAGE 
+!MESSAGE NMAKE /f "cjpeg.mak".
+!MESSAGE 
+!MESSAGE NMAKE ¤Î¼Â¹Ô»þ¤Ë¹½À®¤ò»ØÄê¤Ç¤­¤Þ¤¹
+!MESSAGE ŽºŽÏŽÝŽÄŽÞ Ž×Ž²ŽÝ¾å¤ÇŽÏŽ¸ŽÛ¤ÎÀßÄê¤òÄêµÁ¤·¤Þ¤¹¡£Îã:
+!MESSAGE 
+!MESSAGE NMAKE /f "cjpeg.mak" CFG="cjpeg - Win32 Debug"
+!MESSAGE 
+!MESSAGE ÁªÂò²Äǽ¤ÊŽËŽÞŽÙŽÄŽÞ ŽÓŽ°ŽÄŽÞ:
+!MESSAGE 
+!MESSAGE "cjpeg - Win32 Release" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE "cjpeg - Win32 Debug" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "cjpeg - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /machine:I386 /libpath:"Release" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ELSEIF  "$(CFG)" == "cjpeg - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"Debug" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ENDIF 
+
+# Begin Target
+
+# Name "cjpeg - Win32 Release"
+# Name "cjpeg - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\cdjpeg.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\cjpeg.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdbmp.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdgif.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdppm.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdrle.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdswitch.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdtarga.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\cderror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\cdjpeg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jversion.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/djpeg.dsp b/vc6proj/djpeg.dsp
new file mode 100644
index 0000000..156b378
--- /dev/null
+++ b/vc6proj/djpeg.dsp
@@ -0,0 +1,164 @@
+# Microsoft Developer Studio Project File - Name="djpeg" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=djpeg - Win32 Debug
+!MESSAGE ¤³¤ì¤ÏÍ­¸ú¤ÊŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£ ¤³¤ÎŽÌŽßŽÛŽ¼ŽÞŽªŽ¸ŽÄ¤òŽËŽÞŽÙŽÄŽÞ¤¹¤ë¤¿¤á¤Ë¤Ï NMAKE ¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£
+!MESSAGE [ŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤ÎŽ´Ž¸Ž½ŽÎŽßŽ°ŽÄ] ŽºŽÏŽÝŽÄŽÞ¤ò»ÈÍѤ·¤Æ¼Â¹Ô¤·¤Æ¤¯¤À¤µ¤¤
+!MESSAGE 
+!MESSAGE NMAKE /f "djpeg.mak".
+!MESSAGE 
+!MESSAGE NMAKE ¤Î¼Â¹Ô»þ¤Ë¹½À®¤ò»ØÄê¤Ç¤­¤Þ¤¹
+!MESSAGE ŽºŽÏŽÝŽÄŽÞ Ž×Ž²ŽÝ¾å¤ÇŽÏŽ¸ŽÛ¤ÎÀßÄê¤òÄêµÁ¤·¤Þ¤¹¡£Îã:
+!MESSAGE 
+!MESSAGE NMAKE /f "djpeg.mak" CFG="djpeg - Win32 Debug"
+!MESSAGE 
+!MESSAGE ÁªÂò²Äǽ¤ÊŽËŽÞŽÙŽÄŽÞ ŽÓŽ°ŽÄŽÞ:
+!MESSAGE 
+!MESSAGE "djpeg - Win32 Release" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE "djpeg - Win32 Debug" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "djpeg - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /machine:I386 /libpath:"Release" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ELSEIF  "$(CFG)" == "djpeg - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"Debug" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ENDIF 
+
+# Begin Target
+
+# Name "djpeg - Win32 Release"
+# Name "djpeg - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\cdjpeg.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\djpeg.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdcolmap.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\wrbmp.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\wrgif.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\wrppm.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\wrrle.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\wrtarga.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\cderror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\cdjpeg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jversion.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/jconfig.h b/vc6proj/jconfig.h
new file mode 100644
index 0000000..d5bc9f9
--- /dev/null
+++ b/vc6proj/jconfig.h
@@ -0,0 +1,48 @@
+/* jconfig.vc --- jconfig.h for Microsoft Visual C++ on Windows 95 or NT. */
+/* see jconfig.doc for explanations; this header only #defines/#undefs configuration macros */
+
+#define HAVE_PROTOTYPES
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+/* #define void char */	/* left commented out: not active in this configuration */
+/* #define const */	/* left commented out: not active in this configuration */
+#undef CHAR_IS_UNSIGNED
+#define HAVE_STDDEF_H
+#define HAVE_STDLIB_H
+#undef NEED_BSD_STRINGS
+#undef NEED_SYS_TYPES_H
+#undef NEED_FAR_POINTERS	/* we presume a 32-bit flat memory model */
+#undef NEED_SHORT_EXTERNAL_NAMES
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#define TYPEDEF_UCHAR_BOOLEAN
+
+#ifdef JPEG_INTERNALS	/* this section is seen only when JPEG_INTERNALS is defined */
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+#endif /* JPEG_INTERNALS */
+
+#if defined(JPEG_INTERNALS) || defined(JPEG_INTERNAL_OPTIONS)	/* SIMD opt-out switches; all four are left undefined here */
+#undef JSIMD_MMX_NOT_SUPPORTED	/* NOTE(review): presumably undefined means the MMX code is kept -- confirm */
+#undef JSIMD_3DNOW_NOT_SUPPORTED	/* NOTE(review): presumably same convention for 3DNow! -- confirm */
+#undef JSIMD_SSE_NOT_SUPPORTED	/* NOTE(review): presumably same convention for SSE -- confirm */
+#undef JSIMD_SSE2_NOT_SUPPORTED	/* NOTE(review): presumably same convention for SSE2 -- confirm */
+#endif /* JPEG_INTERNALS || JPEG_INTERNAL_OPTIONS */
+
+#ifdef JPEG_CJPEG_DJPEG	/* this section is seen only when JPEG_CJPEG_DJPEG is defined */
+
+#define BMP_SUPPORTED		/* BMP image file format */
+#define GIF_SUPPORTED		/* GIF image file format */
+#define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
+#undef RLE_SUPPORTED		/* Utah RLE image file format (left undefined in this configuration) */
+#define TARGA_SUPPORTED		/* Targa image file format */
+
+#define TWO_FILE_COMMANDLINE	/* optional */
+#define USE_SETMODE		/* Microsoft has setmode() */
+#undef NEED_SIGNAL_CATCHER
+#undef DONT_USE_B_MODE
+#undef PROGRESS_REPORT		/* optional (left undefined in this configuration) */
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/vc6proj/jpegtran.dsp b/vc6proj/jpegtran.dsp
new file mode 100644
index 0000000..8dc38d4
--- /dev/null
+++ b/vc6proj/jpegtran.dsp
@@ -0,0 +1,156 @@
+# Microsoft Developer Studio Project File - Name="jpegtran" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=jpegtran - Win32 Debug
+!MESSAGE ¤³¤ì¤ÏÍ­¸ú¤ÊŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£ ¤³¤ÎŽÌŽßŽÛŽ¼ŽÞŽªŽ¸ŽÄ¤òŽËŽÞŽÙŽÄŽÞ¤¹¤ë¤¿¤á¤Ë¤Ï NMAKE ¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£
+!MESSAGE [ŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤ÎŽ´Ž¸Ž½ŽÎŽßŽ°ŽÄ] ŽºŽÏŽÝŽÄŽÞ¤ò»ÈÍѤ·¤Æ¼Â¹Ô¤·¤Æ¤¯¤À¤µ¤¤
+!MESSAGE 
+!MESSAGE NMAKE /f "jpegtran.mak".
+!MESSAGE 
+!MESSAGE NMAKE ¤Î¼Â¹Ô»þ¤Ë¹½À®¤ò»ØÄê¤Ç¤­¤Þ¤¹
+!MESSAGE ŽºŽÏŽÝŽÄŽÞ Ž×Ž²ŽÝ¾å¤ÇŽÏŽ¸ŽÛ¤ÎÀßÄê¤òÄêµÁ¤·¤Þ¤¹¡£Îã:
+!MESSAGE 
+!MESSAGE NMAKE /f "jpegtran.mak" CFG="jpegtran - Win32 Debug"
+!MESSAGE 
+!MESSAGE ÁªÂò²Äǽ¤ÊŽËŽÞŽÙŽÄŽÞ ŽÓŽ°ŽÄŽÞ:
+!MESSAGE 
+!MESSAGE "jpegtran - Win32 Release" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE "jpegtran - Win32 Debug" ("Win32 (x86) Console Application" ÍÑ)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "jpegtran - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /machine:I386 /libpath:"Release" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ELSEIF  "$(CFG)" == "jpegtran - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 libjpeg.lib kernel32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"Debug" /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ENDIF 
+
+# Begin Target
+
+# Name "jpegtran - Win32 Release"
+# Name "jpegtran - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\cdjpeg.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpegtran.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\rdswitch.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\transupp.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\cderror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\cdjpeg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpegint.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jversion.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\transupp.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/libjpeg.dsp b/vc6proj/libjpeg.dsp
new file mode 100644
index 0000000..59647d0
--- /dev/null
+++ b/vc6proj/libjpeg.dsp
@@ -0,0 +1,1751 @@
+# Microsoft Developer Studio Project File - Name="libjpeg" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=libjpeg - Win32 Debug
+!MESSAGE ¤³¤ì¤ÏÍ­¸ú¤ÊŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¡£ ¤³¤ÎŽÌŽßŽÛŽ¼ŽÞŽªŽ¸ŽÄ¤òŽËŽÞŽÙŽÄŽÞ¤¹¤ë¤¿¤á¤Ë¤Ï NMAKE ¤ò»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤¡£
+!MESSAGE [ŽÒŽ²Ž¸ŽÌŽ§Ž²ŽÙ¤ÎŽ´Ž¸Ž½ŽÎŽßŽ°ŽÄ] ŽºŽÏŽÝŽÄŽÞ¤ò»ÈÍѤ·¤Æ¼Â¹Ô¤·¤Æ¤¯¤À¤µ¤¤
+!MESSAGE 
+!MESSAGE NMAKE /f "libjpeg.mak".
+!MESSAGE 
+!MESSAGE NMAKE ¤Î¼Â¹Ô»þ¤Ë¹½À®¤ò»ØÄê¤Ç¤­¤Þ¤¹
+!MESSAGE ŽºŽÏŽÝŽÄŽÞ Ž×Ž²ŽÝ¾å¤ÇŽÏŽ¸ŽÛ¤ÎÀßÄê¤òÄêµÁ¤·¤Þ¤¹¡£Îã:
+!MESSAGE 
+!MESSAGE NMAKE /f "libjpeg.mak" CFG="libjpeg - Win32 Debug"
+!MESSAGE 
+!MESSAGE ÁªÂò²Äǽ¤ÊŽËŽÞŽÙŽÄŽÞ ŽÓŽ°ŽÄŽÞ:
+!MESSAGE 
+!MESSAGE "libjpeg - Win32 Release" ("Win32 (x86) Static Library" ÍÑ)
+!MESSAGE "libjpeg - Win32 Debug" ("Win32 (x86) Static Library" ÍÑ)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /D "WIN32" /D "NDEBUG" /D "_LIB" /YX /Zl /FD /GF /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_LIB" /YX /Zl /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ENDIF 
+
+# Begin Target
+
+# Name "libjpeg - Win32 Release"
+# Name "libjpeg - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\jcapimin.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcapistd.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jccoefct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jccolor.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcdctmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jchuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcinit.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmainct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmarker.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcmaster.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcomapi.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcparam.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcphuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcprepct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcsample.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jctrans.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdapimin.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdapistd.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdatadst.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdatasrc.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcoefct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcolor.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jddctmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdhuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdinput.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmainct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmarker.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmaster.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmerge.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdphuff.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdpostct.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdsample.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdtrans.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemmgr.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemnobs.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jquant1.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jquant2.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\jutils.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\jchuff.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcolsamp.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdct.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdhuff.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmemsys.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpegint.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jversion.h
+# End Source File
+# End Group
+# Begin Group "NASM Source"
+
+# PROP Default_Filter "asm"
+# Begin Source File
+
+SOURCE=.\jccolmmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCCOL="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jccolmmx.asm
+InputName=jccolmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCCOL="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jccolmmx.asm
+InputName=jccolmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jccolss2.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCCOLS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jccolss2.asm
+InputName=jccolss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCCOLS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jccolss2.asm
+InputName=jccolss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqnt3dn.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqnt3dn.asm
+InputName=jcqnt3dn
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqnt3dn.asm
+InputName=jcqnt3dn
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqntflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqntflt.asm
+InputName=jcqntflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqntflt.asm
+InputName=jcqntflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqntint.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqntint.asm
+InputName=jcqntint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqntint.asm
+InputName=jcqntint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqntmmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqntmmx.asm
+InputName=jcqntmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqntmmx.asm
+InputName=jcqntmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqnts2f.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqnts2f.asm
+InputName=jcqnts2f
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqnts2f.asm
+InputName=jcqnts2f
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqnts2i.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqnts2i.asm
+InputName=jcqnts2i
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqnts2i.asm
+InputName=jcqnts2i
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcqntsse.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTSS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcqntsse.asm
+InputName=jcqntsse
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCQNTSS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcqntsse.asm
+InputName=jcqntsse
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcsammmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCSAM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcsammmx.asm
+InputName=jcsammmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCSAM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcsammmx.asm
+InputName=jcsammmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jcsamss2.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCSAMS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jcsamss2.asm
+InputName=jcsamss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JCSAMS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jcsamss2.asm
+InputName=jcsamss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcolmmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDCOL="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdcolmmx.asm
+InputName=jdcolmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDCOL="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdcolmmx.asm
+InputName=jdcolmmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdcolss2.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDCOLS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdcolss2.asm
+InputName=jdcolss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDCOLS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdcolss2.asm
+InputName=jdcolss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmermmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDMER="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdmermmx.asm
+InputName=jdmermmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDMER="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdmermmx.asm
+InputName=jdmermmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdmerss2.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDMERS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdmerss2.asm
+InputName=jdmerss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDMERS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdmerss2.asm
+InputName=jdmerss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdsammmx.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDSAM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdsammmx.asm
+InputName=jdsammmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDSAM="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdsammmx.asm
+InputName=jdsammmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdsamss2.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDSAMS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jdsamss2.asm
+InputName=jdsamss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JDSAMS="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jcolsamp.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jdsamss2.asm
+InputName=jdsamss2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jf3dnflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JF3DN="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jf3dnflt.asm
+InputName=jf3dnflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JF3DN="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jf3dnflt.asm
+InputName=jf3dnflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfdctflt.asm
+InputName=jfdctflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfdctflt.asm
+InputName=jfdctflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctfst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfdctfst.asm
+InputName=jfdctfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfdctfst.asm
+InputName=jfdctfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfdctint.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfdctint.asm
+InputName=jfdctint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFDCTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfdctint.asm
+InputName=jfdctint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfmmxfst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFMMX="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfmmxfst.asm
+InputName=jfmmxfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFMMX="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfmmxfst.asm
+InputName=jfmmxfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfmmxint.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFMMXI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfmmxint.asm
+InputName=jfmmxint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFMMXI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfmmxint.asm
+InputName=jfmmxint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfss2fst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfss2fst.asm
+InputName=jfss2fst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfss2fst.asm
+InputName=jfss2fst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfss2int.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSS2I="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfss2int.asm
+InputName=jfss2int
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSS2I="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfss2int.asm
+InputName=jfss2int
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jfsseflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSSE="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jfsseflt.asm
+InputName=jfsseflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JFSSE="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jfsseflt.asm
+InputName=jfsseflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\ji3dnflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JI3DN="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\ji3dnflt.asm
+InputName=ji3dnflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JI3DN="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\ji3dnflt.asm
+InputName=ji3dnflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jidctflt.asm
+InputName=jidctflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCT="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jidctflt.asm
+InputName=jidctflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctfst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jidctfst.asm
+InputName=jidctfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTF="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jidctfst.asm
+InputName=jidctfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctint.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jidctint.asm
+InputName=jidctint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jidctint.asm
+InputName=jidctint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jidctred.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTR="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jidctred.asm
+InputName=jidctred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIDCTR="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jidctred.asm
+InputName=jidctred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jimmxfst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMX="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jimmxfst.asm
+InputName=jimmxfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMX="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jimmxfst.asm
+InputName=jimmxfst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jimmxint.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMXI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jimmxint.asm
+InputName=jimmxint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMXI="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jimmxint.asm
+InputName=jimmxint
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jimmxred.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMXR="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jimmxred.asm
+InputName=jimmxred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JIMMXR="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jimmxred.asm
+InputName=jimmxred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jiss2flt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jiss2flt.asm
+InputName=jiss2flt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jiss2flt.asm
+InputName=jiss2flt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jiss2fst.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2F="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jiss2fst.asm
+InputName=jiss2fst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2F="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jiss2fst.asm
+InputName=jiss2fst
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jiss2int.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2I="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jiss2int.asm
+InputName=jiss2int
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2I="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jiss2int.asm
+InputName=jiss2int
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jiss2red.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2R="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jiss2red.asm
+InputName=jiss2red
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISS2R="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jiss2red.asm
+InputName=jiss2red
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jisseflt.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISSE="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jisseflt.asm
+InputName=jisseflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JISSE="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	"jdct.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jisseflt.asm
+InputName=jisseflt
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimdcpu.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JSIMD="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jsimdcpu.asm
+InputName=jsimdcpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JSIMD="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jsimdcpu.asm
+InputName=jsimdcpu
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimdw32.asm
+
+!IF  "$(CFG)" == "libjpeg - Win32 Release"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JSIMDW="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Release
+InputPath=.\jsimdw32.asm
+InputName=jsimdw32
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "libjpeg - Win32 Debug"
+
+# PROP Ignore_Default_Tool 1
+USERDEP__JSIMDW="$(IntDir)\jsimdcfg.inc"	"jsimdext.inc"	
+# Begin Custom Build - Assembling - $(InputPath)
+IntDir=.\Debug
+InputPath=.\jsimdw32.asm
+InputName=jsimdw32
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasmw -Xvc -fwin32 -DWIN32 -I $(IntDir)\ -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# End Group
+# Begin Group "NASM Header"
+
+# PROP Default_Filter "inc"
+# Begin Source File
+
+SOURCE=.\jcolsamp.inc
+# End Source File
+# Begin Source File
+
+SOURCE=.\jdct.inc
+# End Source File
+# Begin Source File
+
+SOURCE=.\jsimdext.inc
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/libjpeg.dsw b/vc6proj/libjpeg.dsw
new file mode 100644
index 0000000..4ace153
--- /dev/null
+++ b/vc6proj/libjpeg.dsw
@@ -0,0 +1,134 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# 警告: このﾜｰｸｽﾍﾟｰｽ ﾌｧｲﾙ を編集または削除しないでください!
+
+###############################################################################
+
+Project: "apptest"=".\apptest.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name cjpeg
+    End Project Dependency
+    Begin Project Dependency
+    Project_Dep_Name djpeg
+    End Project Dependency
+    Begin Project Dependency
+    Project_Dep_Name jpegtran
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "cjpeg"=".\cjpeg.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libjpeg
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "djpeg"=".\djpeg.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libjpeg
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "jpegtran"=".\jpegtran.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name libjpeg
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "libjpeg"=".\libjpeg.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name makecfg
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "makecfg"=".\makecfg.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "rdjpgcom"=".\rdjpgcom.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "wrjpgcom"=".\wrjpgcom.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/vc6proj/makecfg.dsp b/vc6proj/makecfg.dsp
new file mode 100644
index 0000000..dbe914a
--- /dev/null
+++ b/vc6proj/makecfg.dsp
@@ -0,0 +1,142 @@
+# Microsoft Developer Studio Project File - Name="makecfg" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** 編集しないでください **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=makecfg - Win32 Debug
+!MESSAGE これは有効なﾒｲｸﾌｧｲﾙではありません。 このﾌﾟﾛｼﾞｪｸﾄをﾋﾞﾙﾄﾞするためには NMAKE を使用してください。
+!MESSAGE [ﾒｲｸﾌｧｲﾙのｴｸｽﾎﾟｰﾄ] ｺﾏﾝﾄﾞを使用して実行してください
+!MESSAGE 
+!MESSAGE NMAKE /f "makecfg.mak".
+!MESSAGE 
+!MESSAGE NMAKE の実行時に構成を指定できます
+!MESSAGE ｺﾏﾝﾄﾞ ﾗｲﾝ上でﾏｸﾛの設定を定義します。例:
+!MESSAGE 
+!MESSAGE NMAKE /f "makecfg.mak" CFG="makecfg - Win32 Debug"
+!MESSAGE 
+!MESSAGE 選択可能なﾋﾞﾙﾄﾞ ﾓｰﾄﾞ:
+!MESSAGE 
+!MESSAGE "makecfg - Win32 Release" ("Win32 (x86) Console Application" 用)
+!MESSAGE "makecfg - Win32 Debug" ("Win32 (x86) Console Application" 用)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "makecfg - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 /nologo /subsystem:console /machine:I386 /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+# Begin Custom Build - Generating - $(OutDir)\jsimdcfg.inc
+OutDir=.\Release
+InputPath=.\Release\makecfg.exe
+SOURCE="$(InputPath)"
+
+"$(OutDir)\jsimdcfg.inc" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	$(TargetPath) > $(OutDir)\jsimdcfg.inc
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "makecfg - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+# Begin Custom Build - Generating - $(OutDir)\jsimdcfg.inc
+OutDir=.\Debug
+InputPath=.\Debug\makecfg.exe
+SOURCE="$(InputPath)"
+
+"$(OutDir)\jsimdcfg.inc" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	$(TargetPath) > $(OutDir)\jsimdcfg.inc
+
+# End Custom Build
+
+!ENDIF 
+
+# Begin Target
+
+# Name "makecfg - Win32 Release"
+# Name "makecfg - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\makecfg.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jerror.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jmorecfg.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpegint.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jpeglib.h
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/rdjpgcom.dsp b/vc6proj/rdjpgcom.dsp
new file mode 100644
index 0000000..7a5eda2
--- /dev/null
+++ b/vc6proj/rdjpgcom.dsp
@@ -0,0 +1,112 @@
+# Microsoft Developer Studio Project File - Name="rdjpgcom" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** 編集しないでください **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=rdjpgcom - Win32 Debug
+!MESSAGE これは有効なﾒｲｸﾌｧｲﾙではありません。 このﾌﾟﾛｼﾞｪｸﾄをﾋﾞﾙﾄﾞするためには NMAKE を使用してください。
+!MESSAGE [ﾒｲｸﾌｧｲﾙのｴｸｽﾎﾟｰﾄ] ｺﾏﾝﾄﾞを使用して実行してください
+!MESSAGE 
+!MESSAGE NMAKE /f "rdjpgcom.mak".
+!MESSAGE 
+!MESSAGE NMAKE の実行時に構成を指定できます
+!MESSAGE ｺﾏﾝﾄﾞ ﾗｲﾝ上でﾏｸﾛの設定を定義します。例:
+!MESSAGE 
+!MESSAGE NMAKE /f "rdjpgcom.mak" CFG="rdjpgcom - Win32 Debug"
+!MESSAGE 
+!MESSAGE 選択可能なﾋﾞﾙﾄﾞ ﾓｰﾄﾞ:
+!MESSAGE 
+!MESSAGE "rdjpgcom - Win32 Release" ("Win32 (x86) Console Application" 用)
+!MESSAGE "rdjpgcom - Win32 Debug" ("Win32 (x86) Console Application" 用)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "rdjpgcom - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib /nologo /subsystem:console /machine:I386 /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ELSEIF  "$(CFG)" == "rdjpgcom - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ENDIF 
+
+# Begin Target
+
+# Name "rdjpgcom - Win32 Release"
+# Name "rdjpgcom - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\rdjpgcom.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/vc6proj/wrjpgcom.dsp b/vc6proj/wrjpgcom.dsp
new file mode 100644
index 0000000..7fdf9ec
--- /dev/null
+++ b/vc6proj/wrjpgcom.dsp
@@ -0,0 +1,112 @@
+# Microsoft Developer Studio Project File - Name="wrjpgcom" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** 編集しないでください **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=wrjpgcom - Win32 Debug
+!MESSAGE これは有効なﾒｲｸﾌｧｲﾙではありません。 このﾌﾟﾛｼﾞｪｸﾄをﾋﾞﾙﾄﾞするためには NMAKE を使用してください。
+!MESSAGE [ﾒｲｸﾌｧｲﾙのｴｸｽﾎﾟｰﾄ] ｺﾏﾝﾄﾞを使用して実行してください
+!MESSAGE 
+!MESSAGE NMAKE /f "wrjpgcom.mak".
+!MESSAGE 
+!MESSAGE NMAKE の実行時に構成を指定できます
+!MESSAGE ｺﾏﾝﾄﾞ ﾗｲﾝ上でﾏｸﾛの設定を定義します。例:
+!MESSAGE 
+!MESSAGE NMAKE /f "wrjpgcom.mak" CFG="wrjpgcom - Win32 Debug"
+!MESSAGE 
+!MESSAGE 選択可能なﾋﾞﾙﾄﾞ ﾓｰﾄﾞ:
+!MESSAGE 
+!MESSAGE "wrjpgcom - Win32 Release" ("Win32 (x86) Console Application" 用)
+!MESSAGE "wrjpgcom - Win32 Debug" ("Win32 (x86) Console Application" 用)
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "wrjpgcom - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD CPP /nologo /W3 /O2 /GF /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /YX /FD /c
+# ADD BASE RSC /l 0x411 /d "NDEBUG"
+# ADD RSC /l 0x411 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib /nologo /subsystem:console /machine:I386 /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ELSEIF  "$(CFG)" == "wrjpgcom - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x411 /d "_DEBUG"
+# ADD RSC /l 0x411 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /opt:nowin98
+# SUBTRACT LINK32 /pdb:none
+
+!ENDIF 
+
+# Begin Target
+
+# Name "wrjpgcom - Win32 Release"
+# Name "wrjpgcom - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\wrjpgcom.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\jconfig.h
+# End Source File
+# Begin Source File
+
+SOURCE=.\jinclude.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/wrbmp.c b/wrbmp.c
index 3283b0f..517441a 100644
--- a/wrbmp.c
+++ b/wrbmp.c
@@ -5,6 +5,13 @@
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
+ * ---------------------------------------------------------------------
+ * x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * This file has been modified to improve performance.
+ * Last Modified : October 19, 2004
+ * ---------------------------------------------------------------------
+ *
  * This file contains routines to write output images in Microsoft "BMP"
  * format (MS Windows 3.x and OS/2 1.x flavors).
  * Either 8-bit colormapped or 24-bit full-color format can be written.
@@ -346,9 +353,11 @@
   bmp_dest_ptr dest = (bmp_dest_ptr) dinfo;
   register FILE * outfile = dest->pub.output_file;
   JSAMPARRAY image_ptr;
+#if (BITS_IN_JSAMPLE != 8) || defined(NEED_FAR_POINTERS)
   register JSAMPROW data_ptr;
-  JDIMENSION row;
   register JDIMENSION col;
+#endif
+  JDIMENSION row;
   cd_progress_ptr progress = (cd_progress_ptr) cinfo->progress;
 
   /* Write the header and colormap */
@@ -366,11 +375,17 @@
     }
     image_ptr = (*cinfo->mem->access_virt_sarray)
       ((j_common_ptr) cinfo, dest->whole_image, row-1, (JDIMENSION) 1, FALSE);
+#if (BITS_IN_JSAMPLE == 8) && !defined(NEED_FAR_POINTERS)
+    if (JFWRITE(outfile, image_ptr[0], dest->row_width)
+	!= (size_t) dest->row_width)
+      ERREXIT(cinfo, JERR_FILE_WRITE);
+#else
     data_ptr = image_ptr[0];
     for (col = dest->row_width; col > 0; col--) {
       putc(GETJSAMPLE(*data_ptr), outfile);
       data_ptr++;
     }
+#endif
   }
   if (progress != NULL)
     progress->completed_extra_passes++;
diff --git a/wrgif.c b/wrgif.c
index 5fe8328..85cfaa8 100644
--- a/wrgif.c
+++ b/wrgif.c
@@ -1,17 +1,16 @@
 /*
  * wrgif.c
  *
- * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 1991-1996, Thomas G. Lane.
  * This file is part of the Independent JPEG Group's software.
  * For conditions of distribution and use, see the accompanying README file.
  *
- * This file contains routines to write output images in GIF format.
+ **************************************************************************
+ * WARNING: You will need an LZW patent license from Unisys in order to   *
+ * use this file legally in any commercial or shareware application.      *
+ **************************************************************************
  *
- **************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,   *
- * this code has been modified to output "uncompressed GIF" files.        *
- * There is no trace of the LZW algorithm in this file.                   *
- **************************************************************************
+ * This file contains routines to write output images in GIF format.
  *
  * These routines may need modification for non-Unix environments or
  * specialized applications.  As they stand, they assume output to
@@ -42,6 +41,40 @@
 #ifdef GIF_SUPPORTED
 
 
+#define	MAX_LZW_BITS	12	/* maximum LZW code size (4096 symbols) */
+
+typedef INT16 code_int;		/* must hold -1 .. 2**MAX_LZW_BITS */
+
+#define LZW_TABLE_SIZE	((code_int) 1 << MAX_LZW_BITS)
+
+#define HSIZE		5003	/* hash table size for 80% occupancy */
+
+typedef int hash_int;		/* must hold -2*HSIZE..2*HSIZE */
+
+#define MAXCODE(n_bits)	(((code_int) 1 << (n_bits)) - 1)
+
+
+/*
+ * The LZW hash table consists of two parallel arrays:
+ *   hash_code[i]	code of symbol in slot i, or 0 if empty slot
+ *   hash_value[i]	symbol's value; undefined if empty slot
+ * where slot values (i) range from 0 to HSIZE-1.  The symbol value is
+ * its prefix symbol's code concatenated with its suffix character.
+ *
+ * Algorithm:  use open addressing double hashing (no chaining) on the
+ * prefix code / suffix character combination.  We do a variant of Knuth's
+ * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
+ * secondary probe.
+ *
+ * The hash_value[] table is allocated from FAR heap space since it would
+ * use up rather a lot of the near data space in a PC.
+ */
+
+typedef INT32 hash_entry;	/* must hold (code_int<<8) | byte */
+
+#define HASH_ENTRY(prefix,suffix)  ((((hash_entry) (prefix)) << 8) | (suffix))
+
+
 /* Private version of data destination object */
 
 typedef struct {
@@ -51,14 +84,23 @@
 
   /* State for packing variable-width codes into a bitstream */
   int n_bits;			/* current number of bits/code */
-  int maxcode;			/* maximum code, given n_bits */
+  code_int maxcode;		/* maximum code, given n_bits */
+  int init_bits;		/* initial n_bits ... restored after clear */
   INT32 cur_accum;		/* holds bits not yet output */
   int cur_bits;			/* # of bits in cur_accum */
 
-  /* State for GIF code assignment */
-  int ClearCode;		/* clear code (doesn't change) */
-  int EOFCode;			/* EOF code (ditto) */
-  int code_counter;		/* counts output symbols */
+  /* LZW string construction */
+  code_int waiting_code;	/* symbol not yet output; may be extendable */
+  boolean first_byte;		/* if TRUE, waiting_code is not valid */
+
+  /* State for LZW code assignment */
+  code_int ClearCode;		/* clear code (doesn't change) */
+  code_int EOFCode;		/* EOF code (ditto) */
+  code_int free_code;		/* first not-yet-used symbol code */
+
+  /* LZW hash table */
+  code_int *hash_code;		/* => hash table of symbol codes */
+  hash_entry FAR *hash_value;	/* => hash table of symbol values */
 
   /* GIF data packet construction buffer */
   int bytesinpkt;		/* # of bytes in current packet */
@@ -68,12 +110,9 @@
 
 typedef gif_dest_struct * gif_dest_ptr;
 
-/* Largest value that will fit in N bits */
-#define MAXCODE(n_bits)	((1 << (n_bits)) - 1)
-
 
 /*
- * Routines to package finished data bytes into GIF data blocks.
+ * Routines to package compressed data bytes into GIF data blocks.
  * A data block consists of a count byte (1..255) and that many data bytes.
  */
 
@@ -102,7 +141,7 @@
 /* Routine to convert variable-width codes into a byte stream */
 
 LOCAL(void)
-output (gif_dest_ptr dinfo, int code)
+output (gif_dest_ptr dinfo, code_int code)
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
@@ -114,67 +153,123 @@
     dinfo->cur_accum >>= 8;
     dinfo->cur_bits -= 8;
   }
+
+  /*
+   * If the next entry is going to be too big for the code size,
+   * then increase it, if possible.  We do this here to ensure
+   * that it's done in sync with the decoder's codesize increases.
+   */
+  if (dinfo->free_code > dinfo->maxcode) {
+    dinfo->n_bits++;
+    if (dinfo->n_bits == MAX_LZW_BITS)
+      dinfo->maxcode = LZW_TABLE_SIZE; /* free_code will never exceed this */
+    else
+      dinfo->maxcode = MAXCODE(dinfo->n_bits);
+  }
 }
 
 
-/* The pseudo-compression algorithm.
- *
- * In this module we simply output each pixel value as a separate symbol;
- * thus, no compression occurs.  In fact, there is expansion of one bit per
- * pixel, because we use a symbol width one bit wider than the pixel width.
- *
- * GIF ordinarily uses variable-width symbols, and the decoder will expect
- * to ratchet up the symbol width after a fixed number of symbols.
- * To simplify the logic and keep the expansion penalty down, we emit a
- * GIF Clear code to reset the decoder just before the width would ratchet up.
- * Thus, all the symbols in the output file will have the same bit width.
- * Note that emitting the Clear codes at the right times is a mere matter of
- * counting output symbols and is in no way dependent on the LZW patent.
- *
- * With a small basic pixel width (low color count), Clear codes will be
- * needed very frequently, causing the file to expand even more.  So this
- * simplistic approach wouldn't work too well on bilevel images, for example.
- * But for output of JPEG conversions the pixel width will usually be 8 bits
- * (129 to 256 colors), so the overhead added by Clear symbols is only about
- * one symbol in every 256.
- */
+/* The LZW algorithm proper */
+
+
+LOCAL(void)
+clear_hash (gif_dest_ptr dinfo)
+/* Mark every hash table slot empty by zeroing all HSIZE entries of hash_code[] */
+{
+  /* hash_value[] is left untouched: a slot whose hash_code is 0 counts as empty */
+  MEMZERO(dinfo->hash_code, HSIZE * SIZEOF(code_int));
+}
+
+
+LOCAL(void)
+clear_block (gif_dest_ptr dinfo)
+/* Reset compressor (empty symbol table, initial code size) and issue a Clear code */
+{
+  clear_hash(dinfo);			/* delete all the symbols */
+  dinfo->free_code = dinfo->ClearCode + 2; /* first code after Clear and EOF */
+  output(dinfo, dinfo->ClearCode);	/* inform decoder; emitted before n_bits is reset below */
+  dinfo->n_bits = dinfo->init_bits;	/* reset code size to its initial value */
+  dinfo->maxcode = MAXCODE(dinfo->n_bits); /* largest code for the reset size */
+}
+
 
 LOCAL(void)
 compress_init (gif_dest_ptr dinfo, int i_bits)
-/* Initialize pseudo-compressor */
+/* Initialize LZW compressor */
 {
   /* init all the state variables */
-  dinfo->n_bits = i_bits;
+  dinfo->n_bits = dinfo->init_bits = i_bits;
   dinfo->maxcode = MAXCODE(dinfo->n_bits);
-  dinfo->ClearCode = (1 << (i_bits - 1));
+  dinfo->ClearCode = ((code_int) 1 << (i_bits - 1));
   dinfo->EOFCode = dinfo->ClearCode + 1;
-  dinfo->code_counter = dinfo->ClearCode + 2;
+  dinfo->free_code = dinfo->ClearCode + 2;
+  dinfo->first_byte = TRUE;	/* no waiting symbol yet */
   /* init output buffering vars */
   dinfo->bytesinpkt = 0;
   dinfo->cur_accum = 0;
   dinfo->cur_bits = 0;
+  /* clear hash table */
+  clear_hash(dinfo);
   /* GIF specifies an initial Clear code */
   output(dinfo, dinfo->ClearCode);
 }
 
 
 LOCAL(void)
-compress_pixel (gif_dest_ptr dinfo, int c)
-/* Accept and "compress" one pixel value.
- * The given value must be less than n_bits wide.
- */
+compress_byte (gif_dest_ptr dinfo, int c)
+/* Accept and compress one 8-bit byte */
 {
-  /* Output the given pixel value as a symbol. */
-  output(dinfo, c);
-  /* Issue Clear codes often enough to keep the reader from ratcheting up
-   * its symbol size.
-   */
-  if (dinfo->code_counter < dinfo->maxcode) {
-    dinfo->code_counter++;
-  } else {
-    output(dinfo, dinfo->ClearCode);
-    dinfo->code_counter = dinfo->ClearCode + 2;	/* reset the counter */
+  register hash_int i;
+  register hash_int disp;
+  register hash_entry probe_value;
+
+  if (dinfo->first_byte) {	/* need to initialize waiting_code */
+    dinfo->waiting_code = c;
+    dinfo->first_byte = FALSE;
+    return;
   }
+
+  /* Probe hash table to see if a symbol exists for
+   * waiting_code followed by c.
+   * If so, replace waiting_code by that symbol and return.
+   */
+  i = ((hash_int) c << (MAX_LZW_BITS-8)) + dinfo->waiting_code;
+  /* i is less than twice 2**MAX_LZW_BITS, therefore less than twice HSIZE */
+  if (i >= HSIZE)
+    i -= HSIZE;
+
+  probe_value = HASH_ENTRY(dinfo->waiting_code, c);
+  
+  if (dinfo->hash_code[i] != 0) { /* is first probed slot empty? */
+    if (dinfo->hash_value[i] == probe_value) {
+      dinfo->waiting_code = dinfo->hash_code[i];
+      return;
+    }
+    if (i == 0)			/* secondary hash (after G. Knott) */
+      disp = 1;
+    else
+      disp = HSIZE - i;
+    for (;;) {
+      i -= disp;
+      if (i < 0)
+	i += HSIZE;
+      if (dinfo->hash_code[i] == 0)
+	break;			/* hit empty slot */
+      if (dinfo->hash_value[i] == probe_value) {
+	dinfo->waiting_code = dinfo->hash_code[i];
+	return;
+      }
+    }
+  }
+
+  /* here when hashtable[i] is an empty slot; desired symbol not in table */
+  output(dinfo, dinfo->waiting_code);
+  if (dinfo->free_code < LZW_TABLE_SIZE) {
+    dinfo->hash_code[i] = dinfo->free_code++; /* add symbol to hashtable */
+    dinfo->hash_value[i] = probe_value;
+  } else
+    clear_block(dinfo);
+  dinfo->waiting_code = c;
 }
 
 
@@ -182,6 +277,9 @@
 compress_term (gif_dest_ptr dinfo)
 /* Clean up at end */
 {
+  /* Flush out the buffered code */
+  if (! dinfo->first_byte)
+    output(dinfo, dinfo->waiting_code);
   /* Send an EOF code */
   output(dinfo, dinfo->EOFCode);
   /* Flush the bit-packing buffer */
@@ -289,7 +387,7 @@
   /* Write Initial Code Size byte */
   putc(InitCodeSize, dinfo->pub.output_file);
 
-  /* Initialize for "compression" of image data */
+  /* Initialize for LZW compression of image data */
   compress_init(dinfo, InitCodeSize+1);
 }
 
@@ -325,7 +423,7 @@
 
   ptr = dest->pub.buffer[0];
   for (col = cinfo->output_width; col > 0; col--) {
-    compress_pixel(dest, GETJSAMPLE(*ptr++));
+    compress_byte(dest, GETJSAMPLE(*ptr++));
   }
 }
 
@@ -339,7 +437,7 @@
 {
   gif_dest_ptr dest = (gif_dest_ptr) dinfo;
 
-  /* Flush "compression" mechanism */
+  /* Flush LZW mechanism */
   compress_term(dest);
   /* Write a zero-length data block to end the series */
   putc(0, dest->pub.output_file);
@@ -393,6 +491,14 @@
     ((j_common_ptr) cinfo, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION) 1);
   dest->pub.buffer_height = 1;
 
+  /* Allocate space for hash table */
+  dest->hash_code = (code_int *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				HSIZE * SIZEOF(code_int));
+  dest->hash_value = (hash_entry FAR *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+				HSIZE * SIZEOF(hash_entry));
+
   return (djpeg_dest_ptr) dest;
 }