MMI: Fix comp. perf. issue w/ unaligned image rows

Using ldc1 with a non-64-bit-aligned memory location causes as much as a
10x slow-down in overall compression performance.
diff --git a/ChangeLog.md b/ChangeLog.md
index ebd0e3c..a855bc4 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -34,6 +34,9 @@
 7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
 `ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
 
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
 
 2.0.1
 =====
diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c
index e1c4e69..a94b53b 100644
--- a/simd/loongson/jccolext-mmi.c
+++ b/simd/loongson/jccolext-mmi.c
@@ -2,12 +2,13 @@
  * Loongson MMI optimizations for libjpeg-turbo
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
  *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
  *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
  *
  * Based on the x86 SIMD extension for IJG JPEG library
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -184,9 +185,15 @@
               "$14", "memory"
            );
       } else {
-        mmA = _mm_load_si64((__m64 *)&inptr[0]);
-        mmG = _mm_load_si64((__m64 *)&inptr[8]);
-        mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
         inptr += RGB_PIXELSIZE * 8;
       }
       mmD = mmA;
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h
index 4aea763..50d166b 100644
--- a/simd/loongson/loongson-mmintrin.h
+++ b/simd/loongson/loongson-mmintrin.h
@@ -1,8 +1,9 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -41,7 +42,7 @@
 
 /********** Set Operations **********/
 
-extern __inline __m64
+extern __inline __m64 FUNCTION_ATTRIBS
 _mm_setzero_si64(void)
 {
   return 0.0;
@@ -1245,6 +1246,22 @@
   asm("ldc1 %0, %1\n\t"
       : "=f" (ret)
       : "m" (*src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("gsldlc1 %0,  7(%1)\n\t"
+      "gsldrc1 %0,  0(%1)\n\t"
+      : "=f" (ret)
+      : "r" (src)
+      : "memory"
      );
 
   return ret;