Fix invalid memory access and optimise Blit_3or4_to_3or4__*

Fix an invalid write at the last pixel of the surface. It happens
when the surface has no padding (pitch == w * bpp) and bpp is 3, for a
plain blit (no colorkey) taking the NO_ALPHA path with the same or the
inverse RGB triplet order.

Optimise by using 32-bit (Uint32) accesses. Speed-up factor (before -> after):

BGR24 -> ARGB8888 :  faster x1.897875   (362405 -> 190953)
RGB24 -> ABGR8888 :  faster x1.660416   (363304 -> 218803)

ABGR8888 -> RGB24 :  faster x1.686319   (334962 -> 198635)
ARGB8888 -> BGR24 :  faster x1.691868   (324524 -> 191814)
BGR24 -> RGB888 :  faster x1.678459   (326811 -> 194709)
BGR888 -> RGB24 :  faster x1.731772   (327724 -> 189242)
RGB24 -> BGR888 :  faster x1.690989   (328916 -> 194511)
RGB888 -> BGR24 :  faster x1.698333   (326175 -> 192056)
diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index c31cd9a..9a2f241 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -2938,15 +2938,54 @@
     if (dstfmt->Amask) {
         /* SET_ALPHA */
         Uint32 mask = info->a << dstfmt->Ashift;
+        int last_line = 0;
+        if (srcbpp == 3 && height) {
+            height -= 1;
+            last_line = 1;
+        }
+
         while (height--) {
             /* *INDENT-OFF* */
             DUFFS_LOOP(
             {
                 Uint32  *dst32 = (Uint32*)dst;
+                Uint32  *src32 = (Uint32*)src;
+                *dst32 = *src32 | mask;
+                dst += 4;
+                src += srcbpp;
+            }, width);
+            /* *INDENT-ON* */
+            src += srcskip;
+            dst += dstskip;
+        }
+
+        if (last_line) {
+            while (width--) {
+                Uint32  *dst32 = (Uint32*)dst;
                 Uint8 s0 = src[0];
                 Uint8 s1 = src[1];
                 Uint8 s2 = src[2];
                 *dst32 = (s0) | (s1 << 8) | (s2 << 16) | mask;
+                dst += 4;
+                src += srcbpp;
+            }
+        }
+    } else {
+        /* NO_ALPHA */
+        int mask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
+        int last_line = 0;
+        if ((dstbpp == 3 || srcbpp == 3) && height) {
+            height -= 1;
+            last_line = 1;
+        }
+
+        while (height--) {
+            /* *INDENT-OFF* */
+            DUFFS_LOOP(
+            {
+                Uint32  *dst32 = (Uint32*)dst;
+                Uint32  *src32 = (Uint32*)src;
+                *dst32 = *src32 & mask;
                 dst += dstbpp;
                 src += srcbpp;
             }, width);
@@ -2954,23 +2993,18 @@
             src += srcskip;
             dst += dstskip;
         }
-    } else {
-        /* NO_ALPHA */
-        while (height--) {
-            /* *INDENT-OFF* */
-            DUFFS_LOOP(
-            {
-                Uint32  *dst32 = (Uint32*)dst;
+
+        if (last_line) {
+            while (width--) {
                 Uint8 s0 = src[0];
                 Uint8 s1 = src[1];
                 Uint8 s2 = src[2];
-                *dst32 = (s0) | (s1 << 8) | (s2 << 16);
+                dst[0] = s0;
+                dst[1] = s1;
+                dst[2] = s2;
                 dst += dstbpp;
                 src += srcbpp;
-            }, width);
-            /* *INDENT-ON* */
-            src += srcskip;
-            dst += dstskip;
+            }
         }
     }
     return;
@@ -3036,6 +3070,12 @@
         }
     } else {
         /* NO_ALPHA */
+        int last_line = 0;
+        if (dstbpp == 3 && height) {
+            height -= 1;
+            last_line = 1;
+        }
+
         while (height--) {
             /* *INDENT-OFF* */
             DUFFS_LOOP(
@@ -3053,6 +3093,20 @@
             src += srcskip;
             dst += dstskip;
         }
+
+        if (last_line) {
+            while (width--) {
+                Uint8 s0 = src[0];
+                Uint8 s1 = src[1];
+                Uint8 s2 = src[2];
+                /* inversed, compared to Blit_3or4_to_3or4__same_rgb */
+                dst[0] = s2;
+                dst[1] = s1;
+                dst[2] = s0;
+                dst += dstbpp;
+                src += srcbpp;
+            }
+        }
     }
     return;
 }