Prefer memcpy to manual pointer arithmetic.
diff --git a/intel/filter_sse2_intrinsics.c b/intel/filter_sse2_intrinsics.c
index ef1fc7d..a5f5ecf 100644
--- a/intel/filter_sse2_intrinsics.c
+++ b/intel/filter_sse2_intrinsics.c
@@ -29,39 +29,25 @@
  */
 
 static __m128i load4(const void* p) {
-   return _mm_cvtsi32_si128(*(const int*)p);
+   int tmp;
+   memcpy(&tmp, p, sizeof(tmp));
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store4(void* p, __m128i v) {
-   *(int*)p = _mm_cvtsi128_si32(v);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, sizeof(int));
 }
 
 static __m128i load3(const void* p) {
-   /* We'll load 2 bytes, then 1 byte,
-    * then mask them together, and finally load into SSE.
-    */
-   const png_uint_16* p01 = (png_const_uint_16p)p;
-   const png_byte*    p2  = (const png_byte*)(p01+1);
-
-   png_uint_32 v012 = (png_uint_32)(*p01)
-                    | (png_uint_32)(*p2) << 16;
-   return load4(&v012);
+   png_uint_32 tmp = 0;
+   memcpy(&tmp, p, 3);
+   return _mm_cvtsi32_si128(tmp);
 }
 
 static void store3(void* p, __m128i v) {
-   /* We'll pull from SSE as a 32-bit int, then write
-    * its bottom two bytes, then its third byte.
-    */
-   png_uint_32 v012;
-   png_uint_16* p01;
-   png_byte*    p2;
-
-   store4(&v012, v);
-
-   p01 = (png_uint_16p)p;
-   p2  = (png_byte*)(p01+1);
-   *p01 = (png_uint_16)v012;
-   *p2  = (png_byte)(v012 >> 16);
+   int tmp = _mm_cvtsi128_si32(v);
+   memcpy(p, &tmp, 3);
 }
 
 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,