Remove small_memcpy.

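small_memcpy was a macro that expanded to __builtin_memcpy on clang
and GCC, and to plain memcpy on other compilers. Modern clang does not
appear to differentiate between memcpy and __builtin_memcpy, and the
performance is identical either way, so the macro is removed and its
call sites in skcms.cc and src/Transform_inl.h call memcpy directly.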

Change-Id: I320061bf0c22184e86445ebc9ea5c23f6663db61
Reviewed-on: https://skia-review.googlesource.com/c/skcms/+/772598
Reviewed-by: Robert Phillips <robertphillips@google.com>
Auto-Submit: John Stiles <johnstiles@google.com>
Commit-Queue: Robert Phillips <robertphillips@google.com>
diff --git a/skcms.cc b/skcms.cc
index 0e2770d..fcca0b0 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -64,23 +64,17 @@
 } inf_ = { 0x7f800000 };
 #define INFINITY_ inf_.f
 
-#if defined(__clang__) || defined(__GNUC__)
-    #define small_memcpy __builtin_memcpy
-#else
-    #define small_memcpy memcpy
-#endif
-
 static float log2f_(float x) {
     // The first approximation of log2(x) is its exponent 'e', minus 127.
     int32_t bits;
-    small_memcpy(&bits, &x, sizeof(bits));
+    memcpy(&bits, &x, sizeof(bits));
 
     float e = (float)bits * (1.0f / (1<<23));
 
     // If we use the mantissa too we can refine the error signficantly.
     int32_t m_bits = (bits & 0x007fffff) | 0x3f000000;
     float m;
-    small_memcpy(&m, &m_bits, sizeof(m));
+    memcpy(&m, &m_bits, sizeof(m));
 
     return (e - 124.225514990f
               -   1.498030302f*m
@@ -114,7 +108,7 @@
     }
 
     int32_t bits = (int32_t)fbits;
-    small_memcpy(&x, &bits, sizeof(x));
+    memcpy(&x, &bits, sizeof(x));
     return x;
 }
 
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 6f64b08..95ce302 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -106,12 +106,12 @@
 template <typename T, typename P>
 SI T load(const P* ptr) {
     T val;
-    small_memcpy(&val, ptr, sizeof(val));
+    memcpy(&val, ptr, sizeof(val));
     return val;
 }
 template <typename T, typename P>
 SI void store(P* ptr, const T& val) {
-    small_memcpy(ptr, &val, sizeof(val));
+    memcpy(ptr, &val, sizeof(val));
 }
 
 // (T)v is a cast when N == 1 and a bit-pun when N>1,