add RGB_fff and RGBA_ffff loads

Change-Id: I7d9a5b33f9d085bcbbc77ac2494195eeb14368c6
Reviewed-on: https://skia-review.googlesource.com/94120
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/skcms.h b/skcms.h
index 36fd7c9..de2f247 100644
--- a/skcms.h
+++ b/skcms.h
@@ -111,18 +111,18 @@
     skcms_PixelFormat_RGBA_1010102,
     skcms_PixelFormat_BGRA_1010102,
 
-    skcms_PixelFormat_RGB_161616,     // Big-endian.
+    skcms_PixelFormat_RGB_161616,     // Big-endian.  Pointers must be 16-bit aligned.
     skcms_PixelFormat_BGR_161616,
     skcms_PixelFormat_RGBA_16161616,
     skcms_PixelFormat_BGRA_16161616,
 
     skcms_PixelFormat_RGB_hhh,        // 1-5-10 half-precision float.
-    skcms_PixelFormat_BGR_hhh,
+    skcms_PixelFormat_BGR_hhh,        // Pointers must be 16-bit aligned.
     skcms_PixelFormat_RGBA_hhhh,
     skcms_PixelFormat_BGRA_hhhh,
 
     skcms_PixelFormat_RGB_fff,        // 1-8-23 single-precision float (the normal kind).
-    skcms_PixelFormat_BGR_fff,
+    skcms_PixelFormat_BGR_fff,        // Pointers must be 32-bit aligned.
     skcms_PixelFormat_RGBA_ffff,
     skcms_PixelFormat_BGRA_ffff,
 } skcms_PixelFormat;
diff --git a/src/Transform.c b/src/Transform.c
index 49bfb81..081972f 100644
--- a/src/Transform.c
+++ b/src/Transform.c
@@ -135,25 +135,24 @@
     load_565_N(i,ip,dst,src,tmp, r,g,b,a);
 }
 
+// Strided loads of N values starting at p: LOAD_3 steps 3 elements, LOAD_4 steps 4 (no trailing ';').
+#if N == 1
+    #define LOAD_3(p) ((p)[0])
+    #define LOAD_4(p) ((p)[0])
+#elif N == 4
+    #define LOAD_3(p) {(p)[0], (p)[3], (p)[6], (p)[ 9]}
+    #define LOAD_4(p) {(p)[0], (p)[4], (p)[8], (p)[12]}
+#elif N == 8
+    #define LOAD_3(p) {(p)[0], (p)[3], (p)[6], (p)[ 9],  (p)[12], (p)[15], (p)[18], (p)[21]}
+    #define LOAD_4(p) {(p)[0], (p)[4], (p)[8], (p)[12],  (p)[16], (p)[20], (p)[24], (p)[28]}
+#endif
+
 static void load_888_N(size_t i, void** ip, char* dst, const char* src, char* tmp,
                        F r, F g, F b, F a) {
     const uint8_t* rgb = (const uint8_t*)(src + 3*i);
-#if N == 1
-    U32 R = rgb[0],
-        G = rgb[1],
-        B = rgb[2];
-#elif N == 4
-    U32 R = { rgb[0], rgb[3], rgb[6], rgb[ 9] },
-        G = { rgb[1], rgb[4], rgb[7], rgb[10] },
-        B = { rgb[2], rgb[5], rgb[8], rgb[11] };
-#elif N == 8
-    U32 R = { rgb[0], rgb[3], rgb[6], rgb[ 9],  rgb[12], rgb[15], rgb[18], rgb[21] },
-        G = { rgb[1], rgb[4], rgb[7], rgb[10],  rgb[13], rgb[16], rgb[19], rgb[22] },
-        B = { rgb[2], rgb[5], rgb[8], rgb[11],  rgb[14], rgb[17], rgb[20], rgb[23] };
-#endif
-    r = F_from_U32(R) * (1/255.0f);
-    g = F_from_U32(G) * (1/255.0f);
-    b = F_from_U32(B) * (1/255.0f);
+    r = F_from_U32( (U32)LOAD_3(rgb+0) ) * (1/255.0f);
+    g = F_from_U32( (U32)LOAD_3(rgb+1) ) * (1/255.0f);
+    b = F_from_U32( (U32)LOAD_3(rgb+2) ) * (1/255.0f);
     a = F1;
     next_stage(i,ip,dst,src,tmp, r,g,b,a);
 }
@@ -223,19 +222,9 @@
     uintptr_t ptr = (uintptr_t)(src + 6*i);
     assert( (ptr & 1) == 0 );                   // The src pointer must be 2-byte aligned
     const uint16_t* rgb = (const uint16_t*)ptr; // for this cast to const uint16_t* to be safe.
-#if N == 1
-    U32 R = rgb[0],
-        G = rgb[1],
-        B = rgb[2];
-#elif N == 4
-    U32 R = { rgb[0], rgb[3], rgb[6], rgb[ 9] },
-        G = { rgb[1], rgb[4], rgb[7], rgb[10] },
-        B = { rgb[2], rgb[5], rgb[8], rgb[11] };
-#elif N == 8
-    U32 R = { rgb[0], rgb[3], rgb[6], rgb[ 9],  rgb[12], rgb[15], rgb[18], rgb[21] },
-        G = { rgb[1], rgb[4], rgb[7], rgb[10],  rgb[13], rgb[16], rgb[19], rgb[22] },
-        B = { rgb[2], rgb[5], rgb[8], rgb[11],  rgb[14], rgb[17], rgb[20], rgb[23] };
-#endif
+    U32 R = LOAD_3(rgb+0),
+        G = LOAD_3(rgb+1),
+        B = LOAD_3(rgb+2);
     // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
     r = F_from_U32( (R & 0x00ff)<<8 | (R & 0xff00)>>8 ) * (1/65535.0f);
     g = F_from_U32( (G & 0x00ff)<<8 | (G & 0xff00)>>8 ) * (1/65535.0f);
@@ -276,11 +265,42 @@
 //TODO: load_hhh_1
 //TODO: load_hhhh_N
 //TODO: load_hhhh_1
-//
-//TODO: load_fff_N
-//TODO: load_fff_1
-//TODO: load_ffff_N
-//TODO: load_ffff_1
+
+static void load_fff_N(size_t i, void** ip, char* dst, const char* src, char* tmp,
+                       F r, F g, F b, F a) {    // Load N RGB_fff pixels starting at pixel i.
+    uintptr_t ptr = (uintptr_t)(src + 12*i);    // Pixel i: 3 floats = 12 bytes per pixel.
+    assert( (ptr & 3) == 0 );                   // The src pointer must be 4-byte aligned
+    const float* rgb = (const float*)ptr;       // for this cast to const float* to be safe.
+    r = (F)LOAD_3(rgb+0);                       // Gather every 3rd float: offset 0 is red,
+    g = (F)LOAD_3(rgb+1);                       // offset 1 is green,
+    b = (F)LOAD_3(rgb+2);                       // offset 2 is blue.
+    a = F1;                                     // No alpha in the source; use F1 instead.
+    next_stage(i,ip,dst,src,tmp, r,g,b,a);
+}
+static void load_fff_1(size_t i, void** ip, char* dst, const char* src, char* tmp,
+                            F r, F g, F b, F a) {    // Load one RGB_fff pixel (pixel i) by way of tmp.
+    memcpy(tmp, src + 12*i, 12);    // Copy the single 12-byte pixel at index i into tmp,
+    src = tmp - 12*i;               // then rebase src so that src + 12*i points at that copy.
+    load_fff_N(i,ip,dst,src,tmp, r,g,b,a);    // Presumably keeps the N-wide load inside tmp -- confirm.
+}
+
+static void load_ffff_N(size_t i, void** ip, char* dst, const char* src, char* tmp,
+                        F r, F g, F b, F a) {   // Load N RGBA_ffff pixels starting at pixel i.
+    uintptr_t ptr = (uintptr_t)(src + 16*i);    // Pixel i: 4 floats = 16 bytes per pixel.
+    assert( (ptr & 3) == 0 );                   // The src pointer must be 4-byte aligned
+    const float* rgb = (const float*)ptr;       // for this cast to const float* to be safe.
+    r = (F)LOAD_4(rgb+0);                       // Gather every 4th float: offset 0 is red,
+    g = (F)LOAD_4(rgb+1);                       // offset 1 is green,
+    b = (F)LOAD_4(rgb+2);                       // offset 2 is blue,
+    a = (F)LOAD_4(rgb+3);                       // offset 3 is alpha, read from the source.
+    next_stage(i,ip,dst,src,tmp, r,g,b,a);
+}
+static void load_ffff_1(size_t i, void** ip, char* dst, const char* src, char* tmp,
+                        F r, F g, F b, F a) {    // Load one RGBA_ffff pixel (pixel i) by way of tmp.
+    memcpy(tmp, src + 16*i, 16);    // Copy the single 16-byte pixel at index i into tmp,
+    src = tmp - 16*i;               // then rebase src so that src + 16*i points at that copy.
+    load_ffff_N(i,ip,dst,src,tmp, r,g,b,a);    // Presumably keeps the N-wide load inside tmp -- confirm.
+}
 
 static void store_8888_N(size_t i, void** ip, char* dst, const char* src, char* tmp,
                          F r, F g, F b, F a) {
@@ -344,6 +364,12 @@
         case skcms_PixelFormat_RGBA_16161616 >> 1: *ip_N++ = (void*)load_16161616_N;
                                                    *ip_1++ = (void*)load_16161616_1;
                                                    break;
+        case skcms_PixelFormat_RGB_fff       >> 1: *ip_N++ = (void*)load_fff_N;
+                                                   *ip_1++ = (void*)load_fff_1;
+                                                   break;
+        case skcms_PixelFormat_RGBA_ffff     >> 1: *ip_N++ = (void*)load_ffff_N;
+                                                   *ip_1++ = (void*)load_ffff_1;
+                                                   break;
     }
     if (srcFmt & 1) {
         *ip_N++ = (void*)swap_rb;
diff --git a/tests.c b/tests.c
index fd4aad6..a0b138f 100644
--- a/tests.c
+++ b/tests.c
@@ -230,6 +230,22 @@
     expect(dst == 0xff017fff);
 }
 
+static void test_FormatConversions_float(void) {
+    skcms_ICCProfile profile;
+    // NOTE(review): profile is uninitialized; presumably skcms_Transform() ignores it -- confirm.
+    float src[] = { 1.0f, 0.5f, 1/255.0f, 1/512.0f };
+
+    uint32_t dst;
+    expect(skcms_Transform(&dst, skcms_PixelFormat_RGBA_8888, &profile,
+                           &src, skcms_PixelFormat_RGBA_ffff, &profile, 1));
+    expect(dst == 0x000180ff);
+
+    // Same as above, but we'll ignore the 1/512 alpha and fill in 1.0.
+    expect(skcms_Transform(&dst, skcms_PixelFormat_RGBA_8888, &profile,
+                           &src, skcms_PixelFormat_RGB_fff,   &profile, 1));
+    expect(dst == 0xff0180ff);
+}
+
 int main(void) {
     test_ICCProfile();
     test_Transform();
@@ -238,5 +254,6 @@
     test_FormatConversions_16161616();
     test_FormatConversions_161616();
     test_FormatConversions_101010();
+    test_FormatConversions_float();
     return 0;
 }