add paletted source support

For the moment, it looks like we only need 8-bit indices into a palette
of 8888 entries.

bench.c is updated to match: it now passes a palette, never picks a
paletted destination format, and exits non-zero if any transform fails.

Bug: skia:8473
Change-Id: I4efffea3273b5565954c0d20a666979f11ea685b
Reviewed-on: https://skia-review.googlesource.com/c/163170
Reviewed-by: Brian Osman <brianosman@google.com>
Reviewed-by: Leon Scroggins <scroggo@google.com>
Commit-Queue: Mike Klein <mtklein@google.com>
diff --git a/bench.c b/bench.c
index 64ce3c8..d67a916 100644
--- a/bench.c
+++ b/bench.c
@@ -12,6 +12,7 @@
 #endif
 
 #include "skcms.h"
+#include "skcms_internal.h"
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -70,13 +71,22 @@
                       dst_fmt = skcms_PixelFormat_RGB_565;
     const int wrap = skcms_PixelFormat_BGRA_ffff+1;
 
+    uint32_t palette[256];
+    for (int i = 0; i < 256; i++) {
+        palette[i] = (uint32_t)(255 - i%256) * 0x01010101;
+    }
+
     clock_t start = clock();
+    bool all_ok = true;
     for (int i = 0; i < n; i++) {
-        (void)skcms_Transform(src_pixels, src_fmt, skcms_AlphaFormat_Unpremul, &src_profile,
-                              dst_pixels, dst_fmt, skcms_AlphaFormat_Unpremul, &dst_profile,
-                              NPIXELS);
+        const skcms_AlphaFormat upm = skcms_AlphaFormat_Unpremul;
+        all_ok &= skcms_TransformWithPalette(src_pixels, src_fmt, upm, &src_profile,
+                                             dst_pixels, dst_fmt, upm, &dst_profile,
+                                             NPIXELS, palette);
         src_fmt = (src_fmt + 3) % wrap;
-        dst_fmt = (dst_fmt + 7) % wrap;
+        do {
+            dst_fmt = (dst_fmt + 7) % wrap;
+        } while (needs_palette(dst_fmt));
     }
 
     clock_t ticks = clock() - start;
@@ -86,5 +96,5 @@
     free(src_buf);
     free(dst_buf);
 
-    return 0;
+    return all_ok ? 0 : 1;
 }
diff --git a/skcms.cc b/skcms.cc
index fb42ecb..2b1ca69 100644
--- a/skcms.cc
+++ b/skcms.cc
@@ -1752,6 +1752,7 @@
 typedef enum {
     Op_load_a8,
     Op_load_g8,
+    Op_load_8888_palette8,
     Op_load_4444,
     Op_load_565,
     Op_load_888,
@@ -1979,21 +1980,22 @@
 
 static size_t bytes_per_pixel(skcms_PixelFormat fmt) {
     switch (fmt >> 1) {   // ignore rgb/bgr
-        case skcms_PixelFormat_A_8             >> 1: return  1;
-        case skcms_PixelFormat_G_8             >> 1: return  1;
-        case skcms_PixelFormat_ABGR_4444       >> 1: return  2;
-        case skcms_PixelFormat_RGB_565         >> 1: return  2;
-        case skcms_PixelFormat_RGB_888         >> 1: return  3;
-        case skcms_PixelFormat_RGBA_8888       >> 1: return  4;
-        case skcms_PixelFormat_RGBA_1010102    >> 1: return  4;
-        case skcms_PixelFormat_RGB_161616LE    >> 1: return  6;
-        case skcms_PixelFormat_RGBA_16161616LE >> 1: return  8;
-        case skcms_PixelFormat_RGB_161616BE    >> 1: return  6;
-        case skcms_PixelFormat_RGBA_16161616BE >> 1: return  8;
-        case skcms_PixelFormat_RGB_hhh         >> 1: return  6;
-        case skcms_PixelFormat_RGBA_hhhh       >> 1: return  8;
-        case skcms_PixelFormat_RGB_fff         >> 1: return 12;
-        case skcms_PixelFormat_RGBA_ffff       >> 1: return 16;
+        case skcms_PixelFormat_A_8                >> 1: return  1;
+        case skcms_PixelFormat_G_8                >> 1: return  1;
+        case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: return  1;
+        case skcms_PixelFormat_ABGR_4444          >> 1: return  2;
+        case skcms_PixelFormat_RGB_565            >> 1: return  2;
+        case skcms_PixelFormat_RGB_888            >> 1: return  3;
+        case skcms_PixelFormat_RGBA_8888          >> 1: return  4;
+        case skcms_PixelFormat_RGBA_1010102       >> 1: return  4;
+        case skcms_PixelFormat_RGB_161616LE       >> 1: return  6;
+        case skcms_PixelFormat_RGBA_16161616LE    >> 1: return  8;
+        case skcms_PixelFormat_RGB_161616BE       >> 1: return  6;
+        case skcms_PixelFormat_RGBA_16161616BE    >> 1: return  8;
+        case skcms_PixelFormat_RGB_hhh            >> 1: return  6;
+        case skcms_PixelFormat_RGBA_hhhh          >> 1: return  8;
+        case skcms_PixelFormat_RGB_fff            >> 1: return 12;
+        case skcms_PixelFormat_RGBA_ffff          >> 1: return 16;
     }
     assert(false);
     return 0;
@@ -2025,7 +2027,22 @@
                      skcms_PixelFormat       dstFmt,
                      skcms_AlphaFormat       dstAlpha,
                      const skcms_ICCProfile* dstProfile,
-                     size_t                  nz) {
+                     size_t                  npixels) {
+    return skcms_TransformWithPalette(src, srcFmt, srcAlpha, srcProfile,
+                                      dst, dstFmt, dstAlpha, dstProfile,
+                                      npixels, nullptr);
+}
+
+bool skcms_TransformWithPalette(const void*             src,
+                                skcms_PixelFormat       srcFmt,
+                                skcms_AlphaFormat       srcAlpha,
+                                const skcms_ICCProfile* srcProfile,
+                                void*                   dst,
+                                skcms_PixelFormat       dstFmt,
+                                skcms_AlphaFormat       dstAlpha,
+                                const skcms_ICCProfile* dstProfile,
+                                size_t                  nz,
+                                const void*             palette) {
     const size_t dst_bpp = bytes_per_pixel(dstFmt),
                  src_bpp = bytes_per_pixel(srcFmt);
     // Let's just refuse if the request is absurdly big.
@@ -2048,6 +2065,10 @@
     }
     // TODO: more careful alias rejection (like, dst == src + 1)?
 
+    if (needs_palette(srcFmt) && !palette) {
+        return false;
+    }
+
     Op          program  [32];
     const void* arguments[32];
 
@@ -2074,6 +2095,10 @@
         case skcms_PixelFormat_RGBA_hhhh       >> 1: *ops++ = Op_load_hhhh;       break;
         case skcms_PixelFormat_RGB_fff         >> 1: *ops++ = Op_load_fff;        break;
         case skcms_PixelFormat_RGBA_ffff       >> 1: *ops++ = Op_load_ffff;       break;
+
+        case skcms_PixelFormat_RGBA_8888_Palette8 >> 1: *ops++  = Op_load_8888_palette8;
+                                                        *args++ = palette;
+                                                        break;
     }
     if (srcFmt & 1) {
         *ops++ = Op_swap_rb;
diff --git a/skcms.h b/skcms.h
index cbabfc8..d924f34 100644
--- a/skcms.h
+++ b/skcms.h
@@ -175,6 +175,8 @@
     skcms_PixelFormat_A_8_,
     skcms_PixelFormat_G_8,
     skcms_PixelFormat_G_8_,
+    skcms_PixelFormat_RGBA_8888_Palette8,
+    skcms_PixelFormat_BGRA_8888_Palette8,
 
     skcms_PixelFormat_RGB_565,
     skcms_PixelFormat_BGR_565,
@@ -248,6 +250,18 @@
                                const skcms_ICCProfile* dstProfile,
                                size_t                  npixels);
 
+// As skcms_Transform(), also supporting paletted srcFmts, which require a non-NULL palette.
+SKCMS_API bool skcms_TransformWithPalette(const void*             src,
+                                          skcms_PixelFormat       srcFmt,
+                                          skcms_AlphaFormat       srcAlpha,
+                                          const skcms_ICCProfile* srcProfile,
+                                          void*                   dst,
+                                          skcms_PixelFormat       dstFmt,
+                                          skcms_AlphaFormat       dstAlpha,
+                                          const skcms_ICCProfile* dstProfile,
+                                          size_t                  npixels,
+                                          const void*             palette);
+
 // If profile can be used as a destination in skcms_Transform, return true. Otherwise, attempt to
 // rewrite it with approximations where reasonable. If successful, return true. If no reasonable
 // approximation exists, leave the profile unchanged and return false.
diff --git a/skcms_internal.h b/skcms_internal.h
index 4b0c395..551128a 100644
--- a/skcms_internal.h
+++ b/skcms_internal.h
@@ -37,6 +37,11 @@
     static inline float fabsf_(float x) { return x < 0 ? -x : x; }
     float powf_(float, float);
 
+// ~~~~ Does this pixel format need a palette pointer to be usable? ~~~~
+    static inline bool needs_palette(skcms_PixelFormat fmt) {
+        return (fmt >> 1) == (skcms_PixelFormat_RGBA_8888_Palette8 >> 1);
+    }
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index b3217c8..15262e2 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -348,52 +348,69 @@
     return v;
 }
 
-// Helper for gather_16(), loading the ix'th 16-bit value from p.
-SI uint16_t load_16(const uint8_t* p, int ix) {
-    return load<uint16_t>(p + 2*ix);
-}
-
 SI U16 gather_16(const uint8_t* p, I32 ix) {
+    // Load the i'th 16-bit value from p.
+    auto load_16 = [p](int i) {
+        return load<uint16_t>(p + 2*i);
+    };
 #if N == 1
-    U16 v = load_16(p,ix);
+    U16 v = load_16(ix);
 #elif N == 4
-    U16 v = { load_16(p,ix[0]), load_16(p,ix[1]), load_16(p,ix[2]), load_16(p,ix[3]) };
+    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
 #elif N == 8
-    U16 v = { load_16(p,ix[0]), load_16(p,ix[1]), load_16(p,ix[2]), load_16(p,ix[3]),
-              load_16(p,ix[4]), load_16(p,ix[5]), load_16(p,ix[6]), load_16(p,ix[7]) };
+    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
+              load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
 #elif N == 16
-    U16 v = { load_16(p,ix[ 0]), load_16(p,ix[ 1]), load_16(p,ix[ 2]), load_16(p,ix[ 3]),
-              load_16(p,ix[ 4]), load_16(p,ix[ 5]), load_16(p,ix[ 6]), load_16(p,ix[ 7]),
-              load_16(p,ix[ 8]), load_16(p,ix[ 9]), load_16(p,ix[10]), load_16(p,ix[11]),
-              load_16(p,ix[12]), load_16(p,ix[13]), load_16(p,ix[14]), load_16(p,ix[15]) };
+    U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
+              load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
+              load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
+              load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
 #endif
     return v;
 }
 
-#if !defined(USING_AVX2)
-    // Helpers for gather_24/48(), loading the ix'th 24/48-bit value from p, and 1/2 extra bytes.
-    SI uint32_t load_24_32(const uint8_t* p, int ix) {
-        return load<uint32_t>(p + 3*ix);
-    }
-    SI uint64_t load_48_64(const uint8_t* p, int ix) {
-        return load<uint64_t>(p + 6*ix);
-    }
+SI U32 gather_32(const uint8_t* p, I32 ix) {
+    // Load the i'th 32-bit value from p.
+    auto load_32 = [p](int i) {
+        return load<uint32_t>(p + 4*i);
+    };
+#if N == 1
+    U32 v = load_32(ix);
+#elif N == 4
+    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
+#elif N == 8
+    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
+              load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
+#elif N == 16
+    U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
+              load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
+              load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
+              load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
 #endif
+    // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
+    return v;
+}
 
 SI U32 gather_24(const uint8_t* p, I32 ix) {
     // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
     // in front of it, either a previous table value, or some tag metadata.
     p -= 1;
 
+    // Load the i'th 24-bit value from p, and 1 extra byte.
+    auto load_24_32 = [p](int i) {
+        return load<uint32_t>(p + 3*i);
+    };
+
     // Now load multiples of 4 bytes (a junk byte, then r,g,b).
 #if N == 1
-    U32 v = load_24_32(p,ix);
+    U32 v = load_24_32(ix);
 #elif N == 4
-    U32 v = { load_24_32(p,ix[0]), load_24_32(p,ix[1]), load_24_32(p,ix[2]), load_24_32(p,ix[3]) };
+    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
 #elif N == 8 && !defined(USING_AVX2)
-    U32 v = { load_24_32(p,ix[0]), load_24_32(p,ix[1]), load_24_32(p,ix[2]), load_24_32(p,ix[3]),
-              load_24_32(p,ix[4]), load_24_32(p,ix[5]), load_24_32(p,ix[6]), load_24_32(p,ix[7]) };
+    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
+              load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
 #elif N == 8
+    (void)load_24_32;
     // The gather instruction here doesn't need any particular alignment,
     // but the intrinsic takes a const int*.
     const int* p4 = bit_pun<const int*>(p);
@@ -405,6 +422,7 @@
         U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
     #endif
 #elif N == 16
+    (void)load_24_32;
     // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
     // And AVX-512 swapped the order of arguments.  :/
     const int* p4 = bit_pun<const int*>(p);
@@ -420,18 +438,24 @@
         // As in gather_24(), with everything doubled.
         p -= 2;
 
+        // Load the i'th 48-bit value from p, and 2 extra bytes.
+        auto load_48_64 = [p](int i) {
+            return load<uint64_t>(p + 6*i);
+        };
+
     #if N == 1
-        *v = load_48_64(p,ix);
+        *v = load_48_64(ix);
     #elif N == 4
         *v = U64{
-            load_48_64(p,ix[0]), load_48_64(p,ix[1]), load_48_64(p,ix[2]), load_48_64(p,ix[3]),
+            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
         };
     #elif N == 8 && !defined(USING_AVX2)
         *v = U64{
-            load_48_64(p,ix[0]), load_48_64(p,ix[1]), load_48_64(p,ix[2]), load_48_64(p,ix[3]),
-            load_48_64(p,ix[4]), load_48_64(p,ix[5]), load_48_64(p,ix[6]), load_48_64(p,ix[7]),
+            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
+            load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
         };
     #elif N == 8
+        (void)load_48_64;
         typedef int32_t   __attribute__((vector_size(16))) Half_I32;
         typedef long long __attribute__((vector_size(32))) Half_I64;
 
@@ -456,6 +480,7 @@
         store((char*)v +  0, lo);
         store((char*)v + 32, hi);
     #elif N == 16
+        (void)load_48_64;
         const long long int* p8 = bit_pun<const long long int*>(p);
         __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
                 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
@@ -676,6 +701,17 @@
                 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
             } break;
 
+            case Op_load_8888_palette8:{
+                const uint8_t* palette = (const uint8_t*) *args++;
+                I32 ix = cast<I32>(load<U8>(src + 1*i));
+                U32 rgba = gather_32(palette, ix);
+
+                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
+                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
+                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
+                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
+            } break;
+
             case Op_load_1010102:{
                 U32 rgba = load<U32>(src + 4*i);
 
diff --git a/tests.c b/tests.c
index ccd737f..23a6792 100644
--- a/tests.c
+++ b/tests.c
@@ -1231,6 +1231,40 @@
                             &buf, skcms_PixelFormat_BGR_161616BE, upm, xyz, 1) );
 }
 
+static void test_Palette8() {
+    uint32_t palette[256];
+    for (int i = 0; i < 256; i++) {
+        palette[i] = (uint32_t)(255 - i) * 0x01010101;
+    }
+
+    uint8_t  src[512];
+    uint32_t dst[512];
+    for (int i = 0; i < 512; i++) {
+        src[i] = (uint8_t)(i % 256);
+    }
+
+    const skcms_ICCProfile* srgb = skcms_sRGB_profile();
+    const skcms_AlphaFormat upm = skcms_AlphaFormat_Unpremul;
+
+    expect( skcms_TransformWithPalette(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+                                       dst, skcms_PixelFormat_RGBA_8888         , upm, srgb,
+                                       512, palette) );
+
+    for (int i = 0; i < 512; i++) {
+        uint32_t expected = (uint32_t)(255 - i%256) * 0x01010101;
+        expect( dst[i] == expected );
+    }
+
+
+    // Double check we can't transform skcms_PixelFormat_RGBA_8888_Palette8 without a palette.
+    expect( !skcms_Transform(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+                             dst, skcms_PixelFormat_RGBA_8888         , upm, srgb,
+                             512) );
+    expect( !skcms_TransformWithPalette(src, skcms_PixelFormat_RGBA_8888_Palette8, upm, srgb,
+                                        dst, skcms_PixelFormat_RGBA_8888         , upm, srgb,
+                                        512, NULL) );
+}
+
 int main(int argc, char** argv) {
     bool regenTestData = false;
     for (int i = 1; i < argc; ++i) {
@@ -1265,6 +1299,7 @@
     test_ExactlyEqual();
     test_Clamp();
     test_AliasedTransforms();
+    test_Palette8();
 #if 0
     test_CLUT();
 #endif